mirror of
https://github.com/We-Dont-Byte/Mind_Reader.git
synced 2025-02-04 10:38:42 +00:00
7
src/pylex/index.ts
Normal file
7
src/pylex/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
// Public entry points of the pylex package; the parser comes first.
export { default as Parser } from './parser';
export { default as LineToken } from './token';
export { default as Lexer } from './lexer';
export { default as LexNode } from './node';
export { TabInfo } from './token';
|
||||
|
||||
214
src/pylex/lexer.ts
Normal file
214
src/pylex/lexer.ts
Normal file
@@ -0,0 +1,214 @@
|
||||
import { LineToken } from '.';
|
||||
import { Symbol, EOFTOKEN, TabInfo } from './token';
|
||||
|
||||
/**
 * One recognition rule: a line that matches `pattern` yields a token
 * of kind `type`.
 */
type Rule = {
  /** Regular expression tried against a single line of text. */
  pattern: RegExp;
  /** Token type produced when `pattern` matches. */
  type: Symbol;
};
|
||||
|
||||
/**
 * Recognition rules, listed in priority order (earlier entries win).
 * Every entry pairs the token type with the pattern that recognizes it.
 * Where a pattern defines the named group `attr`, the captured text is
 * attached to the resulting token.
 */
const rules: Rule[] = [
  { type: Symbol.FUNCTION, pattern: /^\s*def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\(/ },
  { type: Symbol.CLASS, pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/ },
  { type: Symbol.IF, pattern: /^\s*if\s+(?<attr>[^:]+):\s*/ },
  { type: Symbol.ELIF, pattern: /^\s*elif\s+(?<attr>[^:]+):\s*$/ },
  { type: Symbol.ELSE, pattern: /^\s*else\s*:/ },
  { type: Symbol.FOR, pattern: /^\s*for\s+(?<attr>[^:]+):\s*$/ },
  { type: Symbol.WHILE, pattern: /^\s*while\s+(?<attr>[^:]+):\s*$/ },
  { type: Symbol.TRY, pattern: /^\s*try\s*:/ },
  { type: Symbol.EXCEPT, pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*$/ },
  { type: Symbol.FINALLY, pattern: /^\s*finally\s*:\s*$/ },
  { type: Symbol.WITH, pattern: /^\s*with\s+(?<attr>[^:]+):\s*$/ },
];
|
||||
|
||||
/**
 * Line-By-Line Lexer
 *
 * Each call to `next()` scans forward to the next line that is not empty,
 * whitespace-only or comment-only and turns it into a `LineToken`: the type
 * of the first entry in `rules` whose pattern matches the line, or
 * `Symbol.INDENT` when no rule matches. When the lines run out, the current
 * token becomes `EOFTOKEN`.
 */
export default class Lexer {
  /** Matches lines that are empty, whitespace-only, or comment-only. */
  private static readonly blankLine: RegExp = /^\s*(#.*)?$/;

  private textLines: string[] = []; // array of text lines
  private pos: number = 0; // index of the next line to examine
  private _currToken: LineToken = EOFTOKEN;

  /**
   * Calculates indentation level for a line. If using soft tabs,
   * indent level rounds up (so, tabSize+1 spaces is 2 levels,
   * 2*tabSize+1 is 3, etc.)
   *
   * @param `text` The line of text.
   * @param `tabFmt` A tab information descriptor.
   * @return The indent of `text` with consideration for `tabFmt`.
   */
  static getIndent(text: string, tabFmt: TabInfo): number {
    // Index of the first non-whitespace character (or the end of the line)
    // equals the number of leading whitespace characters.
    const leadingSpace = text.search(/\S|$/);

    if (tabFmt.hard) {
      // hard tabs: every leading whitespace character is one level
      return leadingSpace;
    }

    // soft tabs: `size` spaces per level, partial levels round up
    return Math.ceil(leadingSpace / tabFmt.size);
  }

  /**
   * @param `text` The text to lex.
   * @param `tabFmt` A tab information descriptor
   */
  constructor(text?: string, private tabFmt?: TabInfo) {
    // default is 4 wide expanded tabs; caller-supplied fields win
    this.tabFmt = { size: 4, hard: false, ...tabFmt };
    this.restart(text);
  }

  /**
   * Restart lexer with new text.
   *
   * Windows line endings (`\r\n`) are normalized to `\n` before the text is
   * split into lines. Calling this without text (or with an empty string)
   * discards any previously loaded lines and leaves the lexer on `EOFTOKEN`.
   *
   * @param `text` The new text to lex.
   */
  restart(text?: string): void {
    this.pos = 0;
    this._currToken = EOFTOKEN; // if no input, already on EOFTOKEN

    if (!text) {
      // Drop lines left over from a previous run so next() cannot replay them.
      this.textLines = [];
      return;
    }

    // A string pattern would only replace the first match; the global
    // regex normalizes every linefeed.
    this.textLines = text.replace(/\r\n/g, '\n').split('\n');
    this.next(); // advance to the first token
  }

  /**
   * @return the current {@link LineToken}.
   */
  currToken(): LineToken { return this._currToken; }

  /**
   * Advance the position in the token stream.
   *
   * @return The new current token, after advancing
   * @throws Error if the lexer has already advanced onto `EOFTOKEN`.
   */
  next(): LineToken {
    if (this._currToken === EOFTOKEN && this.pos > this.textLines.length) {
      throw new Error('Cannot advance past end');
    }

    // Until a LineToken is found, or EOF
    while (this.pos < this.textLines.length) {
      const line = this.textLines[this.pos];
      const indent = Lexer.getIndent(line, this.tabFmt!);

      for (const rule of rules) {
        const match = line.match(rule.pattern);
        if (match) {
          // Patterns without a named group leave `groups` undefined.
          const attr = match.groups ? match.groups["attr"] : undefined;
          this._currToken = new LineToken(rule.type, this.pos, indent, attr);
          this.pos++;
          return this._currToken;
        }
      }

      // No rules matched: skip the line if it is whitespace, comment, or empty
      if (Lexer.blankLine.test(line)) {
        this.pos++;
        continue;
      }

      // Anything else is plain code: an INDENT token
      this._currToken = new LineToken(Symbol.INDENT, this.pos, indent);
      this.pos++;
      return this._currToken;
    }

    // Didn't return, must be EOF. `pos` moves one past the end so that a
    // further call to next() is detected by the guard above.
    this._currToken = EOFTOKEN;
    this.pos++;
    return this._currToken;
  }

  /**
   * Move backwards in the token stream
   *
   * The lexer position is only changed when the whole retraction succeeds.
   *
   * @param `n` The number of positions to retract.
   * @return The new current token after retracting.
   * @throws RangeError if `n` is not positive, or if fewer than `n` tokens
   *         precede the current one.
   */
  retract(n: number = 1): LineToken {
    if (n <= 0) {
      throw new RangeError('Retract distance must be positive');
    }

    if (this.pos - 1 - n < 0) {
      // -1 because this.pos is currently on the next token
      throw new RangeError('Cannot retract past start');
    }

    // Walk back over n + 1 tokens (the current one plus n earlier ones) on a
    // scratch index; blank and comment lines never produce tokens, so they
    // are stepped over without being counted.
    let target = this.pos;
    for (let remaining = n + 1; remaining > 0; remaining--) {
      target--;
      // An index at or past the end of the text is the EOF slot, not a
      // blank line, so it counts as a token position.
      while (
        target >= 0 &&
        target < this.textLines.length &&
        Lexer.blankLine.test(this.textLines[target])
      ) {
        target--;
      }
      if (target < 0) {
        // Only skippable lines remain before this point.
        throw new RangeError('Cannot retract past start');
      }
    }

    // Re-lex the line we landed on; next() leaves pos just after it.
    this.pos = target;
    return this.next();
  }
}
|
||||
82
src/pylex/node.ts
Normal file
82
src/pylex/node.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
import * as vscode from 'vscode';
|
||||
|
||||
import LineToken from './token';
|
||||
|
||||
/**
 * A node in a Parse tree.
 *
 * Extends `vscode.TreeItem`; in addition to the label and collapsible state
 * it carries the `LineToken` it was built from and links to its parent and
 * children.
 */
export default class LexNode extends vscode.TreeItem {

  /**
   * @param `label` A human-readable string describing this item
   * @param `collapsibleState` {@link TreeItemCollapsibleState} of the tree item.
   * @param `token` The token at this node.
   * @param `_children` The children in this node's subtree.
   * @param `_parent` The parent node of this node.
   */
  constructor(
    public readonly label: string,
    public readonly collapsibleState: vscode.TreeItemCollapsibleState,
    public readonly token: LineToken | null,
    private _children: LexNode[] | null = null,
    private _parent: LexNode | null = null,
  ) {
    super(label, collapsibleState);

    // Tooltip is the label, followed by the 1-based line number when the
    // node has a token with a non-negative line number.
    this.tooltip = this.label;
    if (this.token && this.token.linenr >= 0) {
      this.tooltip += `: ${this.token.linenr+1}`;
    }
  }

  /**
   * @return The children of this node.
   */
  children(): LexNode[] | null {
    return this._children;
  }

  /**
   * @return The parent of this node.
   */
  parent(): LexNode | null {
    return this._parent;
  }

  /**
   * Adopt child nodes.
   *
   * The passed nodes are not stored directly: each one is copied into a new
   * node whose parent is this node, and the copies are appended to any
   * children this node already has.
   *
   * @param `children` Array of nodes to adopt.
   */
  adopt(children: LexNode[]): void {
    const parentedChildren = children.map(c => new LexNode(
      c.label,
      c.collapsibleState,
      c.token,
      c.children(),
      this
    ));

    // Always store the re-parented copies, including when appending to
    // existing children; otherwise later batches would keep a stale parent.
    this._children = this._children
      ? this._children.concat(parentedChildren)
      : parentedChildren;
  }

  /**
   * Return the root path for this node.
   *
   * Every entry is a fresh copy of the corresponding node.
   *
   * @return A path of parent nodes from this node to the root of the tree.
   */
  rootPath(): LexNode[] {
    const self = new LexNode(
      this.label,
      this.collapsibleState,
      this.token,
      this._children,
      this._parent
    );

    return this._parent ? [self].concat(this._parent.rootPath()) : [self];
  }
}
|
||||
133
src/pylex/parser.ts
Normal file
133
src/pylex/parser.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
import * as vscode from 'vscode';
|
||||
|
||||
import { EOFTOKEN, Symbol, TabInfo } from './token';
|
||||
import Lexer from './lexer';
|
||||
import LexNode from './node';
|
||||
|
||||
/**
 * A parse tree generator
 *
 * Drives a `Lexer` over the text and nests the block tokens it produces
 * (everything except `Symbol.INDENT`) into a tree of `LexNode`s according to
 * their indent level.
 */
export default class Parser {
  private lexer: Lexer;
  private currIndent: number;
  private root: LexNode; // Root of syntax tree

  /**
   * @param `text` Text to parse.
   * @param `tabFmt` A tab information descriptor
   */
  constructor (private text?: string, private tabFmt?: TabInfo) {}

  /**
   * Parse the passed text.
   *
   * @param `text` Text to parse. If undefined, use current value of `this.text`
   * @param `tabFmt` A tab information descriptor. If undefined, use current
   *                 value of `this.tabFmt`
   * @return A parse tree representing `text`.
   */
  parse(text?: string, tabFmt?: TabInfo): LexNode {
    // Remember explicitly passed arguments for later calls. The check is
    // against undefined so that an explicit empty string is honoured
    // instead of silently re-parsing the previous text.
    if (text !== undefined) {
      this.text = text;
    }
    if (tabFmt !== undefined) {
      this.tabFmt = tabFmt;
    }

    // initialize root (either stored value might still be undefined)
    this.lexer = new Lexer(this.text, this.tabFmt);
    this.root = new LexNode(
      "root",
      vscode.TreeItemCollapsibleState.None,
      null,
      null,
      null
    );

    // parse children
    this.currIndent = 0;
    const children = this._parse(this.root);

    if (children.length > 0) {
      this.root.adopt(children);
    }
    return this.root;
  }

  /**
   * Collect the blocks found at the current indent level, recursing into
   * each block's body.
   *
   * @param `parent` The node the collected blocks are created under.
   * @return The block nodes found before the indent level dropped or the
   *         token stream ended.
   */
  private _parse(parent: LexNode | null): LexNode[] {
    const children: LexNode[] = [];

    while (this.lexer.currToken() !== EOFTOKEN) {
      const token = this.lexer.currToken();

      if (token.indentLevel < this.currIndent) {
        // go up 1 level of recursion at a time to unravel properly
        this.currIndent--;
        return children;
      }

      if (token.type === Symbol.INDENT) {
        // regular code, advance and stay in same block
        this.lexer.next();
        continue;
      }

      // new block starts here
      const label = token.type + (token.attr === undefined ? "" : " " + token.attr);
      const blockRoot = new LexNode(
        label,
        vscode.TreeItemCollapsibleState.None,
        token,
        null,
        parent
      );

      this.lexer.next();
      this.currIndent++;
      const blockChildren = this._parse(blockRoot); // Recursively parse all child blocks
      if (blockChildren.length > 0) {
        blockRoot.adopt(blockChildren);
      }
      children.push(blockRoot);
    }

    return children;
  }

  /**
   * Get an array of LexNodes representing the rootpath of LexNodes from the
   * passed line number to the root of the document. A list of "this" inside
   * "that" inside ... inside the document root.
   *
   * A line that precedes every block at the top level yields the path of the
   * root node alone. An empty array is returned when nothing has been parsed
   * yet or the tree has no blocks.
   *
   * NOTE(review): the enclosing block is chosen by line order only, so a
   * dedented statement that follows a block is still attributed to that
   * block - confirm this is intended.
   *
   * @param `lineNumber` The line number to query context for.
   * @return An array of LexNodes for the root path containing `lineNumber`
   */
  context(lineNumber: number): LexNode[] {
    if (!this.root || !this.root.children()) {
      return [];
    }

    // Returns the deepest LexNode whose block starts at or before the
    // queried line number.
    const find = (node: LexNode): LexNode => {
      let enclosing: LexNode | undefined;
      for (const child of node.children()!) {
        if (lineNumber < child.token!.linenr) {
          break; // this child and all later ones start after the line
        }
        enclosing = child;
      }

      if (!enclosing) {
        // The line comes before every child block: it belongs to `node`.
        return node;
      }

      // Also reached when the line lies after the last child's start,
      // i.e. inside (or after) the final block at this level.
      return enclosing.children() ? find(enclosing) : enclosing;
    };

    return find(this.root).rootPath();
  }
}
|
||||
66
src/pylex/token.ts
Normal file
66
src/pylex/token.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
/* eslint-disable @typescript-eslint/naming-convention */
|
||||
/* ^ allow uppercase enum */
|
||||
|
||||
/**
 * The kinds of symbol a LineToken can carry.
 */
export enum Symbol {
  // definitions
  FUNCTION = "function",
  CLASS = "class",

  // conditionals
  IF = "if",
  ELSE = "else",
  ELIF = "elif",

  // loops
  FOR = "for",
  WHILE = "while",

  // exception handling
  TRY = "try",
  EXCEPT = "except",
  FINALLY = "finally",

  // context managers
  WITH = "with",

  // Fallback for any line that is not EOF; carries indent information only
  INDENT = "INDENT",

  // end of input
  EOF = "EOF"
}
|
||||
|
||||
/**
 * Tab information descriptor.
 */
export type TabInfo = {
  /** The width of a tab in spaces */
  size: number;
  /** Whether to use literal tab characters */
  hard: boolean;
};
|
||||
|
||||
/**
 * Token describing one line of a Python file.
 */
export default class LineToken {

  /**
   * @param `type` The type of token for this line.
   * @param `linenr` The line number (0-indexed)
   * @param `indentLevel` The level of indentation.
   * @param `attr` Additional item for tokens that might need it.
   */
  constructor(
    public readonly type: Symbol,
    public readonly linenr: number,
    public readonly indentLevel: number,
    public readonly attr?: any // extra payload a token might need (class name, control condition)
  ) { }

  /**
   * @return A string representation of the token; the line number is
   *         printed 1-based.
   */
  toString(): string {
    const displayLine = this.linenr + 1;

    let text = this.type + ", linenr:" + displayLine;
    text += ", indentLevel: " + this.indentLevel;
    text += ", attr: " + this.attr;
    return text;
  }
}
|
||||
|
||||
/**
 * The End-Of-File token
 *
 * `next()` returns EOFTOKEN when it is called while the lexer is on the
 * last token in the stream. Its line number and indent level are both -1.
 */
export const EOFTOKEN = new LineToken(Symbol.EOF, -1, -1);
|
||||
Reference in New Issue
Block a user