Integrate Parser (#4)

Integrate parser
This commit is contained in:
Jake Grossman
2021-10-26 12:48:04 -05:00
committed by GitHub
parent 9db476dd39
commit d105544596
15 changed files with 1234 additions and 16 deletions

7
src/pylex/index.ts Normal file
View File

@@ -0,0 +1,7 @@
// expose parser by default
export {default as Parser} from './parser';
export {default as LineToken} from './token';
export {default as Lexer} from './lexer';
export {default as LexNode} from './node';
export {TabInfo as TabInfo} from './token';

214
src/pylex/lexer.ts Normal file
View File

@@ -0,0 +1,214 @@
import { LineToken } from '.';
import { Symbol, EOFTOKEN, TabInfo } from './token';
/**
 * A single lexer recognition rule: a line pattern paired with the
 * token type emitted for lines that match it.
 */
type Rule = {
// Regular expression tested against one line of source text
pattern: RegExp,
// Symbol given to the LineToken created when `pattern` matches
type: Symbol,
};
/**
 * List of recognition patterns, in order of priority.
 *
 * Each rule pairs a `pattern` (tested against a single source line) with
 * the token `type` emitted when it matches. Where a pattern has a named
 * capture group `attr`, the captured text (function/class name, loop or
 * condition expression, ...) is attached to the token.
 *
 * Block headers may be followed by a trailing comment, e.g.
 * `for x in xs:  # note`. The `async` forms of `def`, `for` and `with`
 * are recognized as their non-async counterparts.
 *
 * Limitation: expressions are captured with `[^:]+`, so a header whose
 * expression itself contains a colon (slice, dict literal, lambda) is
 * either not recognized or has its `attr` cut at the first colon.
 */
const rules: Rule[] = [
  {
    // `def name(` / `async def name(`; whitespace before `(` is allowed
    pattern: /^\s*(?:async\s+)?def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\s*\(/,
    type: Symbol.FUNCTION
  },
  {
    pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/,
    type: Symbol.CLASS
  },
  {
    // deliberately not anchored at the end: `if x: return` is still an if
    pattern: /^\s*if\s+(?<attr>[^:]+):\s*/,
    type: Symbol.IF
  },
  {
    pattern: /^\s*elif\s+(?<attr>[^:]+):\s*(?:#.*)?\s*$/,
    type: Symbol.ELIF
  },
  {
    pattern: /^\s*else\s*:/,
    type: Symbol.ELSE
  },
  {
    pattern: /^\s*(?:async\s+)?for\s+(?<attr>[^:]+):\s*(?:#.*)?\s*$/,
    type: Symbol.FOR
  },
  {
    pattern: /^\s*while\s+(?<attr>[^:]+):\s*(?:#.*)?\s*$/,
    type: Symbol.WHILE
  },
  {
    pattern: /^\s*try\s*:/,
    type: Symbol.TRY
  },
  {
    // the exception expression is optional (bare `except:`)
    pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*(?:#.*)?\s*$/,
    type: Symbol.EXCEPT
  },
  {
    pattern: /^\s*finally\s*:\s*(?:#.*)?\s*$/,
    type: Symbol.FINALLY
  },
  {
    pattern: /^\s*(?:async\s+)?with\s+(?<attr>[^:]+):\s*(?:#.*)?\s*$/,
    type: Symbol.WITH
  },
];
/**
 * Line-By-Line Lexer
 *
 * Splits the input into lines and turns every line that is not empty,
 * whitespace-only or comment-only into one {@link LineToken}. Lines that
 * match one of `rules` get that rule's type; all other lines become
 * `Symbol.INDENT` tokens. {@link EOFTOKEN} marks the end of the stream.
 */
export default class Lexer {
  /** Lines that produce no token: empty, whitespace-only or comment-only. */
  private static readonly skipPattern: RegExp = /^\s*(#.*)?$/;

  private textLines: string[] = []; // array of text lines
  private pos: number = 0; // index of the next line to lex (one past the current token's line)
  private _currToken: LineToken = EOFTOKEN;

  /**
   * Calculates indentation level for a line. If using soft tabs,
   * indent level rounds up (so, tabSize+1 spaces is 2 levels,
   * 2*tabSize+1 is 3, etc.)
   *
   * With hard tabs every leading whitespace character counts as one level.
   *
   * @param `text` The line of text.
   * @param `tabFmt` A tab information descriptor.
   * @return The indent of `text` with consideration for `tabFmt`.
   */
  static getIndent(text: string, tabFmt: TabInfo): number {
    const leadingSpace = text.length - text.trimStart().length;
    if (tabFmt.hard) {
      // used tabs
      return leadingSpace;
    }
    // use spaces
    return Math.ceil(leadingSpace / tabFmt.size);
  }

  /**
   * @param `text` The text to lex.
   * @param `tabFmt` A tab information descriptor
   */
  constructor(text?: string, private tabFmt?: TabInfo) {
    // default is 4 wide expanded tabs
    this.tabFmt = {
      ...{size: 4, hard: false},
      ...tabFmt
    };
    this.restart(text);
  }

  /**
   * Restart lexer with new text.
   *
   * Any previously loaded text is discarded, even when `text` is empty
   * or omitted. Both `\n` and `\r\n` line endings are accepted.
   *
   * @param `text` The new text to lex.
   */
  restart(text?: string): void {
    this.pos = 0;
    this._currToken = EOFTOKEN; // if no input, already on EOFTOKEN
    this.textLines = []; // never keep lines from an earlier input
    if (text) {
      // splitting on /\r?\n/ normalizes every CRLF, not just the first
      this.textLines = text.split(/\r?\n/);
      this.next(); // advance to the first token
    }
  }

  /**
   * @return the current {@link LineToken}.
   */
  currToken(): LineToken { return this._currToken; }

  /**
   * Advance the position in the token stream.
   *
   * @return The new current token, after advancing
   * @throws Error if the lexer has already advanced onto EOFTOKEN
   */
  next(): LineToken {
    if (this._currToken === EOFTOKEN && this.pos > this.textLines.length) {
      throw new Error('Cannot advance past end');
    }

    // Until a LineToken is found, or EOF
    while (this.pos < this.textLines.length) {
      const linenr = this.pos;
      const line = this.textLines[linenr];
      this.pos++;

      // Skip this line if it is whitespace, comment, or empty.
      // (No rule can match such a line, so checking first is equivalent.)
      if (Lexer.skipPattern.test(line)) {
        continue;
      }

      const indent = Lexer.getIndent(line, this.tabFmt!);

      // Default: an INDENT token that only carries indent information
      let type: Symbol = Symbol.INDENT;
      let attr: string | undefined;
      for (const r of rules) {
        const match = line.match(r.pattern);
        if (match) {
          // first matching rule wins
          type = r.type;
          attr = match.groups ? match.groups["attr"] : undefined;
          break;
        }
      }

      this._currToken = new LineToken(type, linenr, indent, attr);
      return this._currToken;
    }

    // Didn't return, must be EOF
    this._currToken = EOFTOKEN;
    this.pos++; // pos > textLines.length now marks "sitting on EOF"
    return this._currToken;
  }

  /**
   * Move backwards in the token stream
   *
   * @param `n` The number of positions to retract.
   * @return The new current token after retracting.
   * @throws RangeError if `n` is not positive, or if there are fewer
   *         than `n` tokens before the current one. The lexer position is
   *         left untouched when this is thrown.
   */
  retract(n: number = 1): LineToken {
    if (n <= 0) {
      throw new RangeError('Retract distance must be positive');
    }

    // Walk back over the current token and then over n earlier tokens.
    // `target` ends on the line of the token to return; it is only
    // committed to this.pos once the whole walk has succeeded.
    let target = this.pos;
    for (let remaining = n + 1; remaining > 0; remaining--) {
      target--;
      // Skip lines that never produced a token. An index equal to
      // textLines.length stands for EOF and is not skipped.
      while (
        target >= 0 &&
        target < this.textLines.length &&
        Lexer.skipPattern.test(this.textLines[target])
      ) {
        target--;
      }
      if (target < 0) {
        throw new RangeError('Cannot retract past start');
      }
    }

    this.pos = target;
    return this.next();
  }
}

82
src/pylex/node.ts Normal file
View File

@@ -0,0 +1,82 @@
import * as vscode from 'vscode';
import LineToken from './token';
/**
 * A node in a Parse tree.
 *
 * Every node is also a `vscode.TreeItem`; its tooltip is the label,
 * followed by the 1-indexed line number when the node has a token with a
 * non-negative line number.
 */
export default class LexNode extends vscode.TreeItem {
  /**
   * @param `label` A human-readable string describing this item
   * @param `collapsibleState` {@link TreeItemCollapsibleState} of the tree item.
   * @param `token` The token at this node.
   * @param `_children` The children in this node's subtree.
   * @param `_parent` The parent node of this node.
   */
  constructor(
    public readonly label: string,
    public readonly collapsibleState: vscode.TreeItemCollapsibleState,
    public readonly token: LineToken | null,
    private _children: LexNode[] | null = null,
    private _parent: LexNode | null = null,
  ) {
    super(label, collapsibleState);
    // token.linenr is 0-indexed; the tooltip shows it 1-indexed
    this.tooltip = this.token && this.token.linenr >= 0
      ? `${this.label}: ${this.token.linenr + 1}`
      : this.label;
  }

  /**
   * @return The children of this node.
   */
  children(): LexNode[] | null {
    return this._children;
  }

  /**
   * @return The parent of this node.
   */
  parent(): LexNode | null {
    return this._parent;
  }

  /**
   * Adopt child nodes.
   *
   * The passed nodes are not modified. Each one is copied into a new node
   * whose parent is this node, and the copies are appended to any existing
   * children. The copy is shallow: it reuses the original's children
   * array, so grandchildren keep whatever parent they already had.
   *
   * @param `children` Array of nodes to adopt.
   */
  adopt(children: LexNode[]): void {
    const parentedChildren = children.map(c => new LexNode(
      c.label,
      c.collapsibleState,
      c.token,
      c.children(),
      this
    ));

    // Append the re-parented copies (not the originals) so every child
    // of this node reports this node as its parent.
    this._children = this._children
      ? this._children.concat(parentedChildren)
      : parentedChildren;
  }

  /**
   * Return the root path for this node.
   *
   * @return A path of parent nodes from this node to the root of the tree.
   *         The entries are copies of the nodes on the path, starting
   *         with this node.
   */
  rootPath(): LexNode[] {
    const self = new LexNode(
      this.label,
      this.collapsibleState,
      this.token,
      this._children,
      this._parent
    );
    return this._parent ? [self].concat(this._parent.rootPath()) : [self];
  }
}

133
src/pylex/parser.ts Normal file
View File

@@ -0,0 +1,133 @@
import * as vscode from 'vscode';
import { EOFTOKEN, Symbol, TabInfo } from './token';
import Lexer from './lexer';
import LexNode from './node';
/**
 * A parse tree generator
 *
 * Builds a tree of {@link LexNode}s from the block-opening lines
 * (function, class, if, for, ...) reported by the {@link Lexer}, nesting
 * them by indentation level.
 */
export default class Parser {
  private lexer: Lexer;
  private currIndent: number; // indent level of the block being parsed; equals the recursion depth of _parse
  private root: LexNode; // Root of syntax tree

  /**
   * @param `text` Text to parse.
   * @param `tabFmt` A tab information descriptor
   */
  constructor (private text?: string, private tabFmt?: TabInfo) {
    // Start with an empty tree so context() is safe before parse()
    this.lexer = new Lexer(undefined, tabFmt);
    this.currIndent = 0;
    this.root = Parser.makeRoot();
  }

  /** Create a fresh, childless root node. */
  private static makeRoot(): LexNode {
    return new LexNode(
      "root",
      vscode.TreeItemCollapsibleState.None,
      null,
      null,
      null
    );
  }

  /**
   * Parse the passed text.
   *
   * @param `text` Text to parse. If undefined, use current value of `this.text`
   * @param `tabFmt` A tab information descriptor. If undefined, use current
   *        value of `this.tabFmt`
   * @return A parse tree representing `text`.
   */
  parse(text?: string, tabFmt?: TabInfo): LexNode {
    // Only `undefined` means "reuse the saved value": an empty string is
    // real input and must not fall back to previously parsed text.
    if (text !== undefined) {
      this.text = text;
    }
    if (tabFmt !== undefined) {
      this.tabFmt = tabFmt;
    }

    // initialize root; either saved value might still be undefined
    this.lexer = new Lexer(this.text, this.tabFmt);
    this.root = Parser.makeRoot();

    // parse children
    this.currIndent = 0;
    const children = this._parse(this.root);
    if (children.length > 0) {
      this.root.adopt(children);
    }
    return this.root;
  }

  /**
   * Parse the blocks at the current indent level, consuming tokens until
   * a token with a smaller indent level (or EOF) is reached.
   *
   * @param `parent` The node the parsed blocks belong to.
   * @return The block nodes found at this level, in source order.
   */
  private _parse(parent: LexNode | null): LexNode[] {
    const children: LexNode[] = [];
    while (this.lexer.currToken() !== EOFTOKEN) {
      const token = this.lexer.currToken();
      if (token.indentLevel < this.currIndent) {
        // go up 1 level of recursion at a time to unravel properly;
        // the token is left in place for the caller to re-examine
        this.currIndent--;
        return children;
      } else if (token.type === Symbol.INDENT) {
        // regular code, advance and stay in same block
        this.lexer.next();
        continue;
      } else {
        // new block starts here
        const label = token.type + (token.attr === undefined ? "" : " " + token.attr);
        const blockRoot = new LexNode(
          label,
          vscode.TreeItemCollapsibleState.None,
          token,
          null,
          parent
        );
        this.lexer.next();
        this.currIndent++;
        const blockChildren = this._parse(blockRoot); // Recursively parse all child blocks
        if (blockChildren.length > 0) {
          blockRoot.adopt(blockChildren);
        }
        children.push(blockRoot);
      }
    }
    return children;
  }

  /**
   * Get an array of LexNodes representing the rootpath of LexNodes from the
   * passed line number to the root of the document. A list of "this" inside
   * "that" inside ... inside the document root.
   *
   * The enclosing block is chosen by line number only: at each level it is
   * the last block that starts at or before `lineNumber`. Indentation of
   * the queried line is not consulted, so a line that follows a block is
   * still attributed to that block.
   *
   * @param `lineNumber` The line number to query context for, compared
   *        against the 0-indexed line numbers stored in the tokens.
   * @return An array of LexNodes for the root path containing `lineNumber`.
   *         Empty when the tree has no blocks at all (including when
   *         `parse()` has not been called yet).
   */
  context(lineNumber: number): LexNode[] {
    if (!this.root.children()) {
      return [];
    }

    // Returns the innermost LexNode that encloses the queried line number
    const find = (node: LexNode): LexNode => {
      let enclosing: LexNode | undefined;
      for (const child of node.children() || []) {
        if (child.token && lineNumber < child.token.linenr) {
          // this child and all later ones start after the line
          break;
        }
        enclosing = child;
      }

      if (enclosing === undefined) {
        // the line comes before every child block, so it belongs
        // directly to `node`
        return node;
      }
      // Also reached when the loop ran to the end: the last child is
      // then the candidate, not "nothing found".
      return enclosing.children() ? find(enclosing) : enclosing;
    };

    return find(this.root).rootPath();
  }
}

66
src/pylex/token.ts Normal file
View File

@@ -0,0 +1,66 @@
/* eslint-disable @typescript-eslint/naming-convention */
/* ^ allow uppercase enum */
/**
 * LineToken Symbol Types
 *
 * The kind of line a LineToken describes. The string values are also
 * what `LineToken.toString()` prints for the type.
 *
 * NOTE: a module that imports this enum under the name `Symbol` shadows
 * the built-in global `Symbol` within that module.
 */
export enum Symbol {
// `def` line
FUNCTION = "function",
// `class` line
CLASS = "class",
// `if` line
IF = "if",
// `else` line
ELSE = "else",
// `elif` line
ELIF = "elif",
// `for` line
FOR = "for",
// `while` line
WHILE = "while",
// `try` line
TRY = "try",
// `except` line
EXCEPT = "except",
// `finally` line
FINALLY = "finally",
// `with` line
WITH = "with",
INDENT = "INDENT", // Indent token, default if not EOF, only contains indent information
// End of input; see EOFTOKEN
EOF = "EOF"
}
/**
 * Describes how a document is indented.
 */
export type TabInfo = {
// The width of a tab in spaces
size: number,
// Whether to use literal tab characters
hard: boolean,
};
/**
 * Token describing a single line of a Python file.
 */
export default class LineToken {
  /**
   * @param `type` The type of token for this line.
   * @param `linenr` The line number (0-indexed)
   * @param `indentLevel` The level of indentation.
   * @param `attr` Additional item for tokens that might need it.
   */
  constructor(
    public readonly type: Symbol,
    public readonly linenr: number,
    public readonly indentLevel: number,
    public readonly attr?: any // Any additional things a token might need (class name, control condition)
  ) { }

  /**
   * Render the token as text; the line number is printed 1-indexed.
   *
   * @return A string representation of the token
   */
  toString(): string {
    const displayLine = this.linenr + 1;
    let repr: string = this.type + ", linenr:" + displayLine;
    repr += ", indentLevel: " + this.indentLevel;
    repr += ", attr: " + this.attr;
    return repr;
  }
}
/**
 * The End-Of-File token
 *
 * Shared sentinel with type `Symbol.EOF` and a line number and indent
 * level of -1. The lexer's `next()` returns it once no lines are left,
 * and a lexer without input starts out on it.
 */
export const EOFTOKEN: LineToken = new LineToken(Symbol.EOF, -1, -1);