Integrate Parser (#4)

Integrate parser
2025-02-04 10:38:42 +00:00 · 2021-10-26 12:48:04 -05:00
parent 9db476dd39
commit d105544596
15 changed files with 1234 additions and 16 deletions
--- a/src/pylex/index.ts
+++ b/src/pylex/index.ts
@@ -0,0 +1,7 @@
+// expose parser by default
+export {default as Parser} from './parser';
+export {default as LineToken} from './token';
+export {default as Lexer} from './lexer';
+export {default as LexNode} from './node';
+export {TabInfo as TabInfo} from './token';
+
--- a/src/pylex/lexer.ts
+++ b/src/pylex/lexer.ts
@@ -0,0 +1,214 @@
+import { LineToken } from '.';
+import { Symbol, EOFTOKEN, TabInfo } from './token';
+
+type Rule = {       // One recognition rule: a line pattern paired with the token type it yields
+  pattern: RegExp,  // Matched against the raw text of a single line
+  type: Symbol,     // Token type given to a line that matches `pattern`
+};
+
+/**
+ * Recognition rules, in order of priority: the lexer tries them top to
+ * bottom against each line and the first match wins.
+ *
+ * `pattern` recognizes the line and `type` is the resulting token type. When
+ * a pattern has a named group `attr`, the lexer stores that group's match on
+ * the token (function/class name, loop or branch condition, ...).
+ *
+ * Patterns that are anchored at the end of the line also accept a trailing
+ * `# comment`, since Python allows one after any block header.
+ */
+const rules: Rule[] = [
+  {
+    // `\s*` before the parenthesis: `def name (args):` is valid Python
+    pattern: /^\s*def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\s*\(/,
+    type: Symbol.FUNCTION
+  },
+  {
+    pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/,
+    type: Symbol.CLASS
+  },
+  {
+    // not anchored at the end, so one-line forms (`if x: return`) match too
+    pattern: /^\s*if\s+(?<attr>[^:]+):\s*/,
+    type: Symbol.IF
+  },
+  {
+    pattern: /^\s*elif\s+(?<attr>[^:]+):\s*(#.*)?$/,
+    type: Symbol.ELIF
+  },
+  {
+    pattern: /^\s*else\s*:/,
+    type: Symbol.ELSE
+  },
+  {
+    pattern: /^\s*for\s+(?<attr>[^:]+):\s*(#.*)?$/,
+    type: Symbol.FOR
+  },
+  {
+    pattern: /^\s*while\s+(?<attr>[^:]+):\s*(#.*)?$/,
+    type: Symbol.WHILE
+  },
+  {
+    pattern: /^\s*try\s*:/,
+    type: Symbol.TRY
+  },
+  {
+    // `attr` is optional here: a bare `except:` has no exception clause
+    pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*(#.*)?$/,
+    type: Symbol.EXCEPT
+  },
+  {
+    pattern: /^\s*finally\s*:\s*(#.*)?$/,
+    type: Symbol.FINALLY
+  },
+  {
+    pattern: /^\s*with\s+(?<attr>[^:]+):\s*(#.*)?$/,
+    type: Symbol.WITH
+  },
+];
+
+/**
+ * Line-by-line lexer.
+ *
+ * Every line that is not empty, whitespace-only or comment-only yields one
+ * {@link LineToken}: a block token when the line matches one of `rules`,
+ * otherwise a `Symbol.INDENT` token that only carries indentation. After a
+ * token has been produced, `pos` is one past that token's line.
+ */
+export default class Lexer {
+  /** Lines that never yield a token: empty, whitespace-only or comment-only. */
+  private static readonly skippable: RegExp = /^\s*(#.*)?$/;
+
+  private textLines: string[] = []; // text being lexed, one entry per line
+  private pos: number = 0; // index of the next line to examine
+  private _currToken: LineToken = EOFTOKEN;
+
+  /**
+   * Calculates indentation level for a line. If using soft tabs,
+   * indent level rounds up (so, tabSize+1 spaces is 2 levels,
+   * 2*tabSize+1 is 3, etc.)
+   *
+   * @param `text` The line of text.
+   * @param `tabFmt` A tab information descriptor.
+   * @return The indent of `text` with consideration for `tabFmt`.
+   */
+  static getIndent(text: string, tabFmt: TabInfo): number {
+    const leadingSpace = text.length - text.trimStart().length;
+    if (tabFmt.hard) {
+      // hard tabs: every leading whitespace character is one level
+      return leadingSpace;
+    }
+    // soft tabs: a partially filled level still counts, hence ceil
+    return Math.ceil(leadingSpace / tabFmt.size);
+  }
+
+  /**
+   * @param `text` The text to lex.
+   * @param `tabFmt` A tab information descriptor. Missing fields default to
+   *                 4-wide soft tabs.
+   */
+  constructor(text?: string, private tabFmt?: TabInfo) {
+    // default is 4 wide expanded tabs
+    this.tabFmt = { size: 4, hard: false, ...tabFmt };
+    this.restart(text);
+  }
+
+  /**
+   * Restart lexer with new text.
+   *
+   * Without text (or with an empty string) the lexer holds no lines and sits
+   * on EOFTOKEN.
+   *
+   * @param `text` The new text to lex.
+   */
+  restart(text?: string): void {
+    this.pos = 0;
+    this._currToken = EOFTOKEN; // if no input, already on EOFTOKEN
+    if (text) {
+      // splitting on /\r?\n/ normalizes every CRLF, not only the first one
+      this.textLines = text.split(/\r?\n/);
+      this.next(); // advance to the first token
+    } else {
+      // drop lines left over from a previous run so next() cannot lex them
+      this.textLines = [];
+    }
+  }
+
+  /**
+   * @return the current {@link LineToken}.
+   */
+  currToken(): LineToken { return this._currToken; }
+
+  /**
+   * Advance the position in the token stream.
+   *
+   * @return The new current token, after advancing
+   * @throws Error if the lexer has already advanced onto EOFTOKEN at the end
+   *         of the text.
+   */
+  next(): LineToken {
+    if (this._currToken === EOFTOKEN && this.pos > this.textLines.length) {
+      throw new Error('Cannot advance past end');
+    }
+
+    // Until a LineToken is found, or EOF
+    while (this.pos < this.textLines.length) {
+      const line = this.textLines[this.pos];
+      const indent = Lexer.getIndent(line, this.tabFmt!);
+
+      for (const rule of rules) {
+        const match = line.match(rule.pattern);
+        if (match) {
+          // patterns without a named group leave `groups` undefined
+          const attr = match.groups ? match.groups["attr"] : undefined;
+          return this.advanceTo(new LineToken(rule.type, this.pos, indent, attr));
+        }
+      }
+
+      // No rules matched: skip whitespace, comment, or empty lines
+      if (Lexer.skippable.test(line)) {
+        this.pos++;
+        continue;
+      }
+
+      // Any other line is plain code and only carries indentation
+      return this.advanceTo(new LineToken(Symbol.INDENT, this.pos, indent));
+    }
+
+    // Didn't return, must be EOF. pos still moves on so that a further
+    // next() is detected by the guard at the top.
+    return this.advanceTo(EOFTOKEN);
+  }
+
+  /**
+   * Move backwards in the token stream
+   *
+   * @param `n` The number of positions to retract.
+   * @return The new current token after retracting.
+   * @throws RangeError if `n` is not positive, or if there are fewer than `n`
+   *         tokens before the current one. The lexer state is left untouched
+   *         when this is thrown.
+   */
+  retract(n: number = 1): LineToken {
+    if (n <= 0) {
+      throw new RangeError('Retract distance must be positive');
+    }
+
+    // pos sits one past the current token, so step back n + 1 token lines
+    // and let next() re-read the target. Work on a copy so that a failed
+    // retract does not corrupt the position.
+    let target = this.pos;
+    for (let remaining = n + 1; remaining > 0; remaining--) {
+      target--;
+      while (
+        target >= 0 &&
+        target < this.textLines.length &&
+        Lexer.skippable.test(this.textLines[target])
+      ) {
+        // Skip lines that never produced a token
+        target--;
+      }
+      if (target < 0) {
+        throw new RangeError('Cannot retract past start');
+      }
+    }
+
+    this.pos = target;
+    return this.next();
+  }
+
+  /**
+   * Make `token` the current token and move `pos` past it.
+   *
+   * @param `token` The token that was just recognized.
+   * @return `token`, for convenient returning from `next()`.
+   */
+  private advanceTo(token: LineToken): LineToken {
+    this._currToken = token;
+    this.pos++;
+    return token;
+  }
+}
--- a/src/pylex/node.ts
+++ b/src/pylex/node.ts
@@ -0,0 +1,82 @@
+import * as vscode from 'vscode';
+
+import LineToken from './token';
+
+/**
+ * A node in a Parse tree.
+ *
+ * Extends `vscode.TreeItem`; the tooltip is the label, followed by the
+ * 1-based line number when the node has a token with a non-negative line.
+ */
+export default class LexNode extends vscode.TreeItem {
+
+  /**
+   * @param `label` A human-readable string describing this item
+   * @param `collapsibleState` {@link TreeItemCollapsibleState} of the tree item.
+   * @param `token` The token at this node.
+   * @param `_children` The children in this node's subtree.
+   * @param `_parent` The parent node of this node.
+   */
+  constructor(
+    public readonly label: string,
+    public readonly collapsibleState: vscode.TreeItemCollapsibleState,
+    public readonly token: LineToken | null,
+    private _children: LexNode[] | null = null,
+    private _parent: LexNode | null = null,
+  ) {
+    super(label, collapsibleState);
+    this.tooltip = this.label;
+    if (this.token && this.token.linenr >= 0) {
+      // linenr is 0-indexed; show it 1-indexed
+      this.tooltip += `: ${this.token.linenr+1}`;
+    }
+  }
+
+  /**
+   * @return The children of this node.
+   */
+  children(): LexNode[] | null {
+    return this._children;
+  }
+
+  /**
+   * @return The parent of this node.
+   */
+  parent(): LexNode | null {
+    return this._parent;
+  }
+
+  /**
+   * Adopt child nodes.
+   *
+   * The passed nodes are not modified: each one is copied with this node as
+   * its parent, and the copies are appended to this node's children.
+   *
+   * @param `children` Array of nodes to adopt.
+   */
+  adopt(children: LexNode[]): void {
+    const parentedChildren = children.map(c => new LexNode(
+      c.label,
+      c.collapsibleState,
+      c.token,
+      c.children(),
+      this
+    ));
+
+    // Always store the re-parented copies, whether or not there were
+    // children before.
+    this._children = this._children
+      ? this._children.concat(parentedChildren)
+      : parentedChildren;
+  }
+
+  /**
+   * Return the root path for this node.
+   *
+   * @return A path of parent nodes from this node to the root of the tree.
+   *         Every entry is a fresh copy of the corresponding node.
+   */
+  rootPath(): LexNode[] {
+    const self = new LexNode(this.label, this.collapsibleState, this.token, this._children, this._parent);
+    return this._parent ? [self].concat(this._parent.rootPath()) : [self];
+  }
+}
--- a/src/pylex/parser.ts
+++ b/src/pylex/parser.ts
@@ -0,0 +1,133 @@
+import * as vscode from 'vscode';
+
+import { EOFTOKEN, Symbol, TabInfo } from './token';
+import Lexer from './lexer';
+import LexNode from './node';
+
+/**
+ * A parse tree generator
+ *
+ * Builds a tree of {@link LexNode}s from the block tokens produced by
+ * {@link Lexer}; nesting is derived from the tokens' indent levels.
+ */
+export default class Parser {
+  private lexer: Lexer = new Lexer();
+  private currIndent: number = 0; // nesting depth of the block being parsed
+  private root: LexNode | undefined; // Root of syntax tree, unset until parse()
+
+  /**
+   * @param `text` Text to parse.
+   * @param `tabFmt` A tab information descriptor
+   */
+  constructor (private text?: string, private tabFmt?: TabInfo) {}
+
+  /**
+   * Parse the passed text.
+   *
+   * @param `text` Text to parse. If undefined, use current value of `this.text`
+   * @param `tabFmt` A tab information descriptor
+   * @return A parse tree representing `text`.
+   */
+  parse(text?: string, tabFmt?: TabInfo): LexNode {
+    // Compare against undefined rather than truthiness: an empty string is
+    // a real (empty) document and must not fall back to the previous text.
+    if (text !== undefined) {
+      this.text = text;
+    }
+    if (tabFmt) {
+      this.tabFmt = tabFmt;
+    }
+
+    // initialize root; this.text / this.tabFmt might still be undefined
+    this.lexer = new Lexer(this.text, this.tabFmt);
+    const root = new LexNode(
+      "root",
+      vscode.TreeItemCollapsibleState.None,
+      null,
+      null,
+      null
+    );
+    this.root = root;
+
+    // parse children
+    this.currIndent = 0;
+    const children = this._parse(root);
+
+    if (children.length > 0) {
+      root.adopt(children);
+    }
+    return root;
+  }
+
+  /**
+   * Parse the blocks nested directly inside `parent`, consuming tokens until
+   * the indentation drops below the current level or EOF is reached.
+   *
+   * @param `parent` The node the parsed blocks belong to.
+   * @return The blocks found at this level, each with its own subtree.
+   */
+  private _parse(parent: LexNode | null): LexNode[] {
+    const children: LexNode[] = [];
+    while (this.lexer.currToken() !== EOFTOKEN) {
+      const token = this.lexer.currToken();
+      if (token.indentLevel < this.currIndent) {
+        // go up 1 level of recursion at a time to unravel properly
+        this.currIndent--;
+        return children;
+      }
+
+      if (token.type === Symbol.INDENT) {
+        // regular code, advance and stay in same block
+        this.lexer.next();
+        continue;
+      }
+
+      // new block starts here
+      const label = token.type + (token.attr === undefined ? "" : " " + token.attr);
+      const blockRoot = new LexNode(
+        label,
+        vscode.TreeItemCollapsibleState.None,
+        token,
+        null,
+        parent
+      );
+      this.lexer.next();
+      this.currIndent++;
+      const blockChildren = this._parse(blockRoot); // Recursively parse all child blocks
+      if (blockChildren.length > 0) {
+        blockRoot.adopt(blockChildren);
+      }
+      children.push(blockRoot);
+    }
+    return children;
+  }
+
+  /**
+   * Get an array of LexNodes representing the rootpath of LexNodes from the
+   * passed line number to the root of the document. A list of "this" inside
+   * "that" inside ... inside the document root.
+   *
+   * The containing block is chosen by line number only: at each level it is
+   * the last block that starts at or before `lineNumber`. A line that comes
+   * before every block of a level belongs to that level's parent.
+   *
+   * @param `lineNumber` The line number to query context for (0-indexed,
+   *                     like `LineToken.linenr`).
+   * @return An array of LexNodes for the root path containing `lineNumber`;
+   *         empty if nothing has been parsed or the tree has no blocks.
+   */
+  context(lineNumber: number): LexNode[] {
+    if (!this.root || !this.root.children()) {
+      return [];
+    }
+
+    // Returns the deepest LexNode whose block contains the queried line
+    const find = (node: LexNode): LexNode => {
+      let candidate: LexNode | undefined;
+      for (const child of node.children() || []) {
+        if (child.token && lineNumber < child.token.linenr) {
+          // this child and all later siblings start after the line
+          break;
+        }
+        candidate = child;
+      }
+
+      if (!candidate) {
+        // the line precedes every child: it sits directly inside `node`
+        return node;
+      }
+      return candidate.children() ? find(candidate) : candidate;
+    };
+
+    return find(this.root).rootPath();
+  }
+}
--- a/src/pylex/token.ts
+++ b/src/pylex/token.ts
@@ -0,0 +1,66 @@
+/* eslint-disable @typescript-eslint/naming-convention */
+/* ^ allow uppercase enum */
+
+/**
+ * LineToken Symbol Types
+ *
+ * The kinds of token a line can be lexed into. The string values are also
+ * used as the leading part of node labels built by the parser.
+ *
+ * Note: inside any module that declares or imports it, this enum shadows the
+ * built-in global `Symbol`.
+ */
+export enum Symbol {
+  FUNCTION = "function", // a `def` line
+  CLASS = "class",
+  IF = "if",
+  ELSE = "else",
+  ELIF = "elif",
+  FOR = "for",
+  WHILE = "while",
+  TRY = "try",
+  EXCEPT = "except",
+  FINALLY = "finally",
+  WITH = "with",
+  INDENT = "INDENT", // Indent token, default if not EOF, only contains indent information
+  EOF = "EOF" // type of the end-of-file token
+}
+
+/**
+ * Describes how a document is indented.
+ *
+ * `size` is the width of a tab in spaces; with soft tabs the lexer divides
+ * the leading whitespace of a line by it to get the indent level.
+ * `hard` tells whether literal tab characters are used; if so, the lexer
+ * counts every leading whitespace character as one indent level.
+ */
+export type TabInfo = {
+  size: number,  // The width of a tab in spaces
+  hard: boolean, // Whether to use literal tab characters
+};
+
+/**
+ * Token describing a single line of a Python file.
+ */
+export default class LineToken {
+
+  /**
+   * @param `type` The kind of token this line was lexed into.
+   * @param `linenr` Line number of the line, counted from 0.
+   * @param `indentLevel` Indentation level of the line.
+   * @param `attr` Optional extra data for tokens that need it.
+   */
+  constructor(
+    public readonly type: Symbol,
+    public readonly linenr: number,
+    public readonly indentLevel: number,
+    public readonly attr?: any // Any additional things a token might need (class name, control condition)
+  ) { }
+
+  /**
+   * @return A human-readable description of the token; the line number is
+   *         shown 1-indexed.
+   */
+  toString(): string {
+    const fields = [
+      "linenr:" + (this.linenr + 1),
+      "indentLevel: " + this.indentLevel,
+      "attr: " + this.attr,
+    ];
+    return this.type + ", " + fields.join(", ");
+  }
+}
+
+/**
+ * The End-Of-File token
+ *
+ * A single shared instance with line number and indent level of -1. The
+ * lexer's `next()` returns it once there are no more lines that yield a
+ * token.
+ */
+export const EOFTOKEN = new LineToken(Symbol.EOF, -1, -1);