added 2 functions

added functions getLeadingSpacesByArithmetic and getLeadingSpacesByIndex to facilitate functionality of finding the number of leading spaces via two methods.
2025-02-04 10:38:42 +00:00 · 2022-03-24 04:03:17 -05:00
parent d8c923e13c
commit e67f731bc6
1 changed files with 205 additions and 197 deletions
--- a/src/pylex/lexer.ts
+++ b/src/pylex/lexer.ts
@@ -1,215 +1,223 @@
-import { LineToken } from '.';
-import { Symbol, EOFTOKEN, TabInfo } from './token';
-
-type Rule = {
-  pattern: RegExp,
-  type: Symbol,
-};
-
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const _1 = require(".");
+const token_1 = require("./token");
 /**
 * List of recognition patterns, in order of priority
 * The first item is a recognition pattern, used to recognize the token
 * the second item is the token type
 */
-const rules: Rule[] = [
-  {
-    pattern: /^\s*def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\(/,
-    type: Symbol.FUNCTION
-  },
-  {
-    pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/,
-    type: Symbol.CLASS
-  },
-  {
-    pattern: /^\s*if\s+(?<attr>[^:]+):\s*/,
-    type: Symbol.IF
-  },
-  {
-    pattern: /^\s*elif\s+(?<attr>[^:]+):\s*$/,
-    type: Symbol.ELIF
-  },
-  {
-    pattern: /^\s*else\s*:/,
-    type: Symbol.ELSE
-  },
-  {
-    pattern: /^\s*for\s+(?<attr>[^:]+):\s*$/,
-    type: Symbol.FOR
-  },
-  {
-    pattern: /^\s*while\s+(?<attr>[^:]+):\s*$/,
-    type: Symbol.WHILE
-  },
-  {
-    pattern: /^\s*try\s*:/,
-    type: Symbol.TRY
-  },
-  {
-    pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*$/,
-    type: Symbol.EXCEPT
-  },
-  {
-    pattern: /^\s*finally\s*:\s*$/,
-    type: Symbol.FINALLY
-  },
-  {
-    pattern: /^\s*with\s+(?<attr>[^:]+):\s*$/,
-    type: Symbol.WITH
-  },
+const rules = [
+    {
+        pattern: /^\s*def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\(/,
+        type: token_1.Symbol.FUNCTION
+    },
+    {
+        pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/,
+        type: token_1.Symbol.CLASS
+    },
+    {
+        pattern: /^\s*if\s+(?<attr>[^:]+):\s*/,
+        type: token_1.Symbol.IF
+    },
+    {
+        pattern: /^\s*elif\s+(?<attr>[^:]+):\s*$/,
+        type: token_1.Symbol.ELIF
+    },
+    {
+        pattern: /^\s*else\s*:/,
+        type: token_1.Symbol.ELSE
+    },
+    {
+        pattern: /^\s*for\s+(?<attr>[^:]+):\s*$/,
+        type: token_1.Symbol.FOR
+    },
+    {
+        pattern: /^\s*while\s+(?<attr>[^:]+):\s*$/,
+        type: token_1.Symbol.WHILE
+    },
+    {
+        pattern: /^\s*try\s*:/,
+        type: token_1.Symbol.TRY
+    },
+    {
+        pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*$/,
+        type: token_1.Symbol.EXCEPT
+    },
+    {
+        pattern: /^\s*finally\s*:\s*$/,
+        type: token_1.Symbol.FINALLY
+    },
+    {
+        pattern: /^\s*with\s+(?<attr>[^:]+):\s*$/,
+        type: token_1.Symbol.WITH
+    },
 ];
-
 /**
 * Line-By-Line Lexer
 */
-export default class Lexer {
-  private textLines: string[] = []; // array of text lines
-  private pos: number = 0;
-  private _currToken: LineToken = EOFTOKEN;
-
-  /**
-   * Calculates indentation level for a line. If using soft tabs,
-   * indent level rounds up (so, tabSize+1 spaces is 2 levels,
-   * 2*tabSize+1 is 3, etc.)
-   *
-   * @param `text` The line of text.
-   * @param `tabFmt` A tab information descriptor.
-   * @return The indent of `text` with consideration for `tabFmt`.
-   */
-  static getIndent(text: string, tabFmt: TabInfo): number {
-    let leadingSpace: number = text.length - text.trimLeft().length;
-    let indent: number;
-    if (tabFmt.hard) {
-      // used tabs
-      indent = leadingSpace;
-    } else {
-      // use spaces
-      indent = Math.ceil(leadingSpace/tabFmt.size!);
-    }
-
-    return indent;
-  }
-
-  /**
-   * @param `text` The text to lex.
-   * @param `tabFmt` A tab information descriptor
-   */
-  constructor(text?: string, private tabFmt?: TabInfo) {
-    // default is 4 wide expanded tabs
-    this.tabFmt = {
-      ...{size: 4, hard: false},
-      ...tabFmt
-    };
-
-    if (text) {
-      // normalize linefeeds
-      text = text.replace('\r\n', '\n');
-    }
-    this.restart(text);
-  }
-
-  /**
-   * Restart lexer with new text.
-   *
-   * @param `text` The new text to lex.
-   */
-  restart(text?: string): void {
-    this.pos = 0;
-    this._currToken = EOFTOKEN; // if no input, already on EOFTOKEN
-    if (text) {
-      this.textLines = text.split('\n');
-      this.next(); // advance to the first token
-    }
-  }
-
-  /**
-   * @return the current {@link LineToken}.
-   */
-  currToken(): LineToken { return this._currToken; }
-
-  /**
-   * Advance the position in the token stream.
-   *
-   * @return The new current token, after advancing
-   */
-  next(): LineToken {
-    if (this._currToken === EOFTOKEN && this.pos > this.textLines.length) {
-      throw new Error('Cannot advance past end');
-    }
-
-    // Until a LineToken is found, or EOF
-    while (this.pos < this.textLines.length) {
-      let line: string = this.textLines[this.pos];
-      let indent: number = Lexer.getIndent(line, this.tabFmt!);
-      let token: LineToken;
-      for (var r of rules) {
-        // Does line match pattern?
-        let match: RegExpMatchArray | null = line.match(r.pattern);
-        if (match) {
-          // Yes...
-          if (match.groups) {
-            token = new LineToken(r.type, this.pos, indent, match.groups["attr"]);
-          } else {
-            token = new LineToken(r.type, this.pos, indent);
-          }
-
-          this._currToken = token;
-          this.pos++;
-          return this.currToken();
+class Lexer {
+    /**
+     * @param `text` The text to lex.
+     * @param `tabFmt` A tab information descriptor
+     */
+    constructor(text, tabFmt) {
+        this.tabFmt = tabFmt;
+        this.textLines = []; // array of text lines
+        this.pos = 0;
+        this._currToken = token_1.EOFTOKEN;
+        // default is 4 wide expanded tabs
+        this.tabFmt = Object.assign({ size: 4, hard: false }, tabFmt);
+        if (text) {
+            // normalize linefeeds
+            text = text.replace('\r\n', '\n');
        }
-      }
-      // No rules matched
-
-      // TODO: move to rules
-      if (/^\s*(#.*)?$/.test(line)) {
-        // "empty" line
-        token = new LineToken(Symbol.EMPTY, this.pos, 999999);
-      } else {
-        // This is an INDENT token
-        token = new LineToken(Symbol.INDENT, this.pos, indent);
-      }
-
-      this._currToken = token;
-      this.pos++;
-      return this.currToken();
+        this.restart(text);
    }
-
-    // Didn't return, must be EOF
-    this._currToken = EOFTOKEN;
-    this.pos++;
-    return this.currToken();
-  }
-
-  /**
-   * Move backwards in the token stream
-   *
-   * @param `n` The number of positions to retract.
-   * @return The new current token after retracting.
-   */
-  retract(n: number = 1): LineToken {
-    if (this.pos - 1 - n < 0) {
-      // -1 because this.pos is currently on the next token
-      throw new RangeError('Cannot retract past start');
+    /**
+     * Calculates indentation level for a line. If using soft tabs,
+     * indent level rounds up (so, tabSize+1 spaces is 2 levels,
+     * 2*tabSize+1 is 3, etc.)
+     *
+     * @param `text` The line of text.
+     * @param `tabFmt` A tab information descriptor.
+     * @return The indent of `text` with consideration for `tabFmt`.
+     */
+    static getIndent(text, tabFmt) {
+        let leadingSpace = text.length - text.trimLeft().length;
+        let indent;
+        if (tabFmt.hard) {
+            // used tabs
+            indent = leadingSpace;
+        }
+        else {
+            // use spaces
+            indent = Math.ceil(leadingSpace / tabFmt.size);
+        }
+        return indent;
    }
+    /**
+     * Calculates leading spaces for a line. 
+     * This method uses arithmetic to calculate the number of leading spaces
+     *  
+     * @param `text` The line of text.
+     * @return The number of leading spaces of `text`.
+     */
+    static getLeadingSpacesByArithmetic(textLine) {
+        const leadingSpaces = textLine.text.length - textLine.text.trimStart().length;

-    if (n <= 0) {
-      throw new RangeError('Retract distance must be positive');
+        return leadingSpaces;
    }
+    /**
+     * Calculates leading spaces for a line. 
+     * This method finds the index position of the first non-whitespace character
+     * Since the index is built using a 0-index, the position of this character
+     * will equal the number of spaces preceding the character.
+     *  
+     * @param `text` The line of text.
+     * @return The number of leading spaces of `text` with respect to the index position of the first non-whitespace character.
+     */
+    static getLeadingSpacesByIndex(textLine) {
+        const indexNum = textLine.firstNonWhitespaceCharacterIndex;

-    if (this.pos - n === 0) {
-      // just restart
-      this.pos = 0;
-      return this.next();
+        return indexNum;
    }
-
-    let c = n + 1;
-    while (c > 0) {
-      this.pos--;
-      while (/^\s*(#.*)?$/.test(this.textLines[this.pos])) {
-        // Skip empty lines
-        this.pos--;
-      }
-      c--;
+    /**
+     * Restart lexer with new text.
+     *
+     * @param `text` The new text to lex.
+     */
+    restart(text) {
+        this.pos = 0;
+        this._currToken = token_1.EOFTOKEN; // if no input, already on EOFTOKEN
+        if (text) {
+            this.textLines = text.split('\n');
+            this.next(); // advance to the first token
+        }
+    }
+    /**
+     * @return the current {@link LineToken}.
+     */
+    currToken() { return this._currToken; }
+    /**
+     * Advance the position in the token stream.
+     *
+     * @return The new current token, after advancing
+     */
+    next() {
+        if (this._currToken === token_1.EOFTOKEN && this.pos > this.textLines.length) {
+            throw new Error('Cannot advance past end');
+        }
+        // Until a LineToken is found, or EOF
+        while (this.pos < this.textLines.length) {
+            let line = this.textLines[this.pos];
+            let indent = Lexer.getIndent(line, this.tabFmt);
+            let token;
+            for (var r of rules) {
+                // Does line match pattern?
+                let match = line.match(r.pattern);
+                if (match) {
+                    // Yes...
+                    if (match.groups) {
+                        token = new _1.LineToken(r.type, this.pos, indent, match.groups["attr"]);
+                    }
+                    else {
+                        token = new _1.LineToken(r.type, this.pos, indent);
+                    }
+                    this._currToken = token;
+                    this.pos++;
+                    return this.currToken();
+                }
+            }
+            // No rules matched
+            // TODO: move to rules
+            if (/^\s*(#.*)?$/.test(line)) {
+                // "empty" line
+                token = new _1.LineToken(token_1.Symbol.EMPTY, this.pos, 999999);
+            }
+            else {
+                // This is an INDENT token
+                token = new _1.LineToken(token_1.Symbol.INDENT, this.pos, indent);
+            }
+            this._currToken = token;
+            this.pos++;
+            return this.currToken();
+        }
+        // Didn't return, must be EOF
+        this._currToken = token_1.EOFTOKEN;
+        this.pos++;
+        return this.currToken();
+    }
+    /**
+     * Move backwards in the token stream
+     *
+     * @param `n` The number of positions to retract.
+     * @return The new current token after retracting.
+     */
+    retract(n = 1) {
+        if (this.pos - 1 - n < 0) {
+            // -1 because this.pos is currently on the next token
+            throw new RangeError('Cannot retract past start');
+        }
+        if (n <= 0) {
+            throw new RangeError('Retract distance must be positive');
+        }
+        if (this.pos - n === 0) {
+            // just restart
+            this.pos = 0;
+            return this.next();
+        }
+        let c = n + 1;
+        while (c > 0) {
+            this.pos--;
+            while (/^\s*(#.*)?$/.test(this.textLines[this.pos])) {
+                // Skip empty lines
+                this.pos--;
+            }
+            c--;
+        }
+        return this.next();
    }
-    return this.next();
-  }
 }
+exports.default = Lexer;
+//# sourceMappingURL=lexer.js.map