diff --git a/src/pylex/lexer.js b/src/pylex/lexer.js new file mode 100644 index 0000000..8ea195b --- /dev/null +++ b/src/pylex/lexer.js @@ -0,0 +1,227 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { + value: true +}); +const _1 = require("."); +const token_1 = require("./token"); +/** + * List of recognition patterns, in order of priority + * The first item is a recognition pattern, used to recognize the token + * the second item is the token type + */ +const rules = [ + { + pattern: /^\s*def\s+(?[a-zA-Z_][a-zA-Z0-9_]*)\(/, + type: token_1.Symbol.FUNCTION + }, + { + pattern: /^\s*class\s+(?[a-zA-Z_][a-zA-Z0-9_]*)/, + type: token_1.Symbol.CLASS + }, + { + pattern: /^\s*if\s+(?[^:]+):\s*/, + type: token_1.Symbol.IF + }, + { + pattern: /^\s*elif\s+(?[^:]+):\s*$/, + type: token_1.Symbol.ELIF + }, + { + pattern: /^\s*else\s*:/, + type: token_1.Symbol.ELSE + }, + { + pattern: /^\s*for\s+(?[^:]+):\s*$/, + type: token_1.Symbol.FOR + }, + { + pattern: /^\s*while\s+(?[^:]+):\s*$/, + type: token_1.Symbol.WHILE + }, + { + pattern: /^\s*try\s*:/, + type: token_1.Symbol.TRY + }, + { + pattern: /^\s*except(\s*(?[^:]+))?:\s*$/, + type: token_1.Symbol.EXCEPT + }, + { + pattern: /^\s*finally\s*:\s*$/, + type: token_1.Symbol.FINALLY + }, + { + pattern: /^\s*with\s+(?[^:]+):\s*$/, + type: token_1.Symbol.WITH + }, +]; +/** + * Line-By-Line Lexer + */ +class Lexer { + /** + * @param `text` The text to lex. + * @param `tabFmt` A tab information descriptor + */ + constructor(text, tabFmt) { + this.tabFmt = tabFmt; + this.textLines = []; // array of text lines + this.pos = 0; + this._currToken = token_1.EOFTOKEN; + // default is 4 wide expanded tabs + this.tabFmt = Object.assign({ + size: 4, + hard: false + }, tabFmt); + if (text) { + // normalize linefeeds + text = text.replace('\r\n', '\n'); + } + this.restart(text); + } + /** + * Calculates indentation level for a line. If using soft tabs, + * indent level rounds up (so, tabSize+1 spaces is 2 levels, + * 2*tabSize+1 is 3, etc.) + * + * @param `text` The line of text. + * @param `tabFmt` A tab information descriptor. + * @return The indent of `text` with consideration for `tabFmt`. + */ + static getIndent(text, tabFmt) { + let leadingSpace = text.length - text.trimLeft().length; + let indent; + if (tabFmt.hard) { + // used tabs + indent = leadingSpace; + } else { + // use spaces + indent = Math.ceil(leadingSpace / tabFmt.size); + } + return indent; + } + /** + * Calculates leading spaces for a line. + * This method uses arithmetic to calculate the number of leading spaces + * + * @param `text` The line of text. + * @return The number of leading spaces of `text`. + */ + static getLeadingSpacesByArithmetic(textLine) { + const leadingSpaces = textLine.text.length - textLine.text.trimStart().length; + + return leadingSpaces; + } + /** + * Calculates leading spaces for a line. + * This method finds the index position of the first non-whitespace character + * Since the index is built using a 0-index, the position of this character + * will equal the number of spaces preceding the character. + * + * @param `text` The line of text. + * @return The number of leading spaces of `text` with respect to the index position of the first non-whitespace character. + */ + static getLeadingSpacesByIndex(textLine) { + const indexNum = textLine.firstNonWhitespaceCharacterIndex; + + return indexNum; + } + /** + * Restart lexer with new text. + * + * @param `text` The new text to lex. + */ + restart(text) { + this.pos = 0; + this._currToken = token_1.EOFTOKEN; // if no input, already on EOFTOKEN + if (text) { + this.textLines = text.split('\n'); + this.next(); // advance to the first token + } + } + /** + * @return the current {@link LineToken}. + */ + currToken() { + return this._currToken; + } + /** + * Advance the position in the token stream. + * + * @return The new current token, after advancing + */ + next() { + if (this._currToken === token_1.EOFTOKEN && this.pos > this.textLines.length) { + throw new Error('Cannot advance past end'); + } + // Until a LineToken is found, or EOF + while (this.pos < this.textLines.length) { + let line = this.textLines[this.pos]; + let indent = Lexer.getIndent(line, this.tabFmt); + let token; + for (var r of rules) { + // Does line match pattern? + let match = line.match(r.pattern); + if (match) { + // Yes... + if (match.groups) { + token = new _1.LineToken(r.type, this.pos, indent, match.groups["attr"]); + } else { + token = new _1.LineToken(r.type, this.pos, indent); + } + this._currToken = token; + this.pos++; + return this.currToken(); + } + } + // No rules matched + // TODO: move to rules + if (/^\s*(#.*)?$/.test(line)) { + // "empty" line + token = new _1.LineToken(token_1.Symbol.EMPTY, this.pos, 999999); + } else { + // This is an INDENT token + token = new _1.LineToken(token_1.Symbol.INDENT, this.pos, indent); + } + this._currToken = token; + this.pos++; + return this.currToken(); + } + // Didn't return, must be EOF + this._currToken = token_1.EOFTOKEN; + this.pos++; + return this.currToken(); + } + /** + * Move backwards in the token stream + * + * @param `n` The number of positions to retract. + * @return The new current token after retracting. + */ + retract(n = 1) { + if (this.pos - 1 - n < 0) { + // -1 because this.pos is currently on the next token + throw new RangeError('Cannot retract past start'); + } + if (n <= 0) { + throw new RangeError('Retract distance must be positive'); + } + if (this.pos - n === 0) { + // just restart + this.pos = 0; + return this.next(); + } + let c = n + 1; + while (c > 0) { + this.pos--; + while (/^\s*(#.*)?$/.test(this.textLines[this.pos])) { + // Skip empty lines + this.pos--; + } + c--; + } + return this.next(); + } +} +exports.default = Lexer; +//# sourceMappingURL=lexer.js.map \ No newline at end of file