mirror of
https://github.com/We-Dont-Byte/Mind_Reader.git
synced 2024-11-15 03:35:59 +00:00
lexter.ts - added types
This commit is contained in:
parent
c087f1666a
commit
a619146afc
@ -1,216 +1,195 @@
|
|||||||
"use strict";
|
import { LineToken } from '.';
|
||||||
Object.defineProperty(exports, "__esModule", {
|
import { Symbol, EOFTOKEN, TabInfo } from './token';
|
||||||
value: true
|
|
||||||
});
|
type Rule = {
|
||||||
const _1 = require(".");
|
pattern: RegExp,
|
||||||
const token_1 = require("./token");
|
type: Symbol,
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* List of recognition patterns, in order of priority
|
* List of recognition patterns, in order of priority
|
||||||
* The first item is a recognition pattern, used to recognize the token
|
* The first item is a recognition pattern, used to recognize the token
|
||||||
* the second item is the token type
|
* the second item is the token type
|
||||||
*/
|
*/
|
||||||
const rules = [
|
const rules: Rule[] = [
|
||||||
{
|
{
|
||||||
pattern: /^\s*def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\(/,
|
pattern: /^\s*def\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)\(/,
|
||||||
type: token_1.Symbol.FUNCTION
|
type: Symbol.FUNCTION
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/,
|
pattern: /^\s*class\s+(?<attr>[a-zA-Z_][a-zA-Z0-9_]*)/,
|
||||||
type: token_1.Symbol.CLASS
|
type: Symbol.CLASS
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*if\s+(?<attr>[^:]+):\s*/,
|
pattern: /^\s*if\s+(?<attr>[^:]+):\s*/,
|
||||||
type: token_1.Symbol.IF
|
type: Symbol.IF
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*elif\s+(?<attr>[^:]+):\s*$/,
|
pattern: /^\s*elif\s+(?<attr>[^:]+):\s*$/,
|
||||||
type: token_1.Symbol.ELIF
|
type: Symbol.ELIF
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*else\s*:/,
|
pattern: /^\s*else\s*:/,
|
||||||
type: token_1.Symbol.ELSE
|
type: Symbol.ELSE
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*for\s+(?<attr>[^:]+):\s*$/,
|
pattern: /^\s*for\s+(?<attr>[^:]+):\s*$/,
|
||||||
type: token_1.Symbol.FOR
|
type: Symbol.FOR
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*while\s+(?<attr>[^:]+):\s*$/,
|
pattern: /^\s*while\s+(?<attr>[^:]+):\s*$/,
|
||||||
type: token_1.Symbol.WHILE
|
type: Symbol.WHILE
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*try\s*:/,
|
pattern: /^\s*try\s*:/,
|
||||||
type: token_1.Symbol.TRY
|
type: Symbol.TRY
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*$/,
|
pattern: /^\s*except(\s*(?<attr>[^:]+))?:\s*$/,
|
||||||
type: token_1.Symbol.EXCEPT
|
type: Symbol.EXCEPT
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*finally\s*:\s*$/,
|
pattern: /^\s*finally\s*:\s*$/,
|
||||||
type: token_1.Symbol.FINALLY
|
type: Symbol.FINALLY
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pattern: /^\s*with\s+(?<attr>[^:]+):\s*$/,
|
pattern: /^\s*with\s+(?<attr>[^:]+):\s*$/,
|
||||||
type: token_1.Symbol.WITH
|
type: Symbol.WITH
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Line-By-Line Lexer
|
* Line-By-Line Lexer
|
||||||
*/
|
*/
|
||||||
class Lexer {
|
export default class Lexer {
|
||||||
|
private textLines: string[] = []; // array of text lines
|
||||||
|
private pos: number = 0;
|
||||||
|
private _currToken: LineToken = EOFTOKEN;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param `text` The text to lex.
|
* @param `text` The text to lex.
|
||||||
* @param `tabFmt` A tab information descriptor
|
* @param `tabFmt` A tab information descriptor
|
||||||
*/
|
*/
|
||||||
constructor(text, tabFmt) {
|
constructor(text ? : string, private tabFmt ? : TabInfo) {
|
||||||
this.tabFmt = tabFmt;
|
|
||||||
this.textLines = []; // array of text lines
|
|
||||||
this.pos = 0;
|
|
||||||
this._currToken = token_1.EOFTOKEN;
|
|
||||||
// default is 4 wide expanded tabs
|
// default is 4 wide expanded tabs
|
||||||
this.tabFmt = Object.assign({
|
this.tabFmt = {
|
||||||
size: 4,
|
...{
|
||||||
hard: false
|
size: 4,
|
||||||
}, tabFmt);
|
hard: false
|
||||||
|
},
|
||||||
|
...tabFmt
|
||||||
|
};
|
||||||
|
|
||||||
if (text) {
|
if (text) {
|
||||||
// normalize linefeeds
|
// normalize linefeeds
|
||||||
text = text.replace('\r\n', '\n');
|
text = text.replace('\r\n', '\n');
|
||||||
}
|
}
|
||||||
this.restart(text);
|
this.restart(text);
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* Calculates indentation level for a line. If using soft tabs,
|
|
||||||
* indent level rounds up (so, tabSize+1 spaces is 2 levels,
|
|
||||||
* 2*tabSize+1 is 3, etc.)
|
|
||||||
*
|
|
||||||
* @param `text` The line of text.
|
|
||||||
* @param `tabFmt` A tab information descriptor.
|
|
||||||
* @return The indent of `text` with consideration for `tabFmt`.
|
|
||||||
*/
|
|
||||||
static getIndent(text, tabFmt) {
|
|
||||||
let leadingSpace = text.length - text.trimLeft().length;
|
|
||||||
let indent;
|
|
||||||
if (tabFmt.hard) {
|
|
||||||
// used tabs
|
|
||||||
indent = leadingSpace;
|
|
||||||
} else {
|
|
||||||
// use spaces
|
|
||||||
indent = Math.ceil(leadingSpace / tabFmt.size);
|
|
||||||
}
|
|
||||||
return indent;
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Calculates leading spaces for a line.
|
|
||||||
* This method uses arithmetic to calculate the number of leading spaces
|
|
||||||
*
|
|
||||||
* @param `text` The line of text.
|
|
||||||
* @return The number of leading spaces of `text`.
|
|
||||||
*/
|
|
||||||
static getLeadingSpacesByArithmetic(textLine) {
|
|
||||||
const leadingSpaces = textLine.text.length - textLine.text.trimStart().length;
|
|
||||||
|
|
||||||
return leadingSpaces;
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Calculates leading spaces for a line.
|
|
||||||
* This method finds the index position of the first non-whitespace character
|
|
||||||
* Since the index is built using a 0-index, the position of this character
|
|
||||||
* will equal the number of spaces preceding the character.
|
|
||||||
*
|
|
||||||
* @param `text` The line of text.
|
|
||||||
* @return The number of leading spaces of `text` with respect to the index position of the first non-whitespace character.
|
|
||||||
*/
|
|
||||||
static getLeadingSpacesByIndex(textLine) {
|
|
||||||
const indexNum = textLine.firstNonWhitespaceCharacterIndex;
|
|
||||||
|
|
||||||
return indexNum;
|
|
||||||
}
|
|
||||||
/**
|
/**
|
||||||
* Restart lexer with new text.
|
* Restart lexer with new text.
|
||||||
*
|
*
|
||||||
* @param `text` The new text to lex.
|
* @param `text` The new text to lex.
|
||||||
*/
|
*/
|
||||||
restart(text) {
|
restart(text ? : string): void {
|
||||||
this.pos = 0;
|
this.pos = 0;
|
||||||
this._currToken = token_1.EOFTOKEN; // if no input, already on EOFTOKEN
|
this._currToken = EOFTOKEN; // if no input, already on EOFTOKEN
|
||||||
|
|
||||||
if (text) {
|
if (text) {
|
||||||
this.textLines = text.split('\n');
|
this.textLines = text.split('\n');
|
||||||
this.next(); // advance to the first token
|
this.next(); // advance to the first token
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the current {@link LineToken}.
|
* @return the current {@link LineToken}.
|
||||||
*/
|
*/
|
||||||
currToken() {
|
currToken(): LineToken {
|
||||||
return this._currToken;
|
return this._currToken;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Advance the position in the token stream.
|
* Advance the position in the token stream.
|
||||||
*
|
*
|
||||||
* @return The new current token, after advancing
|
* @return The new current token, after advancing
|
||||||
*/
|
*/
|
||||||
next() {
|
next(): LineToken {
|
||||||
if (this._currToken === token_1.EOFTOKEN && this.pos > this.textLines.length) {
|
if (this._currToken === EOFTOKEN && this.pos > this.textLines.length) {
|
||||||
throw new Error('Cannot advance past end');
|
throw new Error('Cannot advance past end');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Until a LineToken is found, or EOF
|
// Until a LineToken is found, or EOF
|
||||||
while (this.pos < this.textLines.length) {
|
while (this.pos < this.textLines.length) {
|
||||||
let line = this.textLines[this.pos];
|
let line: string = this.textLines[this.pos];
|
||||||
let indent = Lexer.getIndent(line, this.tabFmt);
|
let indent: number = Lexer.getIndent(line, this.tabFmt!);
|
||||||
let token;
|
let token: LineToken;
|
||||||
|
|
||||||
for (var r of rules) {
|
for (var r of rules) {
|
||||||
// Does line match pattern?
|
// Does line match pattern?
|
||||||
let match = line.match(r.pattern);
|
let match: RegExpMatchArray | null = line.match(r.pattern);
|
||||||
if (match) {
|
if (match) {
|
||||||
// Yes...
|
// Yes...
|
||||||
if (match.groups) {
|
if (match.groups) {
|
||||||
token = new _1.LineToken(r.type, this.pos, indent, match.groups["attr"]);
|
token = new LineToken(r.type, this.pos, indent, match.groups["attr"]);
|
||||||
} else {
|
}
|
||||||
token = new _1.LineToken(r.type, this.pos, indent);
|
else {
|
||||||
|
token = new LineToken(r.type, this.pos, indent);
|
||||||
}
|
}
|
||||||
|
|
||||||
this._currToken = token;
|
this._currToken = token;
|
||||||
this.pos++;
|
this.pos++;
|
||||||
|
|
||||||
return this.currToken();
|
return this.currToken();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// No rules matched
|
// No rules matched
|
||||||
|
|
||||||
// TODO: move to rules
|
// TODO: move to rules
|
||||||
if (/^\s*(#.*)?$/.test(line)) {
|
if (/^\s*(#.*)?$/.test(line)) {
|
||||||
// "empty" line
|
// "empty" line
|
||||||
token = new _1.LineToken(token_1.Symbol.EMPTY, this.pos, 999999);
|
token = new LineToken(Symbol.EMPTY, this.pos, 999999);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// This is an INDENT token
|
// This is an INDENT token
|
||||||
token = new _1.LineToken(token_1.Symbol.INDENT, this.pos, indent);
|
token = new LineToken(Symbol.INDENT, this.pos, indent);
|
||||||
}
|
}
|
||||||
|
|
||||||
this._currToken = token;
|
this._currToken = token;
|
||||||
this.pos++;
|
this.pos++;
|
||||||
|
|
||||||
return this.currToken();
|
return this.currToken();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Didn't return, must be EOF
|
// Didn't return, must be EOF
|
||||||
this._currToken = token_1.EOFTOKEN;
|
this._currToken = EOFTOKEN;
|
||||||
this.pos++;
|
this.pos++;
|
||||||
|
|
||||||
return this.currToken();
|
return this.currToken();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Move backwards in the token stream
|
* Move backwards in the token stream
|
||||||
*
|
*
|
||||||
* @param `n` The number of positions to retract.
|
* @param `n` The number of positions to retract.
|
||||||
* @return The new current token after retracting.
|
* @return The new current token after retracting.
|
||||||
*/
|
*/
|
||||||
retract(n = 1) {
|
retract(n: number = 1): LineToken {
|
||||||
if (this.pos - 1 - n < 0) {
|
if (this.pos - 1 - n < 0) {
|
||||||
// -1 because this.pos is currently on the next token
|
// -1 because this.pos is currently on the next token
|
||||||
throw new RangeError('Cannot retract past start');
|
throw new RangeError('Cannot retract past start');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n <= 0) {
|
if (n <= 0) {
|
||||||
throw new RangeError('Retract distance must be positive');
|
throw new RangeError('Retract distance must be positive');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.pos - n === 0) {
|
if (this.pos - n === 0) {
|
||||||
// just restart
|
// just restart
|
||||||
this.pos = 0;
|
this.pos = 0;
|
||||||
return this.next();
|
return this.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
let c = n + 1;
|
let c = n + 1;
|
||||||
while (c > 0) {
|
while (c > 0) {
|
||||||
this.pos--;
|
this.pos--;
|
||||||
@ -220,8 +199,60 @@ class Lexer {
|
|||||||
}
|
}
|
||||||
c--;
|
c--;
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.next();
|
return this.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates indentation level for a line. If using soft tabs,
|
||||||
|
* indent level rounds up (so, tabSize+1 spaces is 2 levels,
|
||||||
|
* 2*tabSize+1 is 3, etc.)
|
||||||
|
*
|
||||||
|
* @param `text` The line of text.
|
||||||
|
* @param `tabFmt` A tab information descriptor.
|
||||||
|
* @return The indent of `text` with consideration for `tabFmt`.
|
||||||
|
*/
|
||||||
|
static getIndent(text: string, tabFmt: TabInfo): number {
|
||||||
|
let leadingSpace: number = text.length - text.trimStart().length;
|
||||||
|
let indent: number;
|
||||||
|
|
||||||
|
if (tabFmt.hard) {
|
||||||
|
// used tabs
|
||||||
|
indent = leadingSpace;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// use spaces
|
||||||
|
indent = Math.ceil(leadingSpace / tabFmt.size!);
|
||||||
|
}
|
||||||
|
|
||||||
|
return indent;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates leading spaces for a line.
|
||||||
|
* This method uses arithmetic to calculate the number of leading spaces
|
||||||
|
*
|
||||||
|
* @param `line` The line of text.
|
||||||
|
* @return The number of leading spaces of `text`.
|
||||||
|
*/
|
||||||
|
static getLeadingSpacesByArithmetic(line: any) {
|
||||||
|
const leadingSpaces: number = line.text.length - line.text.trimStart().length;
|
||||||
|
|
||||||
|
return leadingSpaces;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates leading spaces for a line.
|
||||||
|
* This method finds the index position of the first non-whitespace character
|
||||||
|
* Since the index is built using a 0-index, the position of this character
|
||||||
|
* will equal the number of spaces preceding the character.
|
||||||
|
*
|
||||||
|
* @param `text` The line of text.
|
||||||
|
* @return The number of leading spaces of `text` with respect to the index position of the first non-whitespace character.
|
||||||
|
*/
|
||||||
|
static getLeadingSpacesByIndex(text: any) {
|
||||||
|
const indexNum: number = text.firstNonWhitespaceCharacterIndex;
|
||||||
|
|
||||||
|
return indexNum;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
exports.default = Lexer;
|
|
||||||
//# sourceMappingURL=lexer.js.map
|
|
||||||
|
Loading…
Reference in New Issue
Block a user