// modeling-app/src/tokeniser.ts
import fsp from "node:fs/promises";
const NUMBER = /^[0-9]+/;
const WHITESPACE = /\s+/;
const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/;
// regex that captures everything between two non escaped quotes and the quotes aren't captured in the match
const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/;
// regex for operators
const OPERATOR = /^[>=|<=|+|\-|*|/|>|<|^|%]/;
2022-11-12 13:33:35 +11:00
const BLOCK_START = /^\{/;
const BLOCK_END = /^\}/;
2022-11-12 13:12:20 +11:00
const PARAN_START = /^\(/;
const PARAN_END = /^\)/;
export const isNumber = (character: string) => NUMBER.test(character);
export const isWhitespace = (character: string) => WHITESPACE.test(character);
export const isWord = (character: string) => WORD.test(character);
export const isString = (character: string) => STRING.test(character);
export const isOperator = (character: string) => OPERATOR.test(character);
export const isBlockStart = (character: string) => BLOCK_START.test(character);
export const isBlockEnd = (character: string) => BLOCK_END.test(character);
export const isParanStart = (character: string) => PARAN_START.test(character);
export const isParanEnd = (character: string) => PARAN_END.test(character);
/**
 * Returns the leading text of `str` matched by `regex`.
 *
 * Callers are expected to have already checked the corresponding predicate,
 * so a failure to match is a programming error.
 *
 * @throws Error when `regex` does not match `str` at all.
 */
function matchFirst(str: string, regex: RegExp) {
  const result = regex.exec(str);
  if (result === null) {
    throw new Error("Should always be a match:" + str);
  }
  return result[0];
}
interface Token {
2022-11-12 13:33:35 +11:00
type: "number" | "word" | "operator" | "string" | "brace";
2022-11-12 13:12:20 +11:00
value: string;
}
const returnTokenAtIndex = (str: string, startIndex: number): Token | null => {
const strFromIndex = str.slice(startIndex);
if (isOperator(strFromIndex)) {
return {
type: "operator",
value: matchFirst(strFromIndex, OPERATOR),
};
}
if (isString(strFromIndex)) {
return {
type: "string",
value: matchFirst(strFromIndex, STRING),
};
}
if (isParanEnd(strFromIndex)) {
return {
type: "brace",
value: matchFirst(strFromIndex, PARAN_END),
};
}
if (isParanStart(strFromIndex)) {
return {
type: "brace",
value: matchFirst(strFromIndex, PARAN_START),
};
}
if (isBlockStart(strFromIndex)) {
return {
type: "brace",
value: matchFirst(strFromIndex, BLOCK_START),
};
}
if (isBlockEnd(strFromIndex)) {
return {
type: "brace",
value: matchFirst(strFromIndex, BLOCK_END),
};
}
if (isNumber(strFromIndex)) {
return {
type: "number",
value: matchFirst(strFromIndex, NUMBER),
};
}
if (isWord(strFromIndex)) {
return {
type: "word",
value: matchFirst(strFromIndex, WORD),
};
}
return null;
};
export const lexer = (str: string): Token[] => {
const recursivelyTokenise = (
2022-11-12 13:12:20 +11:00
str: string,
2022-11-12 13:33:35 +11:00
currentIndex: number = 0,
previousTokens: Token[] = []
): Token[] => {
if (currentIndex >= str.length) {
return previousTokens;
2022-11-12 13:12:20 +11:00
}
const token = returnTokenAtIndex(str, currentIndex);
2022-11-12 13:33:35 +11:00
if (!token) {
return recursivelyTokenise(str, currentIndex + 1, previousTokens);
2022-11-12 13:12:20 +11:00
}
2022-11-12 13:33:35 +11:00
const nextIndex = currentIndex + token.value.length;
return recursivelyTokenise(str, nextIndex, [...previousTokens, token]);
};
return recursivelyTokenise(str);
2022-11-12 13:12:20 +11:00
};
// const example1 = await fsp.readFile("./examples/addition.cado", "ascii");