diff --git a/src/tokeniser.test.ts b/src/tokeniser.test.ts
new file mode 100644
index 000000000..10ce37ea1
--- /dev/null
+++ b/src/tokeniser.test.ts
@@ -0,0 +1,194 @@
+import {
+  isBlockEnd,
+  isBlockStart,
+  isNumber,
+  isOperator,
+  isParanEnd,
+  isParanStart,
+  isString,
+  isWhitespace,
+  isWord,
+  lexer,
+} from "./tokeniser";
+
+describe("testing helpers", () => {
+  it("test is number", () => {
+    expect(isNumber("1")).toBe(true);
+    expect(isNumber("5?")).toBe(true);
+    expect(isNumber("5 + 6")).toBe(true);
+    expect(isNumber("5 + a")).toBe(true);
+
+    expect(isNumber("a")).toBe(false);
+    expect(isNumber("?")).toBe(false);
+    expect(isNumber("?5")).toBe(false);
+  });
+  it("test is whitespace", () => {
+    expect(isWhitespace(" ")).toBe(true);
+    expect(isWhitespace(" ")).toBe(true);
+    expect(isWhitespace(" a")).toBe(true);
+    expect(isWhitespace("a ")).toBe(true);
+
+    expect(isWhitespace("a")).toBe(false);
+    expect(isWhitespace("?")).toBe(false);
+  });
+  it("test is word", () => {
+    expect(isWord("a")).toBe(true);
+    expect(isWord("a ")).toBe(true);
+    expect(isWord("a5")).toBe(true);
+    expect(isWord("a5a")).toBe(true);
+
+    expect(isWord("5")).toBe(false);
+    expect(isWord("5a")).toBe(false);
+    expect(isWord("5a5")).toBe(false);
+  });
+  it("test is string", () => {
+    expect(isString('""')).toBe(true);
+    expect(isString('"a"')).toBe(true);
+    expect(isString('"a" ')).toBe(true);
+    expect(isString('"a"5')).toBe(true);
+    expect(isString("'a'5")).toBe(true);
+    expect(isString('"with escaped \\" backslash"')).toBe(true);
+
+    expect(isString('"')).toBe(false);
+    expect(isString('"a')).toBe(false);
+    expect(isString('a"')).toBe(false);
+    expect(isString(' "a"')).toBe(false);
+    expect(isString('5"a"')).toBe(false);
+  });
+  it("test is operator", () => {
+    expect(isOperator("+")).toBe(true);
+    expect(isOperator("+ ")).toBe(true);
+    expect(isOperator("-")).toBe(true);
+    expect(isOperator("<=")).toBe(true);
+    expect(isOperator("<= ")).toBe(true);
+    expect(isOperator(">=")).toBe(true);
+    expect(isOperator(">= ")).toBe(true);
+    expect(isOperator("> ")).toBe(true);
+    expect(isOperator("< ")).toBe(true);
+    expect(isOperator("| ")).toBe(true);
+    expect(isOperator("|> ")).toBe(true);
+    expect(isOperator("^ ")).toBe(true);
+    expect(isOperator("% ")).toBe(true);
+    expect(isOperator("+* ")).toBe(true);
+
+    expect(isOperator("5 + 5")).toBe(false);
+    expect(isOperator("a")).toBe(false);
+    expect(isOperator("a+")).toBe(false);
+    expect(isOperator("a+5")).toBe(false);
+    expect(isOperator("5a+5")).toBe(false);
+  });
+  it("test is paran start", () => {
+    expect(isParanStart("(")).toBe(true);
+    expect(isParanStart("( ")).toBe(true);
+    expect(isParanStart("(5")).toBe(true);
+    expect(isParanStart("(5 ")).toBe(true);
+    expect(isParanStart("(5 + 5")).toBe(true);
+    expect(isParanStart("(5 + 5)")).toBe(true);
+    expect(isParanStart("(5 + 5) ")).toBe(true);
+
+    expect(isParanStart("5")).toBe(false);
+    expect(isParanStart("5 + 5")).toBe(false);
+    expect(isParanStart("5( + 5)")).toBe(false);
+    expect(isParanStart(" ( + 5)")).toBe(false);
+  });
+  it("test is paran end", () => {
+    expect(isParanEnd(")")).toBe(true);
+    expect(isParanEnd(") ")).toBe(true);
+    expect(isParanEnd(")5")).toBe(true);
+    expect(isParanEnd(")5 ")).toBe(true);
+
+    expect(isParanEnd("5")).toBe(false);
+    expect(isParanEnd("5 + 5")).toBe(false);
+    expect(isParanEnd("5) + 5")).toBe(false);
+    expect(isParanEnd(" ) + 5")).toBe(false);
+  });
+  it("test is block start", () => {
+    expect(isBlockStart("{")).toBe(true);
+    expect(isBlockStart("{ ")).toBe(true);
+    expect(isBlockStart("{5")).toBe(true);
+    expect(isBlockStart("{a")).toBe(true);
+    expect(isBlockStart("{5 ")).toBe(true);
+
+    expect(isBlockStart("5")).toBe(false);
+    expect(isBlockStart("5 + 5")).toBe(false);
+    expect(isBlockStart("5{ + 5")).toBe(false);
+    expect(isBlockStart("a{ + 5")).toBe(false);
+    expect(isBlockStart(" { + 5")).toBe(false);
+  });
+  it("test is block end", () => {
+    expect(isBlockEnd("}")).toBe(true);
+    expect(isBlockEnd("} ")).toBe(true);
+    expect(isBlockEnd("}5")).toBe(true);
+    expect(isBlockEnd("}5 ")).toBe(true);
+
+    expect(isBlockEnd("5")).toBe(false);
+    expect(isBlockEnd("5 + 5")).toBe(false);
+    expect(isBlockEnd("5} + 5")).toBe(false);
+    expect(isBlockEnd(" } + 5")).toBe(false);
+  });
+
+});
+
+describe("testing lexer", () => {
+  it("test lexer", () => {
+    expect(lexer("1 + 2")).toEqual([
+      { type: "number", value: "1" },
+      { type: "operator", value: "+" },
+      { type: "number", value: "2" },
+    ]);
+    expect(lexer("54 + 22500 + 6")).toEqual([
+      { type: "number", value: "54" },
+      { type: "operator", value: "+" },
+      { type: "number", value: "22500" },
+      { type: "operator", value: "+" },
+      { type: "number", value: "6" },
+    ]);
+    expect(lexer("a + bo + t5 - 6")).toEqual([
+      { type: "word", value: "a" },
+      { type: "operator", value: "+" },
+      { type: "word", value: "bo" },
+      { type: "operator", value: "+" },
+      { type: "word", value: "t5" },
+      { type: "operator", value: "-" },
+      { type: "number", value: "6" },
+    ]);
+    expect(lexer('a + "a str" - 6')).toEqual([
+      { type: "word", value: "a" },
+      { type: "operator", value: "+" },
+      { type: "string", value: '"a str"' },
+      { type: "operator", value: "-" },
+      { type: "number", value: "6" },
+    ]);
+    const sameWithOrWithoutWhiteSpaces = [
+      { type: "word", value: "a" },
+      { type: "operator", value: "+" },
+      { type: "string", value: "'str'" },
+    ];
+    expect(lexer("a + 'str'")).toEqual(sameWithOrWithoutWhiteSpaces);
+    expect(lexer("a +'str'")).toEqual(sameWithOrWithoutWhiteSpaces);
+
+    expect(lexer("a + (sick)")).toEqual([
+      { type: "word", value: "a" },
+      { type: "operator", value: "+" },
+      { type: "brace", value: "(" },
+      { type: "word", value: "sick" },
+      { type: "brace", value: ")" },
+    ]);
+
+    expect(lexer("a + {sick}")).toEqual([
+      { type: "word", value: "a" },
+      { type: "operator", value: "+" },
+      { type: "brace", value: "{" },
+      { type: "word", value: "sick" },
+      { type: "brace", value: "}" },
+    ]);
+
+    expect(lexer("a <= b |> c")).toEqual([
+      { type: "word", value: "a" },
+      { type: "operator", value: "<=" },
+      { type: "word", value: "b" },
+      { type: "operator", value: "|>" },
+      { type: "word", value: "c" },
+    ]);
+  });
+});
diff --git a/src/tokeniser.ts b/src/tokeniser.ts
new file mode 100644
index 000000000..44aef8c3b
--- /dev/null
+++ b/src/tokeniser.ts
@@ -0,0 +1,113 @@
+import fsp from "node:fs/promises";
+
+// Token patterns. Every pattern except WHITESPACE is anchored with `^`, so it
+// only matches at the very start of the string it is tested against.
+const NUMBER = /^[0-9]+/;
+// Unanchored: matches a run of whitespace anywhere in the input. The unit tests
+// rely on this (isWhitespace("a ") is expected to be true).
+const WHITESPACE = /\s+/;
+const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/;
+// A single- or double-quoted string literal. The match includes the surrounding
+// quotes; a backslash escapes the character after it, so an escaped quote does
+// not end the literal.
+const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/;
+// Operators. The two-character operators come first so that each is consumed
+// as one token; the character class is the single-character fallback. Note that
+// `|` inside a character class is a literal, not alternation, so multi-character
+// operators have to be spelled out as alternatives like this.
+const OPERATOR = /^(?:>=|<=|\|>|[+\-*\/%^<>=|])/;
+const BLOCK_START = /^\{/;
+const BLOCK_END = /^\}/;
+const PARAN_START = /^\(/;
+const PARAN_END = /^\)/;
+
+// Predicates: each reports whether `character` (in practice the remaining,
+// unconsumed input rather than a single character) starts with a token of that
+// kind. isWhitespace is the exception: it is true if the input contains
+// whitespace anywhere.
+export const isNumber = (character: string) => NUMBER.test(character);
+export const isWhitespace = (character: string) => WHITESPACE.test(character);
+export const isWord = (character: string) => WORD.test(character);
+export const isString = (character: string) => STRING.test(character);
+export const isOperator = (character: string) => OPERATOR.test(character);
+export const isBlockStart = (character: string) => BLOCK_START.test(character);
+export const isBlockEnd = (character: string) => BLOCK_END.test(character);
+export const isParanStart = (character: string) => PARAN_START.test(character);
+export const isParanEnd = (character: string) => PARAN_END.test(character);
+
+/** The kinds of token the lexer can emit. */
+export type TokenType = "number" | "word" | "operator" | "string" | "brace";
+
+/** A lexed token: its kind plus the exact source text it was read from. */
+export interface Token {
+  type: TokenType;
+  value: string;
+}
+
+// Rules are tried in order and the first pattern that matches wins.
+const TOKEN_RULES: ReadonlyArray<readonly [TokenType, RegExp]> = [
+  ["operator", OPERATOR],
+  ["string", STRING],
+  ["brace", PARAN_END],
+  ["brace", PARAN_START],
+  ["brace", BLOCK_START],
+  ["brace", BLOCK_END],
+  ["number", NUMBER],
+  ["word", WORD],
+];
+
+/**
+ * Reads the token that starts exactly at `startIndex`.
+ *
+ * @returns the token, or `null` when no rule matches at that position
+ *   (whitespace, or a character no pattern recognises).
+ */
+const tokenAt = (str: string, startIndex: number): Token | null => {
+  const rest = str.slice(startIndex);
+  for (const [type, pattern] of TOKEN_RULES) {
+    // One exec both tests the rule and yields the matched text.
+    const match = pattern.exec(rest);
+    if (match) {
+      return { type, value: match[0] };
+    }
+  }
+  return null;
+};
+
+/**
+ * Splits `str` into a flat list of tokens.
+ *
+ * Characters that do not start any token (whitespace included) are skipped
+ * without producing a token or raising an error.
+ *
+ * @param str source text to tokenise
+ * @returns the tokens in source order; empty for empty input
+ */
+export const lexer = (str: string): Token[] => {
+  const tokens: Token[] = [];
+  let currentIndex = 0;
+
+  while (currentIndex < str.length) {
+    const token = tokenAt(str, currentIndex);
+    if (token) {
+      tokens.push(token);
+      // Every pattern consumes at least one character, so this always advances.
+      currentIndex += token.value.length;
+    } else {
+      currentIndex++;
+    }
+  }
+
+  return tokens;
+};
+
+// Manual smoke test. Not invoked: the call at the bottom is commented out.
+async function main(): Promise<void> {
+  // "utf8" rather than "ascii": Node's "ascii" decoding clears the high bit of
+  // every byte, which would corrupt any non-ASCII character in the source.
+  const example1 = await fsp.readFile("./examples/addition.cado", "utf8");
+  const parsed = lexer(example1);
+  console.log(parsed);
+}
+
+// main()