start of tokeniser
This commit is contained in:
		
							
								
								
									
										186
									
								
								src/tokeniser.test.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										186
									
								
								src/tokeniser.test.ts
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,186 @@
 | 
			
		||||
import {
 | 
			
		||||
  isBlockEnd,
 | 
			
		||||
  isBlockStart,
 | 
			
		||||
  isNumber,
 | 
			
		||||
  isOperator,
 | 
			
		||||
  isParanEnd,
 | 
			
		||||
  isParanStart,
 | 
			
		||||
  isString,
 | 
			
		||||
  isWhitespace,
 | 
			
		||||
  isWord,
 | 
			
		||||
  lexer,
 | 
			
		||||
} from "./tokeniser";
 | 
			
		||||
 | 
			
		||||
describe("testing helpers", () => {
 | 
			
		||||
  it("test is number", () => {
 | 
			
		||||
    expect(isNumber("1")).toBe(true);
 | 
			
		||||
    expect(isNumber("5?")).toBe(true);
 | 
			
		||||
    expect(isNumber("5 + 6")).toBe(true);
 | 
			
		||||
    expect(isNumber("5 + a")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isNumber("a")).toBe(false);
 | 
			
		||||
    expect(isNumber("?")).toBe(false);
 | 
			
		||||
    expect(isNumber("?5")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is whitespace", () => {
 | 
			
		||||
    expect(isWhitespace(" ")).toBe(true);
 | 
			
		||||
    expect(isWhitespace("  ")).toBe(true);
 | 
			
		||||
    expect(isWhitespace(" a")).toBe(true);
 | 
			
		||||
    expect(isWhitespace("a ")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isWhitespace("a")).toBe(false);
 | 
			
		||||
    expect(isWhitespace("?")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is word", () => {
 | 
			
		||||
    expect(isWord("a")).toBe(true);
 | 
			
		||||
    expect(isWord("a ")).toBe(true);
 | 
			
		||||
    expect(isWord("a5")).toBe(true);
 | 
			
		||||
    expect(isWord("a5a")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isWord("5")).toBe(false);
 | 
			
		||||
    expect(isWord("5a")).toBe(false);
 | 
			
		||||
    expect(isWord("5a5")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is string", () => {
 | 
			
		||||
    expect(isString('""')).toBe(true);
 | 
			
		||||
    expect(isString('"a"')).toBe(true);
 | 
			
		||||
    expect(isString('"a" ')).toBe(true);
 | 
			
		||||
    expect(isString('"a"5')).toBe(true);
 | 
			
		||||
    expect(isString("'a'5")).toBe(true);
 | 
			
		||||
    expect(isString('"with escaped \\" backslash"')).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isString('"')).toBe(false);
 | 
			
		||||
    expect(isString('"a')).toBe(false);
 | 
			
		||||
    expect(isString('a"')).toBe(false);
 | 
			
		||||
    expect(isString(' "a"')).toBe(false);
 | 
			
		||||
    expect(isString('5"a"')).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is operator", () => {
 | 
			
		||||
    expect(isOperator("+")).toBe(true);
 | 
			
		||||
    expect(isOperator("+ ")).toBe(true);
 | 
			
		||||
    expect(isOperator("-")).toBe(true);
 | 
			
		||||
    expect(isOperator("<=")).toBe(true);
 | 
			
		||||
    expect(isOperator("<= ")).toBe(true);
 | 
			
		||||
    expect(isOperator(">=")).toBe(true);
 | 
			
		||||
    expect(isOperator(">= ")).toBe(true);
 | 
			
		||||
    expect(isOperator("> ")).toBe(true);
 | 
			
		||||
    expect(isOperator("< ")).toBe(true);
 | 
			
		||||
    expect(isOperator("| ")).toBe(true);
 | 
			
		||||
    expect(isOperator("|> ")).toBe(true);
 | 
			
		||||
    expect(isOperator("^ ")).toBe(true);
 | 
			
		||||
    expect(isOperator("% ")).toBe(true);
 | 
			
		||||
    expect(isOperator("+* ")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isOperator("5 + 5")).toBe(false);
 | 
			
		||||
    expect(isOperator("a")).toBe(false);
 | 
			
		||||
    expect(isOperator("a+")).toBe(false);
 | 
			
		||||
    expect(isOperator("a+5")).toBe(false);
 | 
			
		||||
    expect(isOperator("5a+5")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is paran start", () => {
 | 
			
		||||
    expect(isParanStart("(")).toBe(true);
 | 
			
		||||
    expect(isParanStart("( ")).toBe(true);
 | 
			
		||||
    expect(isParanStart("(5")).toBe(true);
 | 
			
		||||
    expect(isParanStart("(5 ")).toBe(true);
 | 
			
		||||
    expect(isParanStart("(5 + 5")).toBe(true);
 | 
			
		||||
    expect(isParanStart("(5 + 5)")).toBe(true);
 | 
			
		||||
    expect(isParanStart("(5 + 5) ")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isParanStart("5")).toBe(false);
 | 
			
		||||
    expect(isParanStart("5 + 5")).toBe(false);
 | 
			
		||||
    expect(isParanStart("5( + 5)")).toBe(false);
 | 
			
		||||
    expect(isParanStart(" ( + 5)")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is paran end", () => {
 | 
			
		||||
    expect(isParanEnd(")")).toBe(true);
 | 
			
		||||
    expect(isParanEnd(") ")).toBe(true);
 | 
			
		||||
    expect(isParanEnd(")5")).toBe(true);
 | 
			
		||||
    expect(isParanEnd(")5 ")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isParanEnd("5")).toBe(false);
 | 
			
		||||
    expect(isParanEnd("5 + 5")).toBe(false);
 | 
			
		||||
    expect(isParanEnd("5) + 5")).toBe(false);
 | 
			
		||||
    expect(isParanEnd(" ) + 5")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is block start", () => {
 | 
			
		||||
    expect(isBlockStart("{")).toBe(true);
 | 
			
		||||
    expect(isBlockStart("{ ")).toBe(true);
 | 
			
		||||
    expect(isBlockStart("{5")).toBe(true);
 | 
			
		||||
    expect(isBlockStart("{a")).toBe(true);
 | 
			
		||||
    expect(isBlockStart("{5 ")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isBlockStart("5")).toBe(false);
 | 
			
		||||
    expect(isBlockStart("5 + 5")).toBe(false);
 | 
			
		||||
    expect(isBlockStart("5{ + 5")).toBe(false);
 | 
			
		||||
    expect(isBlockStart("a{ + 5")).toBe(false);
 | 
			
		||||
    expect(isBlockStart(" { + 5")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
  it("test is block end", () => {
 | 
			
		||||
    expect(isBlockEnd("}")).toBe(true);
 | 
			
		||||
    expect(isBlockEnd("} ")).toBe(true);
 | 
			
		||||
    expect(isBlockEnd("}5")).toBe(true);
 | 
			
		||||
    expect(isBlockEnd("}5 ")).toBe(true);
 | 
			
		||||
 | 
			
		||||
    expect(isBlockEnd("5")).toBe(false);
 | 
			
		||||
    expect(isBlockEnd("5 + 5")).toBe(false);
 | 
			
		||||
    expect(isBlockEnd("5} + 5")).toBe(false);
 | 
			
		||||
    expect(isBlockEnd(" } + 5")).toBe(false);
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
});
 | 
			
		||||
 | 
			
		||||
describe("testing lexer", () => {
 | 
			
		||||
  it("test lexer", () => {
 | 
			
		||||
    expect(lexer("1 + 2")).toEqual([
 | 
			
		||||
      { type: "number", value: "1" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "number", value: "2" },
 | 
			
		||||
    ]);
 | 
			
		||||
    expect(lexer("54 + 22500 + 6")).toEqual([
 | 
			
		||||
      { type: "number", value: "54" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "number", value: "22500" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "number", value: "6" },
 | 
			
		||||
    ]);
 | 
			
		||||
    expect(lexer("a + bo + t5 - 6")).toEqual([
 | 
			
		||||
      { type: "word", value: "a" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "word", value: "bo" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "word", value: "t5" },
 | 
			
		||||
      { type: "operator", value: "-" },
 | 
			
		||||
      { type: "number", value: "6" },
 | 
			
		||||
    ]);
 | 
			
		||||
    expect(lexer('a + "a str" - 6')).toEqual([
 | 
			
		||||
      { type: "word", value: "a" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "string", value: '"a str"' },
 | 
			
		||||
      { type: "operator", value: "-" },
 | 
			
		||||
      { type: "number", value: "6" },
 | 
			
		||||
    ]);
 | 
			
		||||
    const sameWithOrWithoutWhiteSpaces = [
 | 
			
		||||
      { type: "word", value: "a" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "string", value: "'str'" },
 | 
			
		||||
    ];
 | 
			
		||||
    expect(lexer("a + 'str'")).toEqual(sameWithOrWithoutWhiteSpaces);
 | 
			
		||||
    expect(lexer("a +'str'")).toEqual(sameWithOrWithoutWhiteSpaces);
 | 
			
		||||
 | 
			
		||||
    expect(lexer("a + (sick)")).toEqual([
 | 
			
		||||
      { type: "word", value: "a" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "brace", value: "(" },
 | 
			
		||||
      { type: "word", value: "sick" },
 | 
			
		||||
      { type: "brace", value: ")" },
 | 
			
		||||
    ]);
 | 
			
		||||
 | 
			
		||||
    expect(lexer("a + {sick}")).toEqual([
 | 
			
		||||
      { type: "word", value: "a" },
 | 
			
		||||
      { type: "operator", value: "+" },
 | 
			
		||||
      { type: "brace", value: "{" },
 | 
			
		||||
      { type: "word", value: "sick" },
 | 
			
		||||
      { type: "brace", value: "}" },
 | 
			
		||||
    ]);
 | 
			
		||||
  });
 | 
			
		||||
});
 | 
			
		||||
							
								
								
									
										119
									
								
								src/tokeniser.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										119
									
								
								src/tokeniser.ts
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,119 @@
 | 
			
		||||
import fsp from "node:fs/promises";
 | 
			
		||||
 | 
			
		||||
const NUMBER = /^[0-9]+/;
 | 
			
		||||
const WHITESPACE = /\s+/;
 | 
			
		||||
const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/;
 | 
			
		||||
// regex that captures everything between two non escaped quotes and the quotes aren't captured in the match
 | 
			
		||||
const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/;
 | 
			
		||||
// regex for operators
 | 
			
		||||
const OPERATOR = /^[>=|<=|+|\-|*|/|>|<|^|%]/;
 | 
			
		||||
const BLOCK_START = /^\{/
 | 
			
		||||
const BLOCK_END = /^\}/
 | 
			
		||||
const PARAN_START = /^\(/;
 | 
			
		||||
const PARAN_END = /^\)/;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
export const isNumber = (character: string) => NUMBER.test(character);
 | 
			
		||||
export const isWhitespace = (character: string) => WHITESPACE.test(character);
 | 
			
		||||
export const isWord = (character: string) => WORD.test(character);
 | 
			
		||||
export const isString = (character: string) => STRING.test(character);
 | 
			
		||||
export const isOperator = (character: string) => OPERATOR.test(character);
 | 
			
		||||
export const isBlockStart = (character: string) => BLOCK_START.test(character);
 | 
			
		||||
export const isBlockEnd = (character: string) => BLOCK_END.test(character);
 | 
			
		||||
export const isParanStart = (character: string) => PARAN_START.test(character);
 | 
			
		||||
export const isParanEnd = (character: string) => PARAN_END.test(character);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
function matchFirst(str: string, regex: RegExp) {
 | 
			
		||||
  const theMatch = str.match(regex);
 | 
			
		||||
  if (!theMatch) {
 | 
			
		||||
    throw new Error("Should always be a match:" + str);
 | 
			
		||||
  }
 | 
			
		||||
  return theMatch[0];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// type TokenTypes = 
 | 
			
		||||
interface Token {
 | 
			
		||||
  type: 'number' | 'word' | 'operator' | 'string' | 'brace'
 | 
			
		||||
  value: string;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export const lexer = (str: string): Token[] => {
 | 
			
		||||
  const tokens: Token[] = [];
 | 
			
		||||
  let currentIndex = 0;
 | 
			
		||||
 | 
			
		||||
  const returnTokenAtIndex = (
 | 
			
		||||
    str: string,
 | 
			
		||||
    startIndex: number
 | 
			
		||||
  ): Token | null => {
 | 
			
		||||
    const strFromIndex = str.slice(startIndex);
 | 
			
		||||
    if (isOperator(strFromIndex)) {
 | 
			
		||||
      return {
 | 
			
		||||
        type: "operator",
 | 
			
		||||
        value: matchFirst(strFromIndex, OPERATOR),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if (isString(strFromIndex)) {
 | 
			
		||||
      return {
 | 
			
		||||
        type: "string",
 | 
			
		||||
        value: matchFirst(strFromIndex, STRING),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if(isParanEnd(strFromIndex)){
 | 
			
		||||
      return {
 | 
			
		||||
        type: "brace",
 | 
			
		||||
        value: matchFirst(strFromIndex, PARAN_END),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if(isParanStart(strFromIndex)){
 | 
			
		||||
      return {
 | 
			
		||||
        type: "brace",
 | 
			
		||||
        value: matchFirst(strFromIndex, PARAN_START),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if(isBlockStart(strFromIndex)){
 | 
			
		||||
      return {
 | 
			
		||||
        type: "brace",
 | 
			
		||||
        value: matchFirst(strFromIndex, BLOCK_START),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if(isBlockEnd(strFromIndex)){
 | 
			
		||||
      return {
 | 
			
		||||
        type: "brace",
 | 
			
		||||
        value: matchFirst(strFromIndex, BLOCK_END),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if (isNumber(strFromIndex)) {
 | 
			
		||||
      return {
 | 
			
		||||
        type: "number",
 | 
			
		||||
        value: matchFirst(strFromIndex, NUMBER),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    if(isWord(strFromIndex)) {
 | 
			
		||||
      return {
 | 
			
		||||
        type: "word",
 | 
			
		||||
        value: matchFirst(strFromIndex, WORD),
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
    return null;
 | 
			
		||||
  };
 | 
			
		||||
  while (currentIndex < str.length) {
 | 
			
		||||
    const token = returnTokenAtIndex(str, currentIndex);
 | 
			
		||||
    if (token) {
 | 
			
		||||
      tokens.push(token);
 | 
			
		||||
      currentIndex += token.value.length;
 | 
			
		||||
    } else {
 | 
			
		||||
      currentIndex++;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return tokens
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
async function main() {
 | 
			
		||||
  const example1 = await fsp.readFile("./examples/addition.cado", "ascii");
 | 
			
		||||
  const parsed = lexer(example1);
 | 
			
		||||
  console.log(parsed);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// main()
 | 
			
		||||
		Reference in New Issue
	
	Block a user