modeling-app/src/lang/tokeniser.ts

// Token patterns. Each one is anchored with `^` so that it can only match at
// the very beginning of the text it is tested against.

// An integer or decimal number with an optional leading minus sign,
// e.g. `5`, `-5`, `1.25`, `-0.5`
const NUMBER = /^-?\d+(\.\d+)?/

// One or more whitespace characters. The `^` anchor is essential: without it
// the pattern would also succeed on whitespace found later in the text, and a
// token would be reported at a position where there is no whitespace.
const WHITESPACE = /^\s+/
// An identifier: a letter or underscore followed by letters, digits or underscores
const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/
// A single- or double-quoted string. A backslash together with the character
// after it is consumed as a unit, so an escaped quote does not end the string.
// The full match (index 0) includes the surrounding quotes.
const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/
// Operators. Multiple character operators need to come first in the
// alternation, otherwise e.g. `>=` would match as just `>`.
const OPERATOR = /^(>=|<=|==|=>|!=|\*|\+|-|\/|%|=|<|>|\||\^)/

// Single-character punctuation
const BLOCK_START = /^\{/
const BLOCK_END = /^\}/
const PARAN_START = /^\(/
const PARAN_END = /^\)/
const COMMA = /^,/

/** Tests `character` against the NUMBER pattern. */
export function isNumber(character: string): boolean {
  return NUMBER.test(character)
}

/** Tests `character` against the WHITESPACE pattern. */
export function isWhitespace(character: string): boolean {
  return WHITESPACE.test(character)
}

/** Tests `character` against the WORD pattern. */
export function isWord(character: string): boolean {
  return WORD.test(character)
}

/** Tests `character` against the STRING pattern. */
export function isString(character: string): boolean {
  return STRING.test(character)
}

/** Tests `character` against the OPERATOR pattern. */
export function isOperator(character: string): boolean {
  return OPERATOR.test(character)
}

/** Tests `character` against the BLOCK_START pattern (`{`). */
export function isBlockStart(character: string): boolean {
  return BLOCK_START.test(character)
}

/** Tests `character` against the BLOCK_END pattern (`}`). */
export function isBlockEnd(character: string): boolean {
  return BLOCK_END.test(character)
}

/** Tests `character` against the PARAN_START pattern (`(`). */
export function isParanStart(character: string): boolean {
  return PARAN_START.test(character)
}

/** Tests `character` against the PARAN_END pattern (`)`). */
export function isParanEnd(character: string): boolean {
  return PARAN_END.test(character)
}

/** Tests `character` against the COMMA pattern (`,`). */
export function isComma(character: string): boolean {
  return COMMA.test(character)
}

/**
 * Returns the text of the first match of `regex` within `str`.
 *
 * Callers are expected to have already checked that the pattern matches.
 *
 * @param str - text to search
 * @param regex - pattern to apply to `str`
 * @returns element 0 of the array returned by `str.match(regex)`
 * @throws Error if `regex` does not match `str`
 */
function matchFirst(str: string, regex: RegExp) {
  const result = str.match(regex)
  if (result === null) {
    throw new Error('Should always be a match:' + str)
  }
  return result[0]
}

/**
 * A single lexical token together with its position in the source text.
 */
export interface Token {
  /**
   * Category of the token. `'brace'` is used for all four of `(`, `)`, `{`
   * and `}`.
   */
  type:
    | 'brace'
    | 'comma'
    | 'number'
    | 'operator'
    | 'string'
    | 'whitespace'
    | 'word'
  /** The exact source text that was matched for this token. */
  value: string
  /** Index in the source text at which the token begins. */
  start: number
  /** `start` plus the length of `value`, i.e. the index just past the token. */
  end: number
}

/**
 * Builds a Token, deriving `end` from the start position and the length of
 * the matched text.
 *
 * @param type - category of the token
 * @param value - the matched source text
 * @param start - index in the source text at which `value` begins
 * @returns a Token whose `end` is `start + value.length`
 */
const makeToken = (
  type: Token['type'],
  value: string,
  start: number
): Token => {
  const end = start + value.length
  return { type, value, start, end }
}

/**
 * Attempts to read one token from `str`, looking at the text from
 * `startIndex` onwards.
 *
 * The patterns are tried in a fixed priority order and the first one that
 * matches decides the token.
 *
 * @param str - the complete source text
 * @param startIndex - index in `str` to read from; used as the token's `start`
 * @returns the token, or `null` when none of the patterns match
 */
const returnTokenAtIndex = (str: string, startIndex: number): Token | null => {
  const remaining = str.slice(startIndex)

  // Order matters: NUMBER is tried before OPERATOR so that `-5` becomes a
  // single number token instead of a `-` operator followed by `5`.
  const candidates: Array<[Token['type'], RegExp]> = [
    ['string', STRING],
    ['brace', PARAN_END],
    ['brace', PARAN_START],
    ['brace', BLOCK_START],
    ['brace', BLOCK_END],
    ['comma', COMMA],
    ['number', NUMBER],
    ['operator', OPERATOR],
    ['word', WORD],
    ['whitespace', WHITESPACE],
  ]

  for (const [type, pattern] of candidates) {
    if (pattern.test(remaining)) {
      return makeToken(type, matchFirst(remaining, pattern), startIndex)
    }
  }
  return null
}

/**
 * Splits `str` into a flat list of tokens, in source order.
 *
 * Any position at which no token can be read is skipped one character at a
 * time, so unrecognised characters simply do not appear in the output.
 *
 * Implemented as a loop rather than recursion: one stack frame per token would
 * overflow the call stack on long inputs, and copying the accumulated array on
 * every step would make the whole pass quadratic.
 *
 * @param str - the source text to tokenise
 * @returns the tokens found in `str`; empty when `str` is empty
 */
export const lexer = (str: string): Token[] => {
  const tokens: Token[] = []
  let currentIndex = 0
  while (currentIndex < str.length) {
    const token = returnTokenAtIndex(str, currentIndex)
    if (!token) {
      // nothing recognisable here, step over a single character
      currentIndex += 1
      continue
    }
    tokens.push(token)
    currentIndex += token.value.length
  }
  return tokens
}
tokeniser should handle negative and decimal place numbers 2022-11-26 20:38:07 +11:00			`// regular expression for number that includes a decimal point or starts with a minus sign`
			`const NUMBER = /^-?\d+(\.\d+)?/`

remove semi-colons 2022-11-26 08:34:23 +11:00			`const WHITESPACE = /\s+/`
			`const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/`
start of tokeniser 2022-11-12 13:12:20 +11:00			`// regex that captures everything between two non escaped quotes and the quotes aren't captured in the match`
remove semi-colons 2022-11-26 08:34:23 +11:00			`const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/`
fix operator regex 2022-11-17 16:05:14 +11:00			`// verbose regex for finding operators, multiple character operators need to be first`
remove semi-colons 2022-11-26 08:34:23 +11:00			`const OPERATOR = /^(>=\|<=\|==\|=>\|!=\|\*\|\+\|-\|\/\|%\|=\|<\|>\|\\|\|\^)/`
fix operator regex 2022-11-17 16:05:14 +11:00
remove semi-colons 2022-11-26 08:34:23 +11:00			`const BLOCK_START = /^\{/`
			`const BLOCK_END = /^\}/`
			`const PARAN_START = /^\(/`
			`const PARAN_END = /^\)/`
			`const COMMA = /^,/`
start of tokeniser 2022-11-12 13:12:20 +11:00
remove semi-colons 2022-11-26 08:34:23 +11:00			`export const isNumber = (character: string) => NUMBER.test(character)`
			`export const isWhitespace = (character: string) => WHITESPACE.test(character)`
			`export const isWord = (character: string) => WORD.test(character)`
			`export const isString = (character: string) => STRING.test(character)`
			`export const isOperator = (character: string) => OPERATOR.test(character)`
			`export const isBlockStart = (character: string) => BLOCK_START.test(character)`
			`export const isBlockEnd = (character: string) => BLOCK_END.test(character)`
			`export const isParanStart = (character: string) => PARAN_START.test(character)`
			`export const isParanEnd = (character: string) => PARAN_END.test(character)`
			`export const isComma = (character: string) => COMMA.test(character)`
start of tokeniser 2022-11-12 13:12:20 +11:00
			`function matchFirst(str: string, regex: RegExp) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`const theMatch = str.match(regex)`
start of tokeniser 2022-11-12 13:12:20 +11:00			`if (!theMatch) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`throw new Error('Should always be a match:' + str)`
start of tokeniser 2022-11-12 13:12:20 +11:00			`}`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return theMatch[0]`
start of tokeniser 2022-11-12 13:12:20 +11:00			`}`

add white spacetoken 2022-11-12 17:47:41 +11:00			`export interface Token {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`type:`
			`\| 'number'`
			`\| 'word'`
			`\| 'operator'`
			`\| 'string'`
			`\| 'brace'`
			`\| 'whitespace'`
			`\| 'comma'`
			`value: string`
			`start: number`
			`end: number`
start of tokeniser 2022-11-12 13:12:20 +11:00			`}`

remove semi-colons 2022-11-26 08:34:23 +11:00			`const makeToken = (`
			`type: Token['type'],`
			`value: string,`
			`start: number`
			`): Token => ({`
add white spacetoken 2022-11-12 17:47:41 +11:00			`type,`
			`value,`
			`start,`
			`end: start + value.length,`
			`})`

refactor to remove mutation 2022-11-12 13:33:35 +11:00			`const returnTokenAtIndex = (str: string, startIndex: number): Token \| null => {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`const strFromIndex = str.slice(startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`if (isString(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('string', matchFirst(strFromIndex, STRING), startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
			`if (isParanEnd(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('brace', matchFirst(strFromIndex, PARAN_END), startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
			`if (isParanStart(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('brace', matchFirst(strFromIndex, PARAN_START), startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
			`if (isBlockStart(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('brace', matchFirst(strFromIndex, BLOCK_START), startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
			`if (isBlockEnd(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('brace', matchFirst(strFromIndex, BLOCK_END), startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
Add callee expression parsing 2022-11-14 13:28:16 +11:00			`if (isComma(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('comma', matchFirst(strFromIndex, COMMA), startIndex)`
Add callee expression parsing 2022-11-14 13:28:16 +11:00			`}`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`if (isNumber(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('number', matchFirst(strFromIndex, NUMBER), startIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
tokeniser should handle negative and decimal place numbers 2022-11-26 20:38:07 +11:00			`if (isOperator(strFromIndex)) {`
			`return makeToken('operator', matchFirst(strFromIndex, OPERATOR), startIndex)`
			`}`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`if (isWord(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken('word', matchFirst(strFromIndex, WORD), startIndex)`
add white spacetoken 2022-11-12 17:47:41 +11:00			`}`
			`if (isWhitespace(strFromIndex)) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return makeToken(`
			`'whitespace',`
			`matchFirst(strFromIndex, WHITESPACE),`
			`startIndex`
			`)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`}`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return null`
			`}`
start of tokeniser 2022-11-12 13:12:20 +11:00
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`export const lexer = (str: string): Token[] => {`
			`const recursivelyTokenise = (`
start of tokeniser 2022-11-12 13:12:20 +11:00			`str: string,`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`currentIndex: number = 0,`
			`previousTokens: Token[] = []`
			`): Token[] => {`
			`if (currentIndex >= str.length) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return previousTokens`
start of tokeniser 2022-11-12 13:12:20 +11:00			`}`
remove semi-colons 2022-11-26 08:34:23 +11:00			`const token = returnTokenAtIndex(str, currentIndex)`
refactor to remove mutation 2022-11-12 13:33:35 +11:00			`if (!token) {`
remove semi-colons 2022-11-26 08:34:23 +11:00			`return recursivelyTokenise(str, currentIndex + 1, previousTokens)`
start of tokeniser 2022-11-12 13:12:20 +11:00			`}`
remove semi-colons 2022-11-26 08:34:23 +11:00			`const nextIndex = currentIndex + token.value.length`
			`return recursivelyTokenise(str, nextIndex, [...previousTokens, token])`
			`}`
			`return recursivelyTokenise(str)`
			`}`