// modeling-app/src/lang/tokeniser.ts

// Token patterns. Every pattern except WHITESPACE is anchored with `^`, so it
// only matches at the very start of the string it is tested against.

// An integer or decimal literal with an optional leading minus sign,
// e.g. `5`, `-5`, `3.14`, `-0.5`.
const NUMBER = /^-?\d+(\.\d+)?/
// A run of one or more whitespace characters.
// NOTE: this pattern is NOT anchored, so it matches whitespace anywhere in the
// input, not only at the start.
const WHITESPACE = /\s+/
// An identifier: a letter or underscore followed by letters, digits or underscores.
const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/
// A single- or double-quoted string literal. The full match (index 0) INCLUDES
// the surrounding quotes. `(?=(\\?))\2.` consumes an optional backslash together
// with the character after it, so an escaped quote does not end the string.
// Because `.` does not match line terminators, a literal cannot span lines.
const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/
// Operators. Multi-character operators are listed first so that, for example,
// `>=` is not split into `>` followed by `=`.
const OPERATOR = /^(>=|<=|==|=>|!=|\|>|\*|\+|-|\/|%|=|<|>|\||\^)/

const BLOCK_START = /^\{/
const BLOCK_END = /^\}/
const PARAN_START = /^\(/
const PARAN_END = /^\)/
const ARRAY_START = /^\[/
const ARRAY_END = /^\]/
const COMMA = /^,/
const COLON = /^:/
// Predicates over the module-level token patterns. Each one reports whether
// its pattern matches the given string. Despite the parameter name, the
// argument may be longer than one character: the lexer in this file passes the
// whole remaining input.

export const isNumber = (character: string): boolean => NUMBER.test(character)

// WHITESPACE is not anchored, so this returns true when whitespace occurs
// anywhere in the string, not only at its start.
export const isWhitespace = (character: string): boolean =>
  WHITESPACE.test(character)

export const isWord = (character: string): boolean => WORD.test(character)

export const isString = (character: string): boolean => STRING.test(character)

export const isOperator = (character: string): boolean =>
  OPERATOR.test(character)

export const isBlockStart = (character: string): boolean =>
  BLOCK_START.test(character)

export const isBlockEnd = (character: string): boolean =>
  BLOCK_END.test(character)

export const isParanStart = (character: string): boolean =>
  PARAN_START.test(character)

export const isParanEnd = (character: string): boolean =>
  PARAN_END.test(character)

export const isArrayStart = (character: string): boolean =>
  ARRAY_START.test(character)

export const isArrayEnd = (character: string): boolean =>
  ARRAY_END.test(character)

export const isComma = (character: string): boolean => COMMA.test(character)

export const isColon = (character: string): boolean => COLON.test(character)
/**
 * Returns the text of the first match of `regex` in `str`.
 *
 * Callers are expected to have already checked that the pattern matches, so a
 * missing match is treated as a programming error.
 *
 * @param str - The string to search.
 * @param regex - The pattern to apply.
 * @returns The full matched substring (index 0 of the match result).
 * @throws Error if `regex` does not match `str`.
 */
function matchFirst(str: string, regex: RegExp): string {
  const found = str.match(regex)
  if (found === null) {
    throw new Error('Should always be a match:' + str)
  }
  return found[0]
}
/** A single lexical token produced by the lexer in this file. */
export interface Token {
  /**
   * Token category. In this file `'brace'` is used for all of `(`, `)`, `{`,
   * `}`, `[` and `]`.
   */
  type:
    | 'number'
    | 'word'
    | 'operator'
    | 'string'
    | 'brace'
    | 'whitespace'
    | 'comma'
    | 'colon'
  /** The exact source text of the token. */
  value: string
  /** Index in the source string at which the token starts. */
  start: number
  /** Index just past the token; `makeToken` sets it to `start + value.length`. */
  end: number
}
/**
 * Builds a Token, deriving `end` from `start` and the length of `value`.
 *
 * @param type - Token category.
 * @param value - Source text of the token.
 * @param start - Index in the source string where the token begins.
 * @returns The token, with `end` equal to `start + value.length`.
 */
const makeToken = (
  type: Token['type'],
  value: string,
  start: number
): Token => {
  const end = start + value.length
  return { type, value, start, end }
}
// Token kinds paired with their patterns, in the order they are tried.
// NUMBER comes before OPERATOR so that a `-` directly followed by digits is
// lexed as part of a negative number rather than as a minus operator.
const TOKEN_MATCHERS: ReadonlyArray<readonly [Token['type'], RegExp]> = [
  ['string', STRING],
  ['brace', PARAN_END],
  ['brace', PARAN_START],
  ['brace', BLOCK_START],
  ['brace', BLOCK_END],
  ['brace', ARRAY_START],
  ['brace', ARRAY_END],
  ['comma', COMMA],
  ['number', NUMBER],
  ['operator', OPERATOR],
  ['word', WORD],
  ['colon', COLON],
  ['whitespace', WHITESPACE],
]

/**
 * Recognises the token that begins exactly at `startIndex` in `str`.
 *
 * @param str - The full source text.
 * @param startIndex - Index at which the token must begin.
 * @returns The token starting at `startIndex`, or `null` when no pattern
 *   matches at that position.
 */
const returnTokenAtIndex = (str: string, startIndex: number): Token | null => {
  const strFromIndex = str.slice(startIndex)
  for (const [type, pattern] of TOKEN_MATCHERS) {
    // Require the match to begin at offset 0. WHITESPACE is not anchored with
    // `^`, so a plain `test` would also succeed on whitespace found later in
    // the input and yield a token whose text is not actually at `startIndex`.
    if (strFromIndex.search(pattern) === 0) {
      return makeToken(type, matchFirst(strFromIndex, pattern), startIndex)
    }
  }
  return null
}
/**
 * Splits source text into a flat list of tokens, in source order.
 *
 * Characters at which no token pattern matches are skipped one at a time and
 * produce no token. Whitespace is kept as `'whitespace'` tokens.
 *
 * Implemented as a loop rather than recursion: one stack frame per token
 * would overflow the call stack on large inputs, and rebuilding the token
 * array on every step is quadratic.
 *
 * @param str - The source text to tokenise.
 * @returns The tokens found, each carrying its start/end offsets in `str`.
 */
export const lexer = (str: string): Token[] => {
  const tokens: Token[] = []
  let currentIndex = 0
  while (currentIndex < str.length) {
    const token = returnTokenAtIndex(str, currentIndex)
    if (token === null) {
      // Unrecognised character: step over it without emitting a token.
      currentIndex += 1
      continue
    }
    tokens.push(token)
    currentIndex += token.value.length
  }
  return tokens
}