2022-11-26 20:38:07 +11:00
|
|
|
// regular expression for a number: an optional leading minus sign, digits, and an optional decimal part
|
|
|
|
// Anchored with ^, so only a number at the very start of the tested string matches (e.g. "12", "-3", "4.5").
const NUMBER = /^-?\d+(\.\d+)?/
|
|
|
|
|
2022-11-26 08:34:23 +11:00
|
|
|
// One or more whitespace characters.
// NOTE: unlike the other patterns in this file, this one has no ^ anchor, so it
// matches whitespace anywhere in the tested string, not only at the start.
const WHITESPACE = /\s+/
|
|
|
|
// Identifier-like word at the start of the string: a letter or underscore,
// followed by any number of letters, digits or underscores.
const WORD = /^[a-zA-Z_][a-zA-Z0-9_]*/
|
2022-11-12 13:12:20 +11:00
|
|
|
// regex that matches a quoted string at the start of the input: an opening quote, everything up to the next non-escaped quote of the same kind, and that closing quote (the full match includes both quotes)
|
2022-11-26 08:34:23 +11:00
|
|
|
// Group 1 is the opening quote (single or double), reused as \1 for the closing quote.
// Group 2 is an optional backslash found by lookahead; consuming it together with the
// following character means an escaped quote cannot terminate the string.
const STRING = /^(["'])(?:(?=(\\?))\2.)*?\1/
|
2022-11-17 16:05:14 +11:00
|
|
|
// verbose regex for finding operators; multi-character operators must be listed first so that e.g. `>=` is not matched as just `>`
|
2022-11-26 08:34:23 +11:00
|
|
|
// Operators recognised at the start of the string: >= <= == => != * + - / % = < > | ^
const OPERATOR = /^(>=|<=|==|=>|!=|\*|\+|-|\/|%|=|<|>|\||\^)/
|
2022-11-17 16:05:14 +11:00
|
|
|
|
2022-11-26 08:34:23 +11:00
|
|
|
// An opening curly brace at the start of the string.
const BLOCK_START = /^\{/
|
|
|
|
// A closing curly brace at the start of the string.
const BLOCK_END = /^\}/
|
|
|
|
// An opening parenthesis at the start of the string.
const PARAN_START = /^\(/
|
|
|
|
// A closing parenthesis at the start of the string.
const PARAN_END = /^\)/
|
|
|
|
// A comma at the start of the string.
const COMMA = /^,/
|
2022-11-12 13:12:20 +11:00
|
|
|
|
2022-11-26 08:34:23 +11:00
|
|
|
/**
 * Tests whether the input begins with a number as defined by `NUMBER`.
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isNumber = (character: string): boolean => {
  return NUMBER.test(character)
}
|
|
|
|
/**
 * Tests whether the input contains whitespace as defined by `WHITESPACE`.
 * Because that pattern is not anchored, whitespace anywhere in the input
 * (not just at the start) makes this return true.
 */
export const isWhitespace = (character: string): boolean => {
  return WHITESPACE.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with a word as defined by `WORD`.
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isWord = (character: string): boolean => {
  return WORD.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with a complete quoted string as defined by
 * `STRING`. Despite the parameter name, the input may be longer than one
 * character; only its start is inspected.
 */
export const isString = (character: string): boolean => {
  return STRING.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with an operator as defined by `OPERATOR`.
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isOperator = (character: string): boolean => {
  return OPERATOR.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with an opening curly brace (`BLOCK_START`).
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isBlockStart = (character: string): boolean => {
  return BLOCK_START.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with a closing curly brace (`BLOCK_END`).
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isBlockEnd = (character: string): boolean => {
  return BLOCK_END.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with an opening parenthesis (`PARAN_START`).
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isParanStart = (character: string): boolean => {
  return PARAN_START.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with a closing parenthesis (`PARAN_END`).
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isParanEnd = (character: string): boolean => {
  return PARAN_END.test(character)
}
|
|
|
|
/**
 * Tests whether the input begins with a comma (`COMMA`).
 * Despite the parameter name, the input may be longer than one character;
 * only its start is inspected.
 */
export const isComma = (character: string): boolean => {
  return COMMA.test(character)
}
|
2022-11-12 13:12:20 +11:00
|
|
|
|
|
|
|
/**
 * Returns the text of the first match of `regex` in `str`.
 *
 * @param str - The string to search.
 * @param regex - The pattern to apply.
 * @returns The matched text (element 0 of the match result).
 * @throws Error if `regex` does not match `str`.
 */
function matchFirst(str: string, regex: RegExp) {
  const found = str.match(regex)
  if (found) {
    return found[0]
  }
  throw new Error('Should always be a match:' + str)
}
|
|
|
|
|
2022-11-12 17:47:41 +11:00
|
|
|
/** A single lexical token together with its position in the lexed string. */
export interface Token {
  /** Category of the token. `'brace'` is used for both parentheses and curly braces. */
  type:
    'number' | 'word' | 'operator' | 'string' | 'brace' | 'whitespace' | 'comma'
  /** The text of the token. */
  value: string
  /** Index in the lexed string at which the token starts. */
  start: number
  /** End index of the token; `makeToken` computes it as `start + value.length`. */
  end: number
}
|
|
|
|
|
2022-11-26 08:34:23 +11:00
|
|
|
/**
 * Builds a `Token` from its type, text and start index.
 *
 * @param type - Token category.
 * @param value - Text of the token.
 * @param start - Index at which the token starts.
 * @returns The token, with `end` set to `start + value.length`.
 */
const makeToken = (
  type: Token['type'],
  value: string,
  start: number
): Token => {
  const end = start + value.length
  return { type, value, start, end }
}
|
|
|
|
|
2022-11-12 13:33:35 +11:00
|
|
|
// Whitespace anchored to the start of the input. The shared WHITESPACE pattern
// is unanchored, so using it here would report whitespace that occurs later in
// the string as if it began at the current index.
const LEADING_WHITESPACE = /^\s+/

// Token patterns in the order they are tried; the first match wins.
// NUMBER is tried before OPERATOR so that a minus sign directly followed by
// digits is read as a negative number rather than as an operator.
const TOKEN_MATCHERS: ReadonlyArray<readonly [Token['type'], RegExp]> = [
  ['string', STRING],
  ['brace', PARAN_END],
  ['brace', PARAN_START],
  ['brace', BLOCK_START],
  ['brace', BLOCK_END],
  ['comma', COMMA],
  ['number', NUMBER],
  ['operator', OPERATOR],
  ['word', WORD],
  ['whitespace', LEADING_WHITESPACE],
]

/**
 * Reads the token that begins exactly at `startIndex` in `str`.
 *
 * @param str - The full string being lexed.
 * @param startIndex - Index in `str` at which the token must start.
 * @returns The token found at `startIndex`, or `null` when no pattern matches
 * at that position (including when `startIndex` is at or past the end).
 */
const returnTokenAtIndex = (str: string, startIndex: number): Token | null => {
  const strFromIndex = str.slice(startIndex)
  for (const [type, pattern] of TOKEN_MATCHERS) {
    // Every pattern is anchored with ^, so a match always starts at startIndex.
    const found = strFromIndex.match(pattern)
    if (found) {
      return makeToken(type, found[0], startIndex)
    }
  }
  return null
}
|
2022-11-12 13:12:20 +11:00
|
|
|
|
2022-11-12 13:33:35 +11:00
|
|
|
/**
 * Splits `str` into a flat list of tokens, scanning from left to right.
 *
 * Positions at which `returnTokenAtIndex` finds no token are skipped one
 * character at a time and produce no token.
 *
 * Implemented as a loop rather than recursion so that the call stack does not
 * grow with the number of tokens, and so that tokens are appended to a single
 * array instead of copying the accumulated list on every step.
 *
 * @param str - The string to tokenise.
 * @returns The tokens in the order they appear in `str`.
 */
export const lexer = (str: string): Token[] => {
  const tokens: Token[] = []
  let currentIndex = 0
  while (currentIndex < str.length) {
    const token = returnTokenAtIndex(str, currentIndex)
    if (token) {
      tokens.push(token)
      currentIndex += token.value.length
    } else {
      // Nothing recognised here: step over this character and try again.
      currentIndex += 1
    }
  }
  return tokens
}
|