Tokenizer is accidentally quadratic (#689)

* Add comments and rename a function

* Typo: paran -> paren

* Use bytes, not string, for the tokenizer

* Fix typo
This commit is contained in:
Adam Chalmers
2023-09-21 14:18:42 -05:00
committed by GitHub
parent 0c724c4971
commit d820cf2446
2 changed files with 164 additions and 159 deletions

View File

@@ -3,7 +3,7 @@ use std::str::FromStr;
use anyhow::Result;
use lazy_static::lazy_static;
use parse_display::{Display, FromStr};
use regex::Regex;
use regex::bytes::Regex;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use tower_lsp::lsp_types::SemanticTokenType;
@@ -44,6 +44,7 @@ pub enum TokenType {
Function,
}
/// Most KCL tokens correspond to LSP semantic tokens (but not all).
impl TryFrom<TokenType> for SemanticTokenType {
type Error = anyhow::Error;
fn try_from(token_type: TokenType) -> Result<Self> {
@@ -70,7 +71,7 @@ impl TryFrom<TokenType> for SemanticTokenType {
impl TokenType {
// This is for the lsp server.
pub fn to_semantic_token_types() -> Result<Vec<SemanticTokenType>> {
pub fn all_semantic_token_types() -> Result<Vec<SemanticTokenType>> {
let mut settings = schemars::gen::SchemaSettings::openapi3();
settings.inline_subschemas = true;
let mut generator = schemars::gen::SchemaGenerator::new(settings);
@@ -119,7 +120,9 @@ impl TokenType {
pub struct Token {
#[serde(rename = "type")]
pub token_type: TokenType,
/// Offset in the source code where this token begins.
pub start: usize,
/// Offset in the source code where this token ends.
pub end: usize,
pub value: String,
}
@@ -159,66 +162,68 @@ lazy_static! {
static ref BLOCKCOMMENT: Regex = Regex::new(r"^/\*[\s\S]*?\*/").unwrap();
}
fn is_number(character: &str) -> bool {
fn is_number(character: &[u8]) -> bool {
NUMBER.is_match(character)
}
fn is_whitespace(character: &str) -> bool {
fn is_whitespace(character: &[u8]) -> bool {
WHITESPACE.is_match(character)
}
fn is_word(character: &str) -> bool {
fn is_word(character: &[u8]) -> bool {
WORD.is_match(character)
}
fn is_keyword(character: &str) -> bool {
fn is_keyword(character: &[u8]) -> bool {
KEYWORD.is_match(character)
}
fn is_string(character: &str) -> bool {
fn is_string(character: &[u8]) -> bool {
match STRING.find(character) {
Some(m) => m.start() == 0,
None => false,
}
}
fn is_operator(character: &str) -> bool {
fn is_operator(character: &[u8]) -> bool {
OPERATOR.is_match(character)
}
fn is_block_start(character: &str) -> bool {
fn is_block_start(character: &[u8]) -> bool {
BLOCK_START.is_match(character)
}
fn is_block_end(character: &str) -> bool {
fn is_block_end(character: &[u8]) -> bool {
BLOCK_END.is_match(character)
}
fn is_paran_start(character: &str) -> bool {
fn is_paren_start(character: &[u8]) -> bool {
PARAN_START.is_match(character)
}
fn is_paran_end(character: &str) -> bool {
fn is_paren_end(character: &[u8]) -> bool {
PARAN_END.is_match(character)
}
fn is_array_start(character: &str) -> bool {
fn is_array_start(character: &[u8]) -> bool {
ARRAY_START.is_match(character)
}
fn is_array_end(character: &str) -> bool {
fn is_array_end(character: &[u8]) -> bool {
ARRAY_END.is_match(character)
}
fn is_comma(character: &str) -> bool {
fn is_comma(character: &[u8]) -> bool {
COMMA.is_match(character)
}
fn is_colon(character: &str) -> bool {
fn is_colon(character: &[u8]) -> bool {
COLON.is_match(character)
}
fn is_double_period(character: &str) -> bool {
fn is_double_period(character: &[u8]) -> bool {
DOUBLE_PERIOD.is_match(character)
}
fn is_period(character: &str) -> bool {
fn is_period(character: &[u8]) -> bool {
PERIOD.is_match(character)
}
fn is_line_comment(character: &str) -> bool {
fn is_line_comment(character: &[u8]) -> bool {
LINECOMMENT.is_match(character)
}
fn is_block_comment(character: &str) -> bool {
fn is_block_comment(character: &[u8]) -> bool {
BLOCKCOMMENT.is_match(character)
}
fn match_first(s: &str, regex: &Regex) -> Option<String> {
regex.find(s).map(|the_match| the_match.as_str().to_string())
fn match_first(s: &[u8], regex: &Regex) -> Option<String> {
regex
.find(s)
.map(|the_match| String::from_utf8_lossy(the_match.as_bytes()).into())
}
fn make_token(token_type: TokenType, value: &str, start: usize) -> Token {
@@ -230,8 +235,7 @@ fn make_token(token_type: TokenType, value: &str, start: usize) -> Token {
}
}
fn return_token_at_index(s: &str, start_index: usize) -> Option<Token> {
let str_from_index = &s.chars().skip(start_index).collect::<String>();
fn return_token_at_index(str_from_index: &[u8], start_index: usize) -> Option<Token> {
if is_string(str_from_index) {
return Some(make_token(
TokenType::String,
@@ -258,14 +262,14 @@ fn return_token_at_index(s: &str, start_index: usize) -> Option<Token> {
start_index,
));
}
if is_paran_end(str_from_index) {
if is_paren_end(str_from_index) {
return Some(make_token(
TokenType::Brace,
&match_first(str_from_index, &PARAN_END)?,
start_index,
));
}
if is_paran_start(str_from_index) {
if is_paren_start(str_from_index) {
return Some(make_token(
TokenType::Brace,
&match_first(str_from_index, &PARAN_START)?,
@@ -366,11 +370,11 @@ fn return_token_at_index(s: &str, start_index: usize) -> Option<Token> {
None
}
fn recursively_tokenise(s: &str, current_index: usize, previous_tokens: Vec<Token>) -> Vec<Token> {
fn recursively_tokenise(s: &[u8], current_index: usize, previous_tokens: Vec<Token>) -> Vec<Token> {
if current_index >= s.len() {
return previous_tokens;
}
let token = return_token_at_index(s, current_index);
let token = return_token_at_index(&s[current_index..], current_index);
let Some(token) = token else {
return recursively_tokenise(s, current_index + 1, previous_tokens);
};
@@ -381,7 +385,7 @@ fn recursively_tokenise(s: &str, current_index: usize, previous_tokens: Vec<Toke
}
pub fn lexer(s: &str) -> Vec<Token> {
recursively_tokenise(s, 0, Vec::new())
recursively_tokenise(s.as_bytes(), 0, Vec::new())
}
#[cfg(test)]
@@ -392,192 +396,193 @@ mod tests {
#[test]
fn is_number_test() {
assert!(is_number("1"));
assert!(is_number("1 abc"));
assert!(is_number("1.1"));
assert!(is_number("1.1 abc"));
assert!(!is_number("a"));
assert!(is_number("1".as_bytes()));
assert!(is_number("1 abc".as_bytes()));
assert!(is_number("1.1".as_bytes()));
assert!(is_number("1.1 abc".as_bytes()));
assert!(!is_number("a".as_bytes()));
assert!(is_number("1"));
assert!(is_number(".1"));
assert!(is_number("5?"));
assert!(is_number("5 + 6"));
assert!(is_number("5 + a"));
assert!(is_number("5.5"));
assert!(is_number("1".as_bytes()));
assert!(is_number(".1".as_bytes()));
assert!(is_number("5?".as_bytes()));
assert!(is_number("5 + 6".as_bytes()));
assert!(is_number("5 + a".as_bytes()));
assert!(is_number("5.5".as_bytes()));
assert!(!is_number("1abc"));
assert!(!is_number("a"));
assert!(!is_number("?"));
assert!(!is_number("?5"));
assert!(!is_number("1abc".as_bytes()));
assert!(!is_number("a".as_bytes()));
assert!(!is_number("?".as_bytes()));
assert!(!is_number("?5".as_bytes()));
}
#[test]
fn is_whitespace_test() {
assert!(is_whitespace(" "));
assert!(is_whitespace(" "));
assert!(is_whitespace(" a"));
assert!(is_whitespace("a "));
assert!(is_whitespace(" ".as_bytes()));
assert!(is_whitespace(" ".as_bytes()));
assert!(is_whitespace(" a".as_bytes()));
assert!(is_whitespace("a ".as_bytes()));
assert!(!is_whitespace("a"));
assert!(!is_whitespace("?"));
assert!(!is_whitespace("a".as_bytes()));
assert!(!is_whitespace("?".as_bytes()));
}
#[test]
fn is_word_test() {
assert!(is_word("a"));
assert!(is_word("a "));
assert!(is_word("a5"));
assert!(is_word("a5a"));
assert!(is_word("a".as_bytes()));
assert!(is_word("a ".as_bytes()));
assert!(is_word("a5".as_bytes()));
assert!(is_word("a5a".as_bytes()));
assert!(!is_word("5"));
assert!(!is_word("5a"));
assert!(!is_word("5a5"));
assert!(!is_word("5".as_bytes()));
assert!(!is_word("5a".as_bytes()));
assert!(!is_word("5a5".as_bytes()));
}
#[test]
fn is_string_test() {
assert!(is_string("\"\""));
assert!(is_string("\"a\""));
assert!(is_string("\"a\" "));
assert!(is_string("\"a\"5"));
assert!(is_string("'a'5"));
assert!(is_string("\"with escaped \\\" backslash\""));
assert!(is_string("\"\"".as_bytes()));
assert!(is_string("\"a\"".as_bytes()));
assert!(is_string("\"a\" ".as_bytes()));
assert!(is_string("\"a\"5".as_bytes()));
assert!(is_string("'a'5".as_bytes()));
assert!(is_string("\"with escaped \\\" backslash\"".as_bytes()));
assert!(!is_string("\""));
assert!(!is_string("\"a"));
assert!(!is_string("a\""));
assert!(!is_string(" \"a\""));
assert!(!is_string("5\"a\""));
assert!(!is_string("a + 'str'"));
assert!(is_string("'c'"));
assert!(!is_string("\"".as_bytes()));
assert!(!is_string("\"a".as_bytes()));
assert!(!is_string("a\"".as_bytes()));
assert!(!is_string(" \"a\"".as_bytes()));
assert!(!is_string("5\"a\"".as_bytes()));
assert!(!is_string("a + 'str'".as_bytes()));
assert!(is_string("'c'".as_bytes()));
}
#[test]
fn is_operator_test() {
assert!(is_operator("+"));
assert!(is_operator("+ "));
assert!(is_operator("-"));
assert!(is_operator("<="));
assert!(is_operator("<= "));
assert!(is_operator(">="));
assert!(is_operator(">= "));
assert!(is_operator("> "));
assert!(is_operator("< "));
assert!(is_operator("| "));
assert!(is_operator("|> "));
assert!(is_operator("^ "));
assert!(is_operator("% "));
assert!(is_operator("+* "));
assert!(is_operator("+".as_bytes()));
assert!(is_operator("+ ".as_bytes()));
assert!(is_operator("-".as_bytes()));
assert!(is_operator("<=".as_bytes()));
assert!(is_operator("<= ".as_bytes()));
assert!(is_operator(">=".as_bytes()));
assert!(is_operator(">= ".as_bytes()));
assert!(is_operator("> ".as_bytes()));
assert!(is_operator("< ".as_bytes()));
assert!(is_operator("| ".as_bytes()));
assert!(is_operator("|> ".as_bytes()));
assert!(is_operator("^ ".as_bytes()));
assert!(is_operator("% ".as_bytes()));
assert!(is_operator("+* ".as_bytes()));
assert!(!is_operator("5 + 5"));
assert!(!is_operator("a"));
assert!(!is_operator("a+"));
assert!(!is_operator("a+5"));
assert!(!is_operator("5a+5"));
assert!(!is_operator(", newVar"));
assert!(!is_operator(","));
assert!(!is_operator("5 + 5".as_bytes()));
assert!(!is_operator("a".as_bytes()));
assert!(!is_operator("a+".as_bytes()));
assert!(!is_operator("a+5".as_bytes()));
assert!(!is_operator("5a+5".as_bytes()));
assert!(!is_operator(", newVar".as_bytes()));
assert!(!is_operator(",".as_bytes()));
}
#[test]
fn is_block_start_test() {
assert!(is_block_start("{"));
assert!(is_block_start("{ "));
assert!(is_block_start("{5"));
assert!(is_block_start("{a"));
assert!(is_block_start("{5 "));
assert!(is_block_start("{".as_bytes()));
assert!(is_block_start("{ ".as_bytes()));
assert!(is_block_start("{5".as_bytes()));
assert!(is_block_start("{a".as_bytes()));
assert!(is_block_start("{5 ".as_bytes()));
assert!(!is_block_start("5"));
assert!(!is_block_start("5 + 5"));
assert!(!is_block_start("5{ + 5"));
assert!(!is_block_start("a{ + 5"));
assert!(!is_block_start(" { + 5"));
assert!(!is_block_start("5".as_bytes()));
assert!(!is_block_start("5 + 5".as_bytes()));
assert!(!is_block_start("5{ + 5".as_bytes()));
assert!(!is_block_start("a{ + 5".as_bytes()));
assert!(!is_block_start(" { + 5".as_bytes()));
}
#[test]
fn is_block_end_test() {
assert!(is_block_end("}"));
assert!(is_block_end("} "));
assert!(is_block_end("}5"));
assert!(is_block_end("}5 "));
assert!(is_block_end("}".as_bytes()));
assert!(is_block_end("} ".as_bytes()));
assert!(is_block_end("}5".as_bytes()));
assert!(is_block_end("}5 ".as_bytes()));
assert!(!is_block_end("5"));
assert!(!is_block_end("5 + 5"));
assert!(!is_block_end("5} + 5"));
assert!(!is_block_end(" } + 5"));
assert!(!is_block_end("5".as_bytes()));
assert!(!is_block_end("5 + 5".as_bytes()));
assert!(!is_block_end("5} + 5".as_bytes()));
assert!(!is_block_end(" } + 5".as_bytes()));
}
#[test]
fn is_paran_start_test() {
assert!(is_paran_start("("));
assert!(is_paran_start("( "));
assert!(is_paran_start("(5"));
assert!(is_paran_start("(5 "));
assert!(is_paran_start("(5 + 5"));
assert!(is_paran_start("(5 + 5)"));
assert!(is_paran_start("(5 + 5) "));
fn is_paren_start_test() {
assert!(is_paren_start("(".as_bytes()));
assert!(is_paren_start("( ".as_bytes()));
assert!(is_paren_start("(5".as_bytes()));
assert!(is_paren_start("(5 ".as_bytes()));
assert!(is_paren_start("(5 + 5".as_bytes()));
assert!(is_paren_start("(5 + 5)".as_bytes()));
assert!(is_paren_start("(5 + 5) ".as_bytes()));
assert!(!is_paran_start("5"));
assert!(!is_paran_start("5 + 5"));
assert!(!is_paran_start("5( + 5)"));
assert!(!is_paran_start(" ( + 5)"));
assert!(!is_paren_start("5".as_bytes()));
assert!(!is_paren_start("5 + 5".as_bytes()));
assert!(!is_paren_start("5( + 5)".as_bytes()));
assert!(!is_paren_start(" ( + 5)".as_bytes()));
}
#[test]
fn is_paran_end_test() {
assert!(is_paran_end(")"));
assert!(is_paran_end(") "));
assert!(is_paran_end(")5"));
assert!(is_paran_end(")5 "));
fn is_paren_end_test() {
assert!(is_paren_end(")".as_bytes()));
assert!(is_paren_end(") ".as_bytes()));
assert!(is_paren_end(")5".as_bytes()));
assert!(is_paren_end(")5 ".as_bytes()));
assert!(!is_paran_end("5"));
assert!(!is_paran_end("5 + 5"));
assert!(!is_paran_end("5) + 5"));
assert!(!is_paran_end(" ) + 5"));
assert!(!is_paren_end("5".as_bytes()));
assert!(!is_paren_end("5 + 5".as_bytes()));
assert!(!is_paren_end("5) + 5".as_bytes()));
assert!(!is_paren_end(" ) + 5".as_bytes()));
}
#[test]
fn is_comma_test() {
assert!(is_comma(","));
assert!(is_comma(", "));
assert!(is_comma(",5"));
assert!(is_comma(",5 "));
assert!(is_comma(",".as_bytes()));
assert!(is_comma(", ".as_bytes()));
assert!(is_comma(",5".as_bytes()));
assert!(is_comma(",5 ".as_bytes()));
assert!(!is_comma("5"));
assert!(!is_comma("5 + 5"));
assert!(!is_comma("5, + 5"));
assert!(!is_comma(" , + 5"));
assert!(!is_comma("5".as_bytes()));
assert!(!is_comma("5 + 5".as_bytes()));
assert!(!is_comma("5, + 5".as_bytes()));
assert!(!is_comma(" , + 5".as_bytes()));
}
#[test]
fn is_line_comment_test() {
assert!(is_line_comment("//"));
assert!(is_line_comment("// "));
assert!(is_line_comment("//5"));
assert!(is_line_comment("//5 "));
assert!(is_line_comment("//".as_bytes()));
assert!(is_line_comment("// ".as_bytes()));
assert!(is_line_comment("//5".as_bytes()));
assert!(is_line_comment("//5 ".as_bytes()));
assert!(!is_line_comment("5"));
assert!(!is_line_comment("5 + 5"));
assert!(!is_line_comment("5// + 5"));
assert!(!is_line_comment(" // + 5"));
assert!(!is_line_comment("5".as_bytes()));
assert!(!is_line_comment("5 + 5".as_bytes()));
assert!(!is_line_comment("5// + 5".as_bytes()));
assert!(!is_line_comment(" // + 5".as_bytes()));
}
#[test]
fn is_block_comment_test() {
assert!(is_block_comment("/* */"));
assert!(is_block_comment("/***/"));
assert!(is_block_comment("/*5*/"));
assert!(is_block_comment("/*5 */"));
assert!(is_block_comment("/* */".as_bytes()));
assert!(is_block_comment("/***/".as_bytes()));
assert!(is_block_comment("/*5*/".as_bytes()));
assert!(is_block_comment("/*5 */".as_bytes()));
assert!(!is_block_comment("/*"));
assert!(!is_block_comment("5"));
assert!(!is_block_comment("5 + 5"));
assert!(!is_block_comment("5/* + 5"));
assert!(!is_block_comment(" /* + 5"));
assert!(!is_block_comment("/*".as_bytes()));
assert!(!is_block_comment("5".as_bytes()));
assert!(!is_block_comment("5 + 5".as_bytes()));
assert!(!is_block_comment("5/* + 5".as_bytes()));
assert!(!is_block_comment(" /* + 5".as_bytes()));
assert!(!is_block_comment(
r#" /* and
here
*/
"#
.as_bytes()
));
}
@@ -597,7 +602,7 @@ mod tests {
#[test]
fn return_token_at_index_test() {
assert_eq!(
return_token_at_index("const", 0),
return_token_at_index("const".as_bytes(), 0),
Some(Token {
token_type: TokenType::Keyword,
value: "const".to_string(),
@@ -606,7 +611,7 @@ mod tests {
})
);
assert_eq!(
return_token_at_index(" 4554", 2),
return_token_at_index("4554".as_bytes(), 2),
Some(Token {
token_type: TokenType::Number,
value: "4554".to_string(),
@@ -717,7 +722,7 @@ mod tests {
// We have this as a test so we can ensure it never panics with an unwrap in the server.
#[test]
fn test_token_type_to_semantic_token_type() {
let semantic_types = TokenType::to_semantic_token_types().unwrap();
let semantic_types = TokenType::all_semantic_token_types().unwrap();
assert!(!semantic_types.is_empty());
}

View File

@@ -149,7 +149,7 @@ pub async fn lsp_run(config: ServerConfig) -> Result<(), JsValue> {
let stdlib_signatures = get_signatures_from_stdlib(&stdlib).map_err(|e| e.to_string())?;
// We can unwrap here because we know the tokeniser is valid, since
// we have a test for it.
let token_types = kcl_lib::tokeniser::TokenType::to_semantic_token_types().unwrap();
let token_types = kcl_lib::tokeniser::TokenType::all_semantic_token_types().unwrap();
let (service, socket) = LspService::new(|client| Backend {
client,