Tokenizer is accidentally quadratic (#689)

* Add comments and rename a function

* Typo: paran -> paren

* Use bytes, not string, for the tokenizer

* Fix typo
This commit is contained in:
Adam Chalmers
2023-09-21 14:18:42 -05:00
committed by GitHub
parent 0c724c4971
commit d820cf2446
2 changed files with 164 additions and 159 deletions

View File

@@ -3,7 +3,7 @@ use std::str::FromStr;
use anyhow::Result;
use lazy_static::lazy_static;
use parse_display::{Display, FromStr};
use regex::Regex;
use regex::bytes::Regex;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use tower_lsp::lsp_types::SemanticTokenType;
@@ -44,6 +44,7 @@ pub enum TokenType {
Function,
}
/// Most KCL tokens correspond to LSP semantic tokens (but not all).
impl TryFrom<TokenType> for SemanticTokenType {
type Error = anyhow::Error;
fn try_from(token_type: TokenType) -> Result<Self> {
@@ -70,7 +71,7 @@ impl TryFrom<TokenType> for SemanticTokenType {
impl TokenType {
// This is for the lsp server.
pub fn to_semantic_token_types() -> Result<Vec<SemanticTokenType>> {
pub fn all_semantic_token_types() -> Result<Vec<SemanticTokenType>> {
let mut settings = schemars::gen::SchemaSettings::openapi3();
settings.inline_subschemas = true;
let mut generator = schemars::gen::SchemaGenerator::new(settings);
@@ -119,7 +120,9 @@ impl TokenType {
pub struct Token {
#[serde(rename = "type")]
pub token_type: TokenType,
/// Offset in the source code where this token begins.
pub start: usize,
/// Offset in the source code where this token ends.
pub end: usize,
pub value: String,
}
@@ -159,66 +162,68 @@ lazy_static! {
static ref BLOCKCOMMENT: Regex = Regex::new(r"^/\*[\s\S]*?\*/").unwrap();
}
fn is_number(character: &str) -> bool {
fn is_number(character: &[u8]) -> bool {
NUMBER.is_match(character)
}
fn is_whitespace(character: &str) -> bool {
fn is_whitespace(character: &[u8]) -> bool {
WHITESPACE.is_match(character)
}
fn is_word(character: &str) -> bool {
fn is_word(character: &[u8]) -> bool {
WORD.is_match(character)
}
fn is_keyword(character: &str) -> bool {
fn is_keyword(character: &[u8]) -> bool {
KEYWORD.is_match(character)
}
fn is_string(character: &str) -> bool {
fn is_string(character: &[u8]) -> bool {
match STRING.find(character) {
Some(m) => m.start() == 0,
None => false,
}
}
fn is_operator(character: &str) -> bool {
fn is_operator(character: &[u8]) -> bool {
OPERATOR.is_match(character)
}
fn is_block_start(character: &str) -> bool {
fn is_block_start(character: &[u8]) -> bool {
BLOCK_START.is_match(character)
}
fn is_block_end(character: &str) -> bool {
fn is_block_end(character: &[u8]) -> bool {
BLOCK_END.is_match(character)
}
fn is_paran_start(character: &str) -> bool {
fn is_paren_start(character: &[u8]) -> bool {
PARAN_START.is_match(character)
}
fn is_paran_end(character: &str) -> bool {
fn is_paren_end(character: &[u8]) -> bool {
PARAN_END.is_match(character)
}
fn is_array_start(character: &str) -> bool {
fn is_array_start(character: &[u8]) -> bool {
ARRAY_START.is_match(character)
}
fn is_array_end(character: &str) -> bool {
fn is_array_end(character: &[u8]) -> bool {
ARRAY_END.is_match(character)
}
fn is_comma(character: &str) -> bool {
fn is_comma(character: &[u8]) -> bool {
COMMA.is_match(character)
}
fn is_colon(character: &str) -> bool {
fn is_colon(character: &[u8]) -> bool {
COLON.is_match(character)
}
fn is_double_period(character: &str) -> bool {
fn is_double_period(character: &[u8]) -> bool {
DOUBLE_PERIOD.is_match(character)
}
fn is_period(character: &str) -> bool {
fn is_period(character: &[u8]) -> bool {
PERIOD.is_match(character)
}
fn is_line_comment(character: &str) -> bool {
fn is_line_comment(character: &[u8]) -> bool {
LINECOMMENT.is_match(character)
}
fn is_block_comment(character: &str) -> bool {
fn is_block_comment(character: &[u8]) -> bool {
BLOCKCOMMENT.is_match(character)
}
fn match_first(s: &str, regex: &Regex) -> Option<String> {
regex.find(s).map(|the_match| the_match.as_str().to_string())
fn match_first(s: &[u8], regex: &Regex) -> Option<String> {
regex
.find(s)
.map(|the_match| String::from_utf8_lossy(the_match.as_bytes()).into())
}
fn make_token(token_type: TokenType, value: &str, start: usize) -> Token {
@@ -230,8 +235,7 @@ fn make_token(token_type: TokenType, value: &str, start: usize) -> Token {
}
}
fn return_token_at_index(s: &str, start_index: usize) -> Option<Token> {
let str_from_index = &s.chars().skip(start_index).collect::<String>();
fn return_token_at_index(str_from_index: &[u8], start_index: usize) -> Option<Token> {
if is_string(str_from_index) {
return Some(make_token(
TokenType::String,
@@ -258,14 +262,14 @@ fn return_token_at_index(s: &str, start_index: usize) -> Option<Token> {
start_index,
));
}
if is_paran_end(str_from_index) {
if is_paren_end(str_from_index) {
return Some(make_token(
TokenType::Brace,
&match_first(str_from_index, &PARAN_END)?,
start_index,
));
}
if is_paran_start(str_from_index) {
if is_paren_start(str_from_index) {
return Some(make_token(
TokenType::Brace,
&match_first(str_from_index, &PARAN_START)?,
@@ -366,11 +370,11 @@ fn return_token_at_index(s: &str, start_index: usize) -> Option<Token> {
None
}
fn recursively_tokenise(s: &str, current_index: usize, previous_tokens: Vec<Token>) -> Vec<Token> {
fn recursively_tokenise(s: &[u8], current_index: usize, previous_tokens: Vec<Token>) -> Vec<Token> {
if current_index >= s.len() {
return previous_tokens;
}
let token = return_token_at_index(s, current_index);
let token = return_token_at_index(&s[current_index..], current_index);
let Some(token) = token else {
return recursively_tokenise(s, current_index + 1, previous_tokens);
};
@@ -381,7 +385,7 @@ fn recursively_tokenise(s: &str, current_index: usize, previous_tokens: Vec<Toke
}
pub fn lexer(s: &str) -> Vec<Token> {
recursively_tokenise(s, 0, Vec::new())
recursively_tokenise(s.as_bytes(), 0, Vec::new())
}
#[cfg(test)]
@@ -392,192 +396,193 @@ mod tests {
#[test]
fn is_number_test() {
assert!(is_number("1"));
assert!(is_number("1 abc"));
assert!(is_number("1.1"));
assert!(is_number("1.1 abc"));
assert!(!is_number("a"));
assert!(is_number("1".as_bytes()));
assert!(is_number("1 abc".as_bytes()));
assert!(is_number("1.1".as_bytes()));
assert!(is_number("1.1 abc".as_bytes()));
assert!(!is_number("a".as_bytes()));
assert!(is_number("1"));
assert!(is_number(".1"));
assert!(is_number("5?"));
assert!(is_number("5 + 6"));
assert!(is_number("5 + a"));
assert!(is_number("5.5"));
assert!(is_number("1".as_bytes()));
assert!(is_number(".1".as_bytes()));
assert!(is_number("5?".as_bytes()));
assert!(is_number("5 + 6".as_bytes()));
assert!(is_number("5 + a".as_bytes()));
assert!(is_number("5.5".as_bytes()));
assert!(!is_number("1abc"));
assert!(!is_number("a"));
assert!(!is_number("?"));
assert!(!is_number("?5"));
assert!(!is_number("1abc".as_bytes()));
assert!(!is_number("a".as_bytes()));
assert!(!is_number("?".as_bytes()));
assert!(!is_number("?5".as_bytes()));
}
#[test]
fn is_whitespace_test() {
assert!(is_whitespace(" "));
assert!(is_whitespace(" "));
assert!(is_whitespace(" a"));
assert!(is_whitespace("a "));
assert!(is_whitespace(" ".as_bytes()));
assert!(is_whitespace(" ".as_bytes()));
assert!(is_whitespace(" a".as_bytes()));
assert!(is_whitespace("a ".as_bytes()));
assert!(!is_whitespace("a"));
assert!(!is_whitespace("?"));
assert!(!is_whitespace("a".as_bytes()));
assert!(!is_whitespace("?".as_bytes()));
}
#[test]
fn is_word_test() {
assert!(is_word("a"));
assert!(is_word("a "));
assert!(is_word("a5"));
assert!(is_word("a5a"));
assert!(is_word("a".as_bytes()));
assert!(is_word("a ".as_bytes()));
assert!(is_word("a5".as_bytes()));
assert!(is_word("a5a".as_bytes()));
assert!(!is_word("5"));
assert!(!is_word("5a"));
assert!(!is_word("5a5"));
assert!(!is_word("5".as_bytes()));
assert!(!is_word("5a".as_bytes()));
assert!(!is_word("5a5".as_bytes()));
}
#[test]
fn is_string_test() {
assert!(is_string("\"\""));
assert!(is_string("\"a\""));
assert!(is_string("\"a\" "));
assert!(is_string("\"a\"5"));
assert!(is_string("'a'5"));
assert!(is_string("\"with escaped \\\" backslash\""));
assert!(is_string("\"\"".as_bytes()));
assert!(is_string("\"a\"".as_bytes()));
assert!(is_string("\"a\" ".as_bytes()));
assert!(is_string("\"a\"5".as_bytes()));
assert!(is_string("'a'5".as_bytes()));
assert!(is_string("\"with escaped \\\" backslash\"".as_bytes()));
assert!(!is_string("\""));
assert!(!is_string("\"a"));
assert!(!is_string("a\""));
assert!(!is_string(" \"a\""));
assert!(!is_string("5\"a\""));
assert!(!is_string("a + 'str'"));
assert!(is_string("'c'"));
assert!(!is_string("\"".as_bytes()));
assert!(!is_string("\"a".as_bytes()));
assert!(!is_string("a\"".as_bytes()));
assert!(!is_string(" \"a\"".as_bytes()));
assert!(!is_string("5\"a\"".as_bytes()));
assert!(!is_string("a + 'str'".as_bytes()));
assert!(is_string("'c'".as_bytes()));
}
#[test]
fn is_operator_test() {
assert!(is_operator("+"));
assert!(is_operator("+ "));
assert!(is_operator("-"));
assert!(is_operator("<="));
assert!(is_operator("<= "));
assert!(is_operator(">="));
assert!(is_operator(">= "));
assert!(is_operator("> "));
assert!(is_operator("< "));
assert!(is_operator("| "));
assert!(is_operator("|> "));
assert!(is_operator("^ "));
assert!(is_operator("% "));
assert!(is_operator("+* "));
assert!(is_operator("+".as_bytes()));
assert!(is_operator("+ ".as_bytes()));
assert!(is_operator("-".as_bytes()));
assert!(is_operator("<=".as_bytes()));
assert!(is_operator("<= ".as_bytes()));
assert!(is_operator(">=".as_bytes()));
assert!(is_operator(">= ".as_bytes()));
assert!(is_operator("> ".as_bytes()));
assert!(is_operator("< ".as_bytes()));
assert!(is_operator("| ".as_bytes()));
assert!(is_operator("|> ".as_bytes()));
assert!(is_operator("^ ".as_bytes()));
assert!(is_operator("% ".as_bytes()));
assert!(is_operator("+* ".as_bytes()));
assert!(!is_operator("5 + 5"));
assert!(!is_operator("a"));
assert!(!is_operator("a+"));
assert!(!is_operator("a+5"));
assert!(!is_operator("5a+5"));
assert!(!is_operator(", newVar"));
assert!(!is_operator(","));
assert!(!is_operator("5 + 5".as_bytes()));
assert!(!is_operator("a".as_bytes()));
assert!(!is_operator("a+".as_bytes()));
assert!(!is_operator("a+5".as_bytes()));
assert!(!is_operator("5a+5".as_bytes()));
assert!(!is_operator(", newVar".as_bytes()));
assert!(!is_operator(",".as_bytes()));
}
#[test]
fn is_block_start_test() {
assert!(is_block_start("{"));
assert!(is_block_start("{ "));
assert!(is_block_start("{5"));
assert!(is_block_start("{a"));
assert!(is_block_start("{5 "));
assert!(is_block_start("{".as_bytes()));
assert!(is_block_start("{ ".as_bytes()));
assert!(is_block_start("{5".as_bytes()));
assert!(is_block_start("{a".as_bytes()));
assert!(is_block_start("{5 ".as_bytes()));
assert!(!is_block_start("5"));
assert!(!is_block_start("5 + 5"));
assert!(!is_block_start("5{ + 5"));
assert!(!is_block_start("a{ + 5"));
assert!(!is_block_start(" { + 5"));
assert!(!is_block_start("5".as_bytes()));
assert!(!is_block_start("5 + 5".as_bytes()));
assert!(!is_block_start("5{ + 5".as_bytes()));
assert!(!is_block_start("a{ + 5".as_bytes()));
assert!(!is_block_start(" { + 5".as_bytes()));
}
#[test]
fn is_block_end_test() {
assert!(is_block_end("}"));
assert!(is_block_end("} "));
assert!(is_block_end("}5"));
assert!(is_block_end("}5 "));
assert!(is_block_end("}".as_bytes()));
assert!(is_block_end("} ".as_bytes()));
assert!(is_block_end("}5".as_bytes()));
assert!(is_block_end("}5 ".as_bytes()));
assert!(!is_block_end("5"));
assert!(!is_block_end("5 + 5"));
assert!(!is_block_end("5} + 5"));
assert!(!is_block_end(" } + 5"));
assert!(!is_block_end("5".as_bytes()));
assert!(!is_block_end("5 + 5".as_bytes()));
assert!(!is_block_end("5} + 5".as_bytes()));
assert!(!is_block_end(" } + 5".as_bytes()));
}
#[test]
fn is_paran_start_test() {
assert!(is_paran_start("("));
assert!(is_paran_start("( "));
assert!(is_paran_start("(5"));
assert!(is_paran_start("(5 "));
assert!(is_paran_start("(5 + 5"));
assert!(is_paran_start("(5 + 5)"));
assert!(is_paran_start("(5 + 5) "));
fn is_paren_start_test() {
assert!(is_paren_start("(".as_bytes()));
assert!(is_paren_start("( ".as_bytes()));
assert!(is_paren_start("(5".as_bytes()));
assert!(is_paren_start("(5 ".as_bytes()));
assert!(is_paren_start("(5 + 5".as_bytes()));
assert!(is_paren_start("(5 + 5)".as_bytes()));
assert!(is_paren_start("(5 + 5) ".as_bytes()));
assert!(!is_paran_start("5"));
assert!(!is_paran_start("5 + 5"));
assert!(!is_paran_start("5( + 5)"));
assert!(!is_paran_start(" ( + 5)"));
assert!(!is_paren_start("5".as_bytes()));
assert!(!is_paren_start("5 + 5".as_bytes()));
assert!(!is_paren_start("5( + 5)".as_bytes()));
assert!(!is_paren_start(" ( + 5)".as_bytes()));
}
#[test]
fn is_paran_end_test() {
assert!(is_paran_end(")"));
assert!(is_paran_end(") "));
assert!(is_paran_end(")5"));
assert!(is_paran_end(")5 "));
fn is_paren_end_test() {
assert!(is_paren_end(")".as_bytes()));
assert!(is_paren_end(") ".as_bytes()));
assert!(is_paren_end(")5".as_bytes()));
assert!(is_paren_end(")5 ".as_bytes()));
assert!(!is_paran_end("5"));
assert!(!is_paran_end("5 + 5"));
assert!(!is_paran_end("5) + 5"));
assert!(!is_paran_end(" ) + 5"));
assert!(!is_paren_end("5".as_bytes()));
assert!(!is_paren_end("5 + 5".as_bytes()));
assert!(!is_paren_end("5) + 5".as_bytes()));
assert!(!is_paren_end(" ) + 5".as_bytes()));
}
#[test]
fn is_comma_test() {
assert!(is_comma(","));
assert!(is_comma(", "));
assert!(is_comma(",5"));
assert!(is_comma(",5 "));
assert!(is_comma(",".as_bytes()));
assert!(is_comma(", ".as_bytes()));
assert!(is_comma(",5".as_bytes()));
assert!(is_comma(",5 ".as_bytes()));
assert!(!is_comma("5"));
assert!(!is_comma("5 + 5"));
assert!(!is_comma("5, + 5"));
assert!(!is_comma(" , + 5"));
assert!(!is_comma("5".as_bytes()));
assert!(!is_comma("5 + 5".as_bytes()));
assert!(!is_comma("5, + 5".as_bytes()));
assert!(!is_comma(" , + 5".as_bytes()));
}
#[test]
fn is_line_comment_test() {
assert!(is_line_comment("//"));
assert!(is_line_comment("// "));
assert!(is_line_comment("//5"));
assert!(is_line_comment("//5 "));
assert!(is_line_comment("//".as_bytes()));
assert!(is_line_comment("// ".as_bytes()));
assert!(is_line_comment("//5".as_bytes()));
assert!(is_line_comment("//5 ".as_bytes()));
assert!(!is_line_comment("5"));
assert!(!is_line_comment("5 + 5"));
assert!(!is_line_comment("5// + 5"));
assert!(!is_line_comment(" // + 5"));
assert!(!is_line_comment("5".as_bytes()));
assert!(!is_line_comment("5 + 5".as_bytes()));
assert!(!is_line_comment("5// + 5".as_bytes()));
assert!(!is_line_comment(" // + 5".as_bytes()));
}
#[test]
fn is_block_comment_test() {
assert!(is_block_comment("/* */"));
assert!(is_block_comment("/***/"));
assert!(is_block_comment("/*5*/"));
assert!(is_block_comment("/*5 */"));
assert!(is_block_comment("/* */".as_bytes()));
assert!(is_block_comment("/***/".as_bytes()));
assert!(is_block_comment("/*5*/".as_bytes()));
assert!(is_block_comment("/*5 */".as_bytes()));
assert!(!is_block_comment("/*"));
assert!(!is_block_comment("5"));
assert!(!is_block_comment("5 + 5"));
assert!(!is_block_comment("5/* + 5"));
assert!(!is_block_comment(" /* + 5"));
assert!(!is_block_comment("/*".as_bytes()));
assert!(!is_block_comment("5".as_bytes()));
assert!(!is_block_comment("5 + 5".as_bytes()));
assert!(!is_block_comment("5/* + 5".as_bytes()));
assert!(!is_block_comment(" /* + 5".as_bytes()));
assert!(!is_block_comment(
r#" /* and
here
*/
"#
.as_bytes()
));
}
@@ -597,7 +602,7 @@ mod tests {
#[test]
fn return_token_at_index_test() {
assert_eq!(
return_token_at_index("const", 0),
return_token_at_index("const".as_bytes(), 0),
Some(Token {
token_type: TokenType::Keyword,
value: "const".to_string(),
@@ -606,7 +611,7 @@ mod tests {
})
);
assert_eq!(
return_token_at_index(" 4554", 2),
return_token_at_index("4554".as_bytes(), 2),
Some(Token {
token_type: TokenType::Number,
value: "4554".to_string(),
@@ -717,7 +722,7 @@ mod tests {
// We have this as a test so we can ensure it never panics with an unwrap in the server.
#[test]
fn test_token_type_to_semantic_token_type() {
let semantic_types = TokenType::to_semantic_token_types().unwrap();
let semantic_types = TokenType::all_semantic_token_types().unwrap();
assert!(!semantic_types.is_empty());
}

View File

@@ -149,7 +149,7 @@ pub async fn lsp_run(config: ServerConfig) -> Result<(), JsValue> {
let stdlib_signatures = get_signatures_from_stdlib(&stdlib).map_err(|e| e.to_string())?;
// We can unwrap here because we know the tokeniser is valid, since
// we have a test for it.
let token_types = kcl_lib::tokeniser::TokenType::to_semantic_token_types().unwrap();
let token_types = kcl_lib::tokeniser::TokenType::all_semantic_token_types().unwrap();
let (service, socket) = LspService::new(|client| Backend {
client,