Tokenizing fallibility (#883)

Tokenization tracks invalid tokens and produces a nice error about them

---------

Co-authored-by: Adam Chalmers <adam.chalmers@kittycad.io>
This commit is contained in:
Alfredo Gutierrez
2023-11-01 17:20:49 -05:00
committed by GitHub
parent 3d0c5c10b0
commit 2e419907e6
6 changed files with 113 additions and 15 deletions

View File

@ -18,6 +18,13 @@ export class KCLError {
}
}
/**
 * Error raised when the tokenizer encounters lexemes it cannot classify.
 *
 * @param msg - human-readable description of the unknown token(s)
 * @param sourceRanges - [start, end] offsets of each offending token
 */
export class KCLLexicalError extends KCLError {
  constructor(msg: string, sourceRanges: [number, number][]) {
    super('lexical', msg, sourceRanges)
    // Restore the prototype chain so `instanceof KCLLexicalError` works when
    // compiling to ES5 (TypeScript's documented caveat for extending Error).
    // Bug fix: this previously pointed at KCLSyntaxError.prototype, which made
    // lexical errors masquerade as syntax errors under instanceof checks.
    Object.setPrototypeOf(this, KCLLexicalError.prototype)
  }
}
export class KCLSyntaxError extends KCLError {
constructor(msg: string, sourceRanges: [number, number][]) {
super('syntax', msg, sourceRanges)

View File

@ -8,6 +8,8 @@ use crate::executor::SourceRange;
#[ts(export)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum KclError {
#[error("lexical: {0:?}")]
Lexical(KclErrorDetails),
#[error("syntax: {0:?}")]
Syntax(KclErrorDetails),
#[error("semantic: {0:?}")]
@ -41,6 +43,7 @@ impl KclError {
/// Get the error message, line and column from the error and input code.
pub fn get_message_line_column(&self, input: &str) -> (String, Option<usize>, Option<usize>) {
let (type_, source_range, message) = match &self {
KclError::Lexical(e) => ("lexical", e.source_ranges.clone(), e.message.clone()),
KclError::Syntax(e) => ("syntax", e.source_ranges.clone(), e.message.clone()),
KclError::Semantic(e) => ("semantic", e.source_ranges.clone(), e.message.clone()),
KclError::Type(e) => ("type", e.source_ranges.clone(), e.message.clone()),
@ -67,6 +70,7 @@ impl KclError {
pub fn source_ranges(&self) -> Vec<SourceRange> {
match &self {
KclError::Lexical(e) => e.source_ranges.clone(),
KclError::Syntax(e) => e.source_ranges.clone(),
KclError::Semantic(e) => e.source_ranges.clone(),
KclError::Type(e) => e.source_ranges.clone(),
@ -82,6 +86,7 @@ impl KclError {
/// Get the inner error message.
pub fn message(&self) -> &str {
match &self {
KclError::Lexical(e) => &e.message,
KclError::Syntax(e) => &e.message,
KclError::Semantic(e) => &e.message,
KclError::Type(e) => &e.message,

View File

@ -1,4 +1,10 @@
use crate::{ast::types::Program, errors::KclError, token::Token};
use crate::{
ast::types::Program,
errors::KclError,
errors::KclErrorDetails,
executor::SourceRange,
token::{Token, TokenType},
};
mod math;
pub(crate) mod parser_impl;
@ -8,15 +14,37 @@ pub const PIPE_OPERATOR: &str = "|>";
/// KCL parser state: the token stream produced by the lexer, with any
/// unrecognized lexemes separated out so `ast()` can report them up front.
pub struct Parser {
/// Tokens the parser understands (everything whose type is not `TokenType::Unknown`).
pub tokens: Vec<Token>,
/// Tokens the lexer could not classify; when non-empty, `ast()` returns a
/// `KclError::Lexical` listing them instead of attempting to parse.
pub unknown_tokens: Vec<Token>,
}
impl Parser {
pub fn new(tokens: Vec<Token>) -> Self {
Self { tokens }
let (tokens, unknown_tokens): (Vec<Token>, Vec<Token>) = tokens
.into_iter()
.partition(|token| token.token_type != TokenType::Unknown);
Self { tokens, unknown_tokens }
}
/// Run the parser
pub fn ast(&self) -> Result<Program, KclError> {
if self.tokens.is_empty() {
return Err(KclError::Syntax(KclErrorDetails {
source_ranges: vec![],
message: "file is empty".to_string(),
}));
}
if !self.unknown_tokens.is_empty() {
let source_ranges = self.unknown_tokens.iter().map(SourceRange::from).collect();
return Err(KclError::Lexical(KclErrorDetails {
source_ranges,
message: format!(
"found unknown tokens {:?}",
self.unknown_tokens.iter().map(|t| t.value.as_str()).collect::<Vec<_>>()
),
}));
}
parser_impl::run_parser(&mut self.tokens.as_slice())
}
}

View File

@ -34,13 +34,6 @@ lazy_static::lazy_static! {
type TokenSlice<'slice, 'input> = &'slice mut &'input [Token];
pub fn run_parser(i: TokenSlice) -> Result<Program, KclError> {
if i.is_empty() {
return Err(KclError::Syntax(KclErrorDetails {
source_ranges: vec![],
message: "file is empty".to_string(),
}));
}
program.parse(i).map_err(KclError::from)
}
@ -2223,7 +2216,7 @@ const secondExtrude = startSketchOn('XY')
let err = parser.ast().unwrap_err();
// TODO: Better errors when program cannot tokenize.
// https://github.com/KittyCAD/modeling-app/issues/696
assert!(err.to_string().contains("file is empty"));
assert!(err.to_string().contains("found list of unknown tokens"));
}
#[test]
@ -2283,7 +2276,7 @@ z(-[["#,
// https://github.com/KittyCAD/modeling-app/issues/696
assert_eq!(
result.err().unwrap().to_string(),
r#"syntax: KclErrorDetails { source_ranges: [], message: "file is empty" }"#
r##"lexical: KclErrorDetails { source_ranges: [SourceRange([6, 7])], message: "found list of unknown tokens \"#\"" }"##
);
}
@ -2297,7 +2290,7 @@ z(-[["#,
// https://github.com/KittyCAD/modeling-app/issues/696
assert_eq!(
result.err().unwrap().to_string(),
r#"syntax: KclErrorDetails { source_ranges: [], message: "file is empty" }"#
r##"lexical: KclErrorDetails { source_ranges: [SourceRange([25, 26]), SourceRange([26, 27])], message: "found list of unknown tokens \"# #\"" }"##
);
}

View File

@ -45,6 +45,8 @@ pub enum TokenType {
BlockComment,
/// A function name.
Function,
/// Unknown lexemes.
Unknown,
}
/// Most KCL tokens correspond to LSP semantic tokens (but not all).
@ -65,7 +67,8 @@ impl TryFrom<TokenType> for SemanticTokenType {
| TokenType::Comma
| TokenType::Colon
| TokenType::Period
| TokenType::DoublePeriod => {
| TokenType::DoublePeriod
| TokenType::Unknown => {
anyhow::bail!("unsupported token type: {:?}", token_type)
}
})

View File

@ -3,6 +3,7 @@ use winnow::{
combinator::{alt, opt, peek, preceded, repeat, terminated},
error::{ContextError, ParseError},
prelude::*,
stream::{Location, Stream},
token::{any, none_of, one_of, take_till1, take_until0},
Located,
};
@ -14,7 +15,7 @@ pub fn lexer(i: &str) -> Result<Vec<Token>, ParseError<Located<&str>, ContextErr
}
pub fn token(i: &mut Located<&str>) -> PResult<Token> {
winnow::combinator::dispatch! {peek(any);
match winnow::combinator::dispatch! {peek(any);
'"' | '\'' => string,
'/' => alt((line_comment, block_comment, operator)),
'{' | '(' | '[' => brace_start,
@ -27,6 +28,21 @@ pub fn token(i: &mut Located<&str>) -> PResult<Token> {
_ => alt((operator, keyword, word))
}
.parse_next(i)
{
Ok(token) => Ok(token),
Err(x) => {
// TODO: Handle non ascii cases
if i.len() == 0 || !i.is_ascii() {
return Err(x);
}
Ok(Token::from_range(
i.location()..i.location() + 1,
TokenType::Unknown,
i.next_slice(1).to_string(),
))
}
}
}
fn block_comment(i: &mut Located<&str>) -> PResult<Token> {
@ -234,6 +250,14 @@ mod tests {
}
fn assert_tokens(expected: Vec<Token>, actual: Vec<Token>) {
assert_eq!(
expected.len(),
actual.len(),
"\nexpected {} tokens, actually got {}",
expected.len(),
actual.len()
);
let n = expected.len();
for i in 0..n {
assert_eq!(
@ -242,7 +266,6 @@ mod tests {
expected[i], actual[i],
)
}
assert_eq!(n, actual.len(), "expected {} tokens, actually got {}", n, actual.len());
}
#[test]
@ -1461,4 +1484,43 @@ const things = "things"
];
assert_tokens(expected, actual);
}
#[test]
fn test_unrecognized_token() {
    // Lexing must not abort on an unrecognized character (`;` here): it is
    // emitted as a `TokenType::Unknown` token while the surrounding input
    // lexes normally.
    let actual = lexer("12 ; 8").unwrap();
    // Expected stream expressed as (type, text, start, end) rows and expanded
    // into `Token`s, so the fixture reads as a compact table.
    let expected: Vec<Token> = [
        (TokenType::Number, "12", 0, 2),
        (TokenType::Whitespace, " ", 2, 3),
        (TokenType::Unknown, ";", 3, 4),
        (TokenType::Whitespace, " ", 4, 5),
        (TokenType::Number, "8", 5, 6),
    ]
    .into_iter()
    .map(|(token_type, value, start, end)| Token {
        token_type,
        value: value.to_string(),
        start,
        end,
    })
    .collect();
    assert_tokens(expected, actual);
}
}