Tokenizing fallibility (#883)

Tokenization tracks invalid tokens and produces a nice error about them

---------

Co-authored-by: Adam Chalmers <adam.chalmers@kittycad.io>
This commit is contained in:
Alfredo Gutierrez
2023-11-01 17:20:49 -05:00
committed by GitHub
parent 3d0c5c10b0
commit 2e419907e6
6 changed files with 113 additions and 15 deletions

View File

@ -18,6 +18,13 @@ export class KCLError {
}
}
/**
 * Error raised when the tokenizer encounters lexemes it cannot classify.
 *
 * @param msg - human-readable description of the unknown token(s)
 * @param sourceRanges - [start, end] offsets of each offending token
 */
export class KCLLexicalError extends KCLError {
  constructor(msg: string, sourceRanges: [number, number][]) {
    super('lexical', msg, sourceRanges)
    // Restore the prototype chain so `instanceof KCLLexicalError` works when
    // compiling to ES5 (TypeScript's documented caveat for extending Error).
    // Bug fix: this previously pointed at KCLSyntaxError.prototype, which made
    // lexical errors masquerade as syntax errors under instanceof checks.
    Object.setPrototypeOf(this, KCLLexicalError.prototype)
  }
}
export class KCLSyntaxError extends KCLError {
constructor(msg: string, sourceRanges: [number, number][]) {
super('syntax', msg, sourceRanges)

View File

@ -8,6 +8,8 @@ use crate::executor::SourceRange;
#[ts(export)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum KclError {
#[error("lexical: {0:?}")]
Lexical(KclErrorDetails),
#[error("syntax: {0:?}")]
Syntax(KclErrorDetails),
#[error("semantic: {0:?}")]
@ -41,6 +43,7 @@ impl KclError {
/// Get the error message, line and column from the error and input code.
pub fn get_message_line_column(&self, input: &str) -> (String, Option<usize>, Option<usize>) {
let (type_, source_range, message) = match &self {
KclError::Lexical(e) => ("lexical", e.source_ranges.clone(), e.message.clone()),
KclError::Syntax(e) => ("syntax", e.source_ranges.clone(), e.message.clone()),
KclError::Semantic(e) => ("semantic", e.source_ranges.clone(), e.message.clone()),
KclError::Type(e) => ("type", e.source_ranges.clone(), e.message.clone()),
@ -67,6 +70,7 @@ impl KclError {
pub fn source_ranges(&self) -> Vec<SourceRange> {
match &self {
KclError::Lexical(e) => e.source_ranges.clone(),
KclError::Syntax(e) => e.source_ranges.clone(),
KclError::Semantic(e) => e.source_ranges.clone(),
KclError::Type(e) => e.source_ranges.clone(),
@ -82,6 +86,7 @@ impl KclError {
/// Get the inner error message.
pub fn message(&self) -> &str {
match &self {
KclError::Lexical(e) => &e.message,
KclError::Syntax(e) => &e.message,
KclError::Semantic(e) => &e.message,
KclError::Type(e) => &e.message,

View File

@ -1,4 +1,10 @@
use crate::{ast::types::Program, errors::KclError, token::Token};
use crate::{
ast::types::Program,
errors::KclError,
errors::KclErrorDetails,
executor::SourceRange,
token::{Token, TokenType},
};
mod math;
pub(crate) mod parser_impl;
@ -8,15 +14,37 @@ pub const PIPE_OPERATOR: &str = "|>";
/// KCL parser state: the token stream produced by the lexer, with any
/// unrecognized lexemes separated out so `ast()` can report them up front.
pub struct Parser {
/// Tokens the parser understands (everything whose type is not `TokenType::Unknown`).
pub tokens: Vec<Token>,
/// Tokens the lexer could not classify; when non-empty, `ast()` returns a
/// `KclError::Lexical` listing them instead of attempting to parse.
pub unknown_tokens: Vec<Token>,
}
impl Parser {
pub fn new(tokens: Vec<Token>) -> Self {
Self { tokens }
let (tokens, unknown_tokens): (Vec<Token>, Vec<Token>) = tokens
.into_iter()
.partition(|token| token.token_type != TokenType::Unknown);
Self { tokens, unknown_tokens }
}
/// Run the parser
pub fn ast(&self) -> Result<Program, KclError> {
if self.tokens.is_empty() {
return Err(KclError::Syntax(KclErrorDetails {
source_ranges: vec![],
message: "file is empty".to_string(),
}));
}
if !self.unknown_tokens.is_empty() {
let source_ranges = self.unknown_tokens.iter().map(SourceRange::from).collect();
return Err(KclError::Lexical(KclErrorDetails {
source_ranges,
message: format!(
"found unknown tokens {:?}",
self.unknown_tokens.iter().map(|t| t.value.as_str()).collect::<Vec<_>>()
),
}));
}
parser_impl::run_parser(&mut self.tokens.as_slice())
}
}

View File

@ -34,13 +34,6 @@ lazy_static::lazy_static! {
type TokenSlice<'slice, 'input> = &'slice mut &'input [Token];
pub fn run_parser(i: TokenSlice) -> Result<Program, KclError> {
if i.is_empty() {
return Err(KclError::Syntax(KclErrorDetails {
source_ranges: vec![],
message: "file is empty".to_string(),
}));
}
program.parse(i).map_err(KclError::from)
}
@ -2223,7 +2216,7 @@ const secondExtrude = startSketchOn('XY')
let err = parser.ast().unwrap_err();
// TODO: Better errors when program cannot tokenize.
// https://github.com/KittyCAD/modeling-app/issues/696
assert!(err.to_string().contains("file is empty"));
assert!(err.to_string().contains("found list of unknown tokens"));
}
#[test]
@ -2283,7 +2276,7 @@ z(-[["#,
// https://github.com/KittyCAD/modeling-app/issues/696
assert_eq!(
result.err().unwrap().to_string(),
r#"syntax: KclErrorDetails { source_ranges: [], message: "file is empty" }"#
r##"lexical: KclErrorDetails { source_ranges: [SourceRange([6, 7])], message: "found list of unknown tokens \"#\"" }"##
);
}
@ -2297,7 +2290,7 @@ z(-[["#,
// https://github.com/KittyCAD/modeling-app/issues/696
assert_eq!(
result.err().unwrap().to_string(),
r#"syntax: KclErrorDetails { source_ranges: [], message: "file is empty" }"#
r##"lexical: KclErrorDetails { source_ranges: [SourceRange([25, 26]), SourceRange([26, 27])], message: "found list of unknown tokens \"# #\"" }"##
);
}

View File

@ -45,6 +45,8 @@ pub enum TokenType {
BlockComment,
/// A function name.
Function,
/// Unknown lexemes.
Unknown,
}
/// Most KCL tokens correspond to LSP semantic tokens (but not all).
@ -65,7 +67,8 @@ impl TryFrom<TokenType> for SemanticTokenType {
| TokenType::Comma
| TokenType::Colon
| TokenType::Period
| TokenType::DoublePeriod => {
| TokenType::DoublePeriod
| TokenType::Unknown => {
anyhow::bail!("unsupported token type: {:?}", token_type)
}
})

View File

@ -3,6 +3,7 @@ use winnow::{
combinator::{alt, opt, peek, preceded, repeat, terminated},
error::{ContextError, ParseError},
prelude::*,
stream::{Location, Stream},
token::{any, none_of, one_of, take_till1, take_until0},
Located,
};
@ -14,7 +15,7 @@ pub fn lexer(i: &str) -> Result<Vec<Token>, ParseError<Located<&str>, ContextErr
}
pub fn token(i: &mut Located<&str>) -> PResult<Token> {
winnow::combinator::dispatch! {peek(any);
match winnow::combinator::dispatch! {peek(any);
'"' | '\'' => string,
'/' => alt((line_comment, block_comment, operator)),
'{' | '(' | '[' => brace_start,
@ -27,6 +28,21 @@ pub fn token(i: &mut Located<&str>) -> PResult<Token> {
_ => alt((operator, keyword, word))
}
.parse_next(i)
{
Ok(token) => Ok(token),
Err(x) => {
// TODO: Handle non ascii cases
if i.len() == 0 || !i.is_ascii() {
return Err(x);
}
Ok(Token::from_range(
i.location()..i.location() + 1,
TokenType::Unknown,
i.next_slice(1).to_string(),
))
}
}
}
fn block_comment(i: &mut Located<&str>) -> PResult<Token> {
@ -234,6 +250,14 @@ mod tests {
}
fn assert_tokens(expected: Vec<Token>, actual: Vec<Token>) {
assert_eq!(
expected.len(),
actual.len(),
"\nexpected {} tokens, actually got {}",
expected.len(),
actual.len()
);
let n = expected.len();
for i in 0..n {
assert_eq!(
@ -242,7 +266,6 @@ mod tests {
expected[i], actual[i],
)
}
assert_eq!(n, actual.len(), "expected {} tokens, actually got {}", n, actual.len());
}
#[test]
@ -1461,4 +1484,43 @@ const things = "things"
];
assert_tokens(expected, actual);
}
#[test]
fn test_unrecognized_token() {
    // Lexing must not abort on an unrecognized character (`;` here): it is
    // emitted as a `TokenType::Unknown` token while the surrounding input
    // lexes normally.
    let actual = lexer("12 ; 8").unwrap();
    // Expected stream expressed as (type, text, start, end) rows and expanded
    // into `Token`s, so the fixture reads as a compact table.
    let expected: Vec<Token> = [
        (TokenType::Number, "12", 0, 2),
        (TokenType::Whitespace, " ", 2, 3),
        (TokenType::Unknown, ";", 3, 4),
        (TokenType::Whitespace, " ", 4, 5),
        (TokenType::Number, "8", 5, 6),
    ]
    .into_iter()
    .map(|(token_type, value, start, end)| Token {
        token_type,
        value: value.to_string(),
        start,
        end,
    })
    .collect();
    assert_tokens(expected, actual);
}
}