KCL: Support non-ASCII identifiers (#7525)
Both humans and LLMs want to write KCL code in non-English languages. This is important and we should support it. Note that errors are currently a bit broken with non-ASCII identifiers; see #4327.
This commit is contained in:
@ -6,7 +6,7 @@ use winnow::{
|
||||
error::{ContextError, ParseError},
|
||||
prelude::*,
|
||||
stream::{Location, Stream},
|
||||
token::{any, none_of, one_of, take_till, take_until},
|
||||
token::{any, none_of, take_till, take_until, take_while},
|
||||
LocatingSlice, Stateful,
|
||||
};
|
||||
|
||||
@ -163,8 +163,8 @@ fn whitespace(i: &mut Input<'_>) -> ModalResult<Token> {
|
||||
}
|
||||
|
||||
// Consumes one identifier-shaped word from the input and discards the matched
// text, returning `Ok(())` on success.
// NOTE(review): this is a diff hunk rendered without +/- markers. The
// `one_of` / `repeat` pair below appears to be the removed ASCII-only matcher
// and the two `take_while` calls its replacement (the import hunk drops
// `one_of` and adds `take_while`) -- confirm against the actual file.
fn inner_word(i: &mut Input<'_>) -> ModalResult<()> {
|
||||
// ASCII-only form: first character must be an ASCII letter or underscore.
one_of(('a'..='z', 'A'..='Z', '_')).parse_next(i)?;
|
||||
// ASCII-only form: zero or more ASCII letters, ASCII digits, or underscores.
repeat::<_, _, (), _, _>(0.., one_of(('a'..='z', 'A'..='Z', '0'..='9', '_'))).parse_next(i)?;
|
||||
// Unicode-aware form: a leading run of at least one alphabetic character
// (per `char::is_alphabetic`) or underscore, so a word cannot start with a digit.
take_while(1.., |c: char| c.is_alphabetic() || c == '_').parse_next(i)?;
|
||||
// Then any further alphabetic characters, underscores, or ASCII digits.
// Only ASCII digits are accepted here; other Unicode digits end the word.
take_while(0.., |c: char| c.is_alphabetic() || c.is_ascii_digit() || c == '_').parse_next(i)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -786,6 +786,7 @@ const things = "things"
|
||||
};
|
||||
assert_eq!(actual.tokens[0], expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_word_starting_with_keyword() {
|
||||
let module_id = ModuleId::default();
|
||||
@ -799,4 +800,18 @@ const things = "things"
|
||||
};
|
||||
assert_eq!(actual.tokens[0], expected);
|
||||
}
|
||||
|
||||
// Checks that a word made entirely of non-ASCII (CJK) characters is lexed as a
// single `TokenType::Word` token.
#[test]
|
||||
fn non_english_identifiers() {
|
||||
let module_id = ModuleId::default();
|
||||
let actual = lex("亞當", module_id).unwrap();
|
||||
let expected = Token {
|
||||
token_type: TokenType::Word,
|
||||
value: "亞當".to_owned(),
|
||||
start: 0,
|
||||
// 6 matches the UTF-8 byte length of the input (two characters, three
// bytes each), not its character count of 2.
|
||||
end: 6,
|
||||
module_id,
|
||||
};
|
||||
// Only the first token is compared; any later tokens are not inspected.
|
||||
assert_eq!(actual.tokens[0], expected);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user