KCL: Support non-ASCII identifiers (#7525)

Both humans and LLMs want to write KCL code in non-English languages. This is important, and we should support it. Note that errors are currently a bit broken when non-ASCII identifiers are involved; see #4327.
2025-06-19 09:10:21 -05:00
parent 9eaacc2a51
commit 9dd6e3e852
12 changed files with 655 additions and 11 deletions
--- a/rust/kcl-lib/src/parsing/token/tokeniser.rs
+++ b/rust/kcl-lib/src/parsing/token/tokeniser.rs
@@ -6,7 +6,7 @@ use winnow::{
    error::{ContextError, ParseError},
    prelude::*,
    stream::{Location, Stream},
-    token::{any, none_of, one_of, take_till, take_until},
+    token::{any, none_of, take_till, take_until, take_while},
    LocatingSlice, Stateful,
 };

@@ -163,8 +163,8 @@ fn whitespace(i: &mut Input<'_>) -> ModalResult<Token> {
 }

 fn inner_word(i: &mut Input<'_>) -> ModalResult<()> {
-    one_of(('a'..='z', 'A'..='Z', '_')).parse_next(i)?;
-    repeat::<_, _, (), _, _>(0.., one_of(('a'..='z', 'A'..='Z', '0'..='9', '_'))).parse_next(i)?;
+    take_while(1.., |c: char| c.is_alphabetic() || c == '_').parse_next(i)?;
+    take_while(0.., |c: char| c.is_alphabetic() || c.is_ascii_digit() || c == '_').parse_next(i)?;
    Ok(())
 }

@@ -786,6 +786,7 @@ const things = "things"
        };
        assert_eq!(actual.tokens[0], expected);
    }
+
    #[test]
    fn test_word_starting_with_keyword() {
        let module_id = ModuleId::default();
@@ -799,4 +800,18 @@ const things = "things"
        };
        assert_eq!(actual.tokens[0], expected);
    }
+
+    #[test]
+    fn non_english_identifiers() {
+        let module_id = ModuleId::default();
+        let actual = lex("亞當", module_id).unwrap();
+        let expected = Token {
+            token_type: TokenType::Word,
+            value: "亞當".to_owned(),
+            start: 0,
+            end: 6,
+            module_id,
+        };
+        assert_eq!(actual.tokens[0], expected);
+    }
 }