Refactor TokenStream (and some minor changes to Token) (#4695)

* Refactor TokenStream (and some minor changes to Token)

Signed-off-by: Nick Cameron <nrc@ncameron.org>

* Tidy up lexer tests

Signed-off-by: Nick Cameron <nrc@ncameron.org>

---------

Signed-off-by: Nick Cameron <nrc@ncameron.org>
This commit is contained in:
Nick Cameron
2024-12-10 14:26:53 +13:00
committed by GitHub
parent 6aa588f09f
commit c943a3f192
10 changed files with 1625 additions and 1839 deletions
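
For orientation, a crate-internal sketch of the reshaped pipeline, pieced together from the hunks below: lexing now goes through lex (formerly lexer) and yields a TokenStream, and the winnow parsers consume a borrowed TokenSlice view instead of a &[Token]. The helper name and sample signature are illustrative; the return type of parse_errs_as_err is inferred from Program::parse_no_errs in the first hunk.

use crate::{
    errors::KclError,
    parsing::{
        self,
        ast::types::{Node, Program},
    },
    ModuleId,
};

fn parse_sample(src: &str) -> Result<Node<Program>, KclError> {
    let module_id = ModuleId::default();
    // `lex` (formerly `lexer`) now returns a TokenStream rather than a Vec<Token>.
    let tokens = parsing::token::lex(src, module_id)?;
    // `parse_tokens` takes the stream by value so it can drain unknown tokens in place;
    // the parsers then run over `tokens.as_slice()`, a borrowed TokenSlice view.
    parsing::parse_tokens(tokens).parse_errs_as_err()
}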

View File

@ -137,7 +137,7 @@ pub use lsp::test_util::kcl_lsp_server;
impl Program {
pub fn parse(input: &str) -> Result<(Option<Program>, Vec<CompilationError>), KclError> {
let module_id = ModuleId::default();
let tokens = parsing::token::lexer(input, module_id)?;
let tokens = parsing::token::lex(input, module_id)?;
let (ast, errs) = parsing::parse_tokens(tokens).0?;
Ok((ast.map(|ast| Program { ast }), errs))
@ -145,7 +145,7 @@ impl Program {
pub fn parse_no_errs(input: &str) -> Result<Program, KclError> {
let module_id = ModuleId::default();
let tokens = parsing::token::lexer(input, module_id)?;
let tokens = parsing::token::lex(input, module_id)?;
let ast = parsing::parse_tokens(tokens).parse_errs_as_err()?;
Ok(Program { ast })

View File

@ -46,34 +46,31 @@ use crate::{
lsp::{backend::Backend as _, util::IntoDiagnostic},
parsing::{
ast::types::{Expr, Node, VariableKind},
token::TokenType,
token::TokenStream,
PIPE_OPERATOR,
},
CacheInformation, ModuleId, OldAstState, Program, SourceRange,
};
lazy_static::lazy_static! {
pub static ref SEMANTIC_TOKEN_TYPES: Vec<SemanticTokenType> = {
// This is safe to unwrap because we know all the token types are valid.
// And the test would fail if they were not.
let mut gen = TokenType::all_semantic_token_types().unwrap();
gen.extend(vec![
const SEMANTIC_TOKEN_TYPES: [SemanticTokenType; 10] = [
SemanticTokenType::NUMBER,
SemanticTokenType::VARIABLE,
SemanticTokenType::KEYWORD,
SemanticTokenType::TYPE,
SemanticTokenType::STRING,
SemanticTokenType::OPERATOR,
SemanticTokenType::COMMENT,
SemanticTokenType::FUNCTION,
SemanticTokenType::PARAMETER,
SemanticTokenType::PROPERTY,
]);
gen
};
];
pub static ref SEMANTIC_TOKEN_MODIFIERS: Vec<SemanticTokenModifier> = {
vec![
const SEMANTIC_TOKEN_MODIFIERS: [SemanticTokenModifier; 5] = [
SemanticTokenModifier::DECLARATION,
SemanticTokenModifier::DEFINITION,
SemanticTokenModifier::DEFAULT_LIBRARY,
SemanticTokenModifier::READONLY,
SemanticTokenModifier::STATIC,
]
};
}
];
/// A subcommand for running the server.
#[derive(Clone, Debug)]
@ -102,7 +99,7 @@ pub struct Backend {
/// The stdlib signatures for the language.
pub stdlib_signatures: HashMap<String, SignatureHelp>,
/// Token maps.
pub token_map: DashMap<String, Vec<crate::parsing::token::Token>>,
pub(super) token_map: DashMap<String, TokenStream>,
/// AST maps.
pub ast_map: DashMap<String, Node<crate::parsing::ast::types::Program>>,
/// Last successful execution.
@ -281,7 +278,7 @@ impl crate::lsp::backend::Backend for Backend {
// Lets update the tokens.
let module_id = ModuleId::default();
let tokens = match crate::parsing::token::lexer(&params.text, module_id) {
let tokens = match crate::parsing::token::lex(&params.text, module_id) {
Ok(tokens) => tokens,
Err(err) => {
self.add_to_diagnostics(&params, &[err], true).await;
@ -407,11 +404,11 @@ impl Backend {
self.executor_ctx.read().await
}
async fn update_semantic_tokens(&self, tokens: &[crate::parsing::token::Token], params: &TextDocumentItem) {
async fn update_semantic_tokens(&self, tokens: &TokenStream, params: &TextDocumentItem) {
// Update the semantic tokens map.
let mut semantic_tokens = vec![];
let mut last_position = Position::new(0, 0);
for token in tokens {
for token in tokens.as_slice() {
let Ok(token_type) = SemanticTokenType::try_from(token.token_type) else {
// We continue here because not all tokens can be converted this way, we will get
// the rest from the ast.
@ -563,7 +560,7 @@ impl Backend {
let semantic_token = SemanticToken {
delta_line: position.line - last_position.line + 1,
delta_start: 0,
length: token.value.len() as u32,
length: (token.end - token.start) as u32,
token_type: token_type_index,
token_modifiers_bitset,
};
@ -582,7 +579,7 @@ impl Backend {
} else {
position.character - last_position.character
},
length: token.value.len() as u32,
length: (token.end - token.start) as u32,
token_type: token_type_index,
token_modifiers_bitset,
};
@ -963,8 +960,8 @@ impl LanguageServer for Backend {
semantic_tokens_options: SemanticTokensOptions {
work_done_progress_options: WorkDoneProgressOptions::default(),
legend: SemanticTokensLegend {
token_types: SEMANTIC_TOKEN_TYPES.clone(),
token_modifiers: SEMANTIC_TOKEN_MODIFIERS.clone(),
token_types: SEMANTIC_TOKEN_TYPES.to_vec(),
token_modifiers: SEMANTIC_TOKEN_MODIFIERS.to_vec(),
},
range: Some(false),
full: Some(SemanticTokensFullOptions::Bool(true)),
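
The loop in update_semantic_tokens now walks a TokenSlice borrowed from the stored TokenStream and sizes each token from its source offsets; Token::value becomes pub(super) in the token.rs hunk further down, which is presumably why the length is no longer taken from token.value.len(). A trimmed-down sketch of the loop body, assuming tokens: &TokenStream as in the signature above (delta_line/delta_start bookkeeping elided):

for token in tokens.as_slice() {
    let Ok(token_type) = SemanticTokenType::try_from(token.token_type) else {
        // Not every lexer token maps to a semantic token; the AST pass fills in the rest.
        continue;
    };
    // Byte length from the public start/end offsets rather than token.value.len().
    let length = (token.end - token.start) as u32;
    // ... build the SemanticToken with `length` and `token_type` as in the hunk above ...
    let _ = (length, token_type);
}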

View File

@ -1082,7 +1082,7 @@ fn myFn = (param1) => {
// Get the token map.
let token_map = server.token_map.get("file:///test.kcl").unwrap().clone();
assert!(token_map != vec![]);
assert!(!token_map.is_empty());
// Get the ast.
let ast = server.ast_map.get("file:///test.kcl").unwrap().clone();
@ -2206,7 +2206,7 @@ part001 = cube([0,0], 20)
// Get the tokens.
let tokens = server.token_map.get("file:///test.kcl").unwrap().clone();
assert_eq!(tokens.len(), 120);
assert_eq!(tokens.as_slice().len(), 120);
// Get the ast.
let ast = server.ast_map.get("file:///test.kcl").unwrap().clone();
@ -3379,11 +3379,11 @@ part001 = startSketchOn('XY')
// Get the symbols map.
let symbols_map = server.symbols_map.get("file:///test.kcl").unwrap().clone();
assert!(symbols_map != vec![]);
assert!(!symbols_map.is_empty());
// Get the semantic tokens map.
let semantic_tokens_map = server.semantic_tokens_map.get("file:///test.kcl").unwrap().clone();
assert!(semantic_tokens_map != vec![]);
assert!(!semantic_tokens_map.is_empty());
// Get the memory.
let memory = server.memory_map.get("file:///test.kcl").unwrap().clone();
@ -3422,7 +3422,7 @@ NEW_LINT = 1"#
// Get the semantic tokens map.
let semantic_tokens_map = server.semantic_tokens_map.get("file:///test.kcl").unwrap().clone();
assert!(semantic_tokens_map != vec![]);
assert!(!semantic_tokens_map.is_empty());
// Get the memory.
let memory = server.memory_map.get("file:///test.kcl");
@ -3466,7 +3466,7 @@ part001 = startSketchOn('XY')
// Get the token map.
let token_map = server.token_map.get("file:///test.kcl").unwrap().clone();
assert!(token_map != vec![]);
assert!(!token_map.is_empty());
// Get the ast.
let ast = server.ast_map.get("file:///test.kcl").unwrap().clone();
@ -3474,11 +3474,11 @@ part001 = startSketchOn('XY')
// Get the symbols map.
let symbols_map = server.symbols_map.get("file:///test.kcl").unwrap().clone();
assert!(symbols_map != vec![]);
assert!(!symbols_map.is_empty());
// Get the semantic tokens map.
let semantic_tokens_map = server.semantic_tokens_map.get("file:///test.kcl").unwrap().clone();
assert!(semantic_tokens_map != vec![]);
assert!(!semantic_tokens_map.is_empty());
// Get the memory.
let memory = server.memory_map.get("file:///test.kcl").unwrap().clone();
@ -3509,7 +3509,7 @@ part001 = startSketchOn('XY')
// Get the token map.
let token_map = server.token_map.get("file:///test.kcl").unwrap().clone();
assert!(token_map != vec![]);
assert!(!token_map.is_empty());
// Get the ast.
let ast = server.ast_map.get("file:///test.kcl").unwrap().clone();
@ -3517,11 +3517,11 @@ part001 = startSketchOn('XY')
// Get the symbols map.
let symbols_map = server.symbols_map.get("file:///test.kcl").unwrap().clone();
assert!(symbols_map != vec![]);
assert!(!symbols_map.is_empty());
// Get the semantic tokens map.
let semantic_tokens_map = server.semantic_tokens_map.get("file:///test.kcl").unwrap().clone();
assert!(semantic_tokens_map != vec![]);
assert!(!semantic_tokens_map.is_empty());
// Get the memory.
let memory = server.memory_map.get("file:///test.kcl");

View File

@ -2,7 +2,7 @@ use crate::{
errors::{CompilationError, KclError, KclErrorDetails},
parsing::{
ast::types::{Node, Program},
token::{Token, TokenType},
token::TokenStream,
},
source_range::{ModuleId, SourceRange},
};
@ -34,15 +34,13 @@ pub fn top_level_parse(code: &str) -> ParseResult {
/// Parse the given KCL code into an AST.
pub fn parse_str(code: &str, module_id: ModuleId) -> ParseResult {
let tokens = pr_try!(crate::parsing::token::lexer(code, module_id));
let tokens = pr_try!(crate::parsing::token::lex(code, module_id));
parse_tokens(tokens)
}
/// Parse the supplied tokens into an AST.
pub fn parse_tokens(tokens: Vec<Token>) -> ParseResult {
let (tokens, unknown_tokens): (Vec<Token>, Vec<Token>) = tokens
.into_iter()
.partition(|token| token.token_type != TokenType::Unknown);
pub fn parse_tokens(mut tokens: TokenStream) -> ParseResult {
let unknown_tokens = tokens.remove_unknown();
if !unknown_tokens.is_empty() {
let source_ranges = unknown_tokens.iter().map(SourceRange::from).collect();
@ -69,7 +67,7 @@ pub fn parse_tokens(tokens: Vec<Token>) -> ParseResult {
return Node::<Program>::default().into();
}
parser::run_parser(&mut tokens.as_slice())
parser::run_parser(tokens.as_slice())
}
/// Result of parsing.

File diff suppressed because it is too large

View File

@ -1,28 +1,221 @@
use std::str::FromStr;
// Clippy does not agree with rustc here for some reason.
#![allow(clippy::needless_lifetimes)]
use std::{fmt, iter::Enumerate, num::NonZeroUsize};
use anyhow::Result;
use parse_display::{Display, FromStr};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use parse_display::Display;
use tower_lsp::lsp_types::SemanticTokenType;
use winnow::{error::ParseError, stream::ContainsToken};
use winnow::{
self,
error::ParseError,
stream::{ContainsToken, Stream},
};
use crate::{
errors::KclError,
parsing::ast::types::{ItemVisibility, VariableKind},
source_range::{ModuleId, SourceRange},
};
use tokeniser::Input;
mod tokeniser;
// Re-export
pub use tokeniser::Input;
#[cfg(test)]
pub(crate) use tokeniser::RESERVED_WORDS;
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct TokenStream {
tokens: Vec<Token>,
}
impl TokenStream {
fn new(tokens: Vec<Token>) -> Self {
Self { tokens }
}
pub(super) fn remove_unknown(&mut self) -> Vec<Token> {
let tokens = std::mem::take(&mut self.tokens);
let (tokens, unknown_tokens): (Vec<Token>, Vec<Token>) = tokens
.into_iter()
.partition(|token| token.token_type != TokenType::Unknown);
self.tokens = tokens;
unknown_tokens
}
pub fn iter(&self) -> impl Iterator<Item = &Token> {
self.tokens.iter()
}
pub fn is_empty(&self) -> bool {
self.tokens.is_empty()
}
pub fn as_slice(&self) -> TokenSlice {
TokenSlice::from(self)
}
}
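
A short usage sketch of the new container, written as a hypothetical test inside the parsing::token module (TokenStream is pub(crate) and lex/ModuleId are in scope there); the KCL source is illustrative:

#[test]
fn token_stream_sketch() {
    let mut tokens = lex("x = 1 + 2", ModuleId::default()).unwrap();
    // Drain TokenType::Unknown tokens in place; parse_tokens does this before parsing.
    let unknown = tokens.remove_unknown();
    assert!(unknown.is_empty());
    assert!(!tokens.is_empty());
    // Borrow a TokenSlice view to hand to the winnow-based parsers.
    let slice = tokens.as_slice();
    assert_eq!(slice.len(), tokens.iter().count());
}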
impl<'a> From<&'a TokenStream> for TokenSlice<'a> {
fn from(stream: &'a TokenStream) -> Self {
TokenSlice {
start: 0,
end: stream.tokens.len(),
stream,
}
}
}
impl IntoIterator for TokenStream {
type Item = Token;
type IntoIter = std::vec::IntoIter<Token>;
fn into_iter(self) -> Self::IntoIter {
self.tokens.into_iter()
}
}
#[derive(Debug, Clone)]
pub(crate) struct TokenSlice<'a> {
stream: &'a TokenStream,
start: usize,
end: usize,
}
impl<'a> std::ops::Deref for TokenSlice<'a> {
type Target = [Token];
fn deref(&self) -> &Self::Target {
&self.stream.tokens[self.start..self.end]
}
}
impl<'a> TokenSlice<'a> {
pub fn token(&self, i: usize) -> &Token {
&self.stream.tokens[i + self.start]
}
pub fn iter(&self) -> impl Iterator<Item = &Token> {
(**self).iter()
}
pub fn without_ends(&self) -> Self {
Self {
start: self.start + 1,
end: self.end - 1,
stream: self.stream,
}
}
}
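
without_ends narrows the view by one token on each side, for example to skip a surrounding pair of delimiters (that use is an assumption based on the name). A tiny hypothetical test, again assuming in-module access:

#[test]
fn token_slice_without_ends_sketch() {
    // "(a)" lexes to at least three tokens: `(`, the word, `)`.
    let tokens = lex("(a)", ModuleId::default()).unwrap();
    let full = tokens.as_slice();
    let inner = full.without_ends(); // drops the first and last token of the view
    assert_eq!(inner.len(), full.len() - 2);
    // token(i) indexes relative to the slice's own start offset.
    assert_eq!(inner.token(0).start, full.token(1).start);
}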
impl<'a> IntoIterator for TokenSlice<'a> {
type Item = &'a Token;
type IntoIter = std::slice::Iter<'a, Token>;
fn into_iter(self) -> Self::IntoIter {
self.stream.tokens[self.start..self.end].iter()
}
}
impl<'a> Stream for TokenSlice<'a> {
type Token = Token;
type Slice = Self;
type IterOffsets = Enumerate<std::vec::IntoIter<Token>>;
type Checkpoint = Checkpoint;
fn iter_offsets(&self) -> Self::IterOffsets {
#[allow(clippy::unnecessary_to_owned)]
self.to_vec().into_iter().enumerate()
}
fn eof_offset(&self) -> usize {
self.len()
}
fn next_token(&mut self) -> Option<Self::Token> {
let token = self.first()?.clone();
self.start += 1;
Some(token)
}
fn offset_for<P>(&self, predicate: P) -> Option<usize>
where
P: Fn(Self::Token) -> bool,
{
self.iter().position(|b| predicate(b.clone()))
}
fn offset_at(&self, tokens: usize) -> Result<usize, winnow::error::Needed> {
if let Some(needed) = tokens.checked_sub(self.len()).and_then(NonZeroUsize::new) {
Err(winnow::error::Needed::Size(needed))
} else {
Ok(tokens)
}
}
fn next_slice(&mut self, offset: usize) -> Self::Slice {
assert!(self.start + offset <= self.end);
let next = TokenSlice {
stream: self.stream,
start: self.start,
end: self.start + offset,
};
self.start += offset;
next
}
fn checkpoint(&self) -> Self::Checkpoint {
Checkpoint(self.start, self.end)
}
fn reset(&mut self, checkpoint: &Self::Checkpoint) {
self.start = checkpoint.0;
self.end = checkpoint.1;
}
fn raw(&self) -> &dyn fmt::Debug {
self
}
}
impl<'a> winnow::stream::Offset for TokenSlice<'a> {
fn offset_from(&self, start: &Self) -> usize {
self.start - start.start
}
}
impl<'a> winnow::stream::Offset<Checkpoint> for TokenSlice<'a> {
fn offset_from(&self, start: &Checkpoint) -> usize {
self.start - start.0
}
}
impl winnow::stream::Offset for Checkpoint {
fn offset_from(&self, start: &Self) -> usize {
self.0 - start.0
}
}
impl<'a> winnow::stream::StreamIsPartial for TokenSlice<'a> {
type PartialState = ();
fn complete(&mut self) -> Self::PartialState {}
fn restore_partial(&mut self, _: Self::PartialState) {}
fn is_partial_supported() -> bool {
false
}
}
#[derive(Clone, Debug)]
pub struct Checkpoint(usize, usize);
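
The Stream implementation above is what lets winnow parsers consume a TokenSlice directly: next_token advances the window's start, and checkpoint/reset save and restore the (start, end) pair for backtracking. A hypothetical in-module test sketching that behaviour (not part of this diff; the source string is illustrative):

#[test]
fn token_slice_is_a_winnow_stream() {
    use winnow::stream::Stream as _;

    let tokens = lex("1 + 2", ModuleId::default()).unwrap();
    let mut slice = tokens.as_slice();
    let cp = slice.checkpoint();             // records the current (start, end)
    let first = slice.next_token().unwrap(); // advances the view by one token
    assert_eq!(first.token_type, TokenType::Number);
    slice.reset(&cp);                        // rewinds the view to the checkpoint
    assert_eq!(slice.eof_offset(), tokens.as_slice().len());
}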
/// The types of tokens.
#[derive(Debug, PartialEq, Eq, Copy, Clone, Deserialize, Serialize, JsonSchema, FromStr, Display)]
#[serde(rename_all = "camelCase")]
#[derive(Debug, PartialEq, Eq, Copy, Clone, Display)]
#[display(style = "camelCase")]
pub enum TokenType {
/// A number.
@ -73,6 +266,8 @@ pub enum TokenType {
impl TryFrom<TokenType> for SemanticTokenType {
type Error = anyhow::Error;
fn try_from(token_type: TokenType) -> Result<Self> {
// If you return a new kind of `SemanticTokenType`, make sure to update `SEMANTIC_TOKEN_TYPES`
// in the LSP implementation.
Ok(match token_type {
TokenType::Number => Self::NUMBER,
TokenType::Word => Self::VARIABLE,
@ -102,52 +297,6 @@ impl TryFrom<TokenType> for SemanticTokenType {
}
impl TokenType {
// This is for the lsp server.
// Don't call this function directly in the code use a lazy_static instead
// like we do in the lsp server.
pub fn all_semantic_token_types() -> Result<Vec<SemanticTokenType>> {
let mut settings = schemars::gen::SchemaSettings::openapi3();
settings.inline_subschemas = true;
let mut generator = schemars::gen::SchemaGenerator::new(settings);
let schema = TokenType::json_schema(&mut generator);
let schemars::schema::Schema::Object(o) = &schema else {
anyhow::bail!("expected object schema: {:#?}", schema);
};
let Some(subschemas) = &o.subschemas else {
anyhow::bail!("expected subschemas: {:#?}", schema);
};
let Some(one_ofs) = &subschemas.one_of else {
anyhow::bail!("expected one_of: {:#?}", schema);
};
let mut semantic_tokens = vec![];
for one_of in one_ofs {
let schemars::schema::Schema::Object(o) = one_of else {
anyhow::bail!("expected object one_of: {:#?}", one_of);
};
let Some(enum_values) = o.enum_values.as_ref() else {
anyhow::bail!("expected enum values: {:#?}", o);
};
if enum_values.len() > 1 {
anyhow::bail!("expected only one enum value: {:#?}", o);
}
if enum_values.is_empty() {
anyhow::bail!("expected at least one enum value: {:#?}", o);
}
let label = TokenType::from_str(&enum_values[0].to_string().replace('"', ""))?;
if let Ok(semantic_token_type) = SemanticTokenType::try_from(label) {
semantic_tokens.push(semantic_token_type);
}
}
Ok(semantic_tokens)
}
pub fn is_whitespace(&self) -> bool {
matches!(self, Self::Whitespace)
}
@ -157,17 +306,15 @@ impl TokenType {
}
}
#[derive(Debug, PartialEq, Eq, Deserialize, Serialize, Clone)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token {
#[serde(rename = "type")]
pub token_type: TokenType,
/// Offset in the source code where this token begins.
pub start: usize,
/// Offset in the source code where this token ends.
pub end: usize,
#[serde(default, skip_serializing_if = "ModuleId::is_top_level")]
pub module_id: ModuleId,
pub value: String,
pub(super) module_id: ModuleId,
pub(super) value: String,
}
impl ContainsToken<Token> for (TokenType, &str) {
@ -249,7 +396,7 @@ impl From<&Token> for SourceRange {
}
}
pub fn lexer(s: &str, module_id: ModuleId) -> Result<Vec<Token>, KclError> {
pub fn lex(s: &str, module_id: ModuleId) -> Result<TokenStream, KclError> {
tokeniser::lex(s, module_id).map_err(From::from)
}
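
lex surfaces tokeniser failures as a KclError via the From<ParseError<Input, ContextError>> conversion in the next hunk. A minimal caller-side sketch mirroring the match in the LSP backend above; the helper name and reporting are illustrative:

fn lex_or_report(src: &str) -> Option<TokenStream> {
    match lex(src, ModuleId::default()) {
        Ok(stream) => Some(stream),
        Err(err) => {
            // A KclError built from the winnow ParseError (see the From impl below).
            eprintln!("lex failed: {err:?}");
            None
        }
    }
}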
@ -281,15 +428,3 @@ impl From<ParseError<Input<'_>, winnow::error::ContextError>> for KclError {
})
}
}
#[cfg(test)]
mod tests {
use super::*;
// We have this as a test so we can ensure it never panics with an unwrap in the server.
#[test]
fn test_token_type_to_semantic_token_type() {
let semantic_types = TokenType::all_semantic_token_types().unwrap();
assert!(!semantic_types.is_empty());
}
}

File diff suppressed because it is too large

View File

@ -47,7 +47,7 @@ fn read(filename: &'static str, test_name: &str) -> String {
fn parse(test_name: &str) {
let input = read("input.kcl", test_name);
let tokens = crate::parsing::token::lexer(&input, ModuleId::default()).unwrap();
let tokens = crate::parsing::token::lex(&input, ModuleId::default()).unwrap();
// Parse the tokens into an AST.
let parse_res = Result::<_, KclError>::Ok(crate::parsing::parse_tokens(tokens).unwrap());

View File

@ -2137,8 +2137,10 @@ fn f() {
.into_iter()
.enumerate()
{
let tokens = crate::parsing::token::lexer(raw, ModuleId::default()).unwrap();
let literal = crate::parsing::parser::unsigned_number_literal.parse(&tokens).unwrap();
let tokens = crate::parsing::token::lex(raw, ModuleId::default()).unwrap();
let literal = crate::parsing::parser::unsigned_number_literal
.parse(tokens.as_slice())
.unwrap();
assert_eq!(
literal.recast(),
expected,
@ -2216,9 +2218,9 @@ sketch002 = startSketchOn({
.into_iter()
.enumerate()
{
let tokens = crate::parsing::token::lexer(input, ModuleId::default()).unwrap();
crate::parsing::parser::print_tokens(&tokens);
let expr = crate::parsing::parser::object.parse(&tokens).unwrap();
let tokens = crate::parsing::token::lex(input, ModuleId::default()).unwrap();
crate::parsing::parser::print_tokens(tokens.as_slice());
let expr = crate::parsing::parser::object.parse(tokens.as_slice()).unwrap();
assert_eq!(
expr.recast(&FormatOptions::new(), 0, ExprContext::Other),
expected,
@ -2314,8 +2316,10 @@ sketch002 = startSketchOn({
.into_iter()
.enumerate()
{
let tokens = crate::parsing::token::lexer(input, ModuleId::default()).unwrap();
let expr = crate::parsing::parser::array_elem_by_elem.parse(&tokens).unwrap();
let tokens = crate::parsing::token::lex(input, ModuleId::default()).unwrap();
let expr = crate::parsing::parser::array_elem_by_elem
.parse(tokens.as_slice())
.unwrap();
assert_eq!(
expr.recast(&FormatOptions::new(), 0, ExprContext::Other),
expected,