UTF-8 vs UTF-16 mismatch in KCL diagnostic SourceRanges (#7537)

# Background

The KCL interpreter (written in Rust) reports errors, lints and other diagnostics as UTF-8 source range offsets. JavaScript, including CodeMirror, represents source code as UTF-16 strings. 

# Problem

This means the UTF-8 source ranges sent from Rust don't correspond to the same text in the JS UTF-16 GUI. At best, this means the source ranges highlight the wrong thing if you use non-ASCII characters. At worst, it can cause exceptions by trying to highlight a range that doesn't even exist.

Here's the problem, on main: 

<img width="178" alt="Screenshot 2025-06-20 at 11 52 03 AM" src="https://github.com/user-attachments/assets/9a4e75bf-965f-49d8-b238-8868b35912a1" />

# Solution

We define a KCL SourceRange as _always_ being UTF-8. This means if a source range is constructed in JS, it should be converted to UTF-8. When these ranges are converted into CodeMirror diagnostics, they should be converted to UTF-16.

Here's the same code as above, but on this branch:

<img width="170" alt="Screenshot 2025-06-20 at 11 50 55 AM" src="https://github.com/user-attachments/assets/a5b971c0-0b02-4acd-8fcf-5a133331682b" />

Closes https://github.com/KittyCAD/modeling-app/issues/4327
This commit is contained in:
Adam Chalmers
2025-06-20 14:04:49 -05:00
committed by GitHub
parent 416d0b37a2
commit 53d6613d0d
4 changed files with 105 additions and 33 deletions

View File

@ -111,7 +111,8 @@ commaSep1NoTrailingComma<term> { term ("," term)* }
PipeSubstitution { "%" }
identifier { (@asciiLetter | "_") (@asciiLetter | @digit | "_")* }
// Includes non-whitespace unicode characters.
identifier { $[a-zA-Z_\u{a1}-\u{167f}\u{1681}-\u{1fff}\u{200e}-\u{2027}\u{202a}-\u{202e}\u{2030}-\u{205e}\u{2061}-\u{2fff}\u{3001}-\u{fefe}\u{ff00}-\u{10ffff}] $[a-zA-Z0-9_\u{a1}-\u{167f}\u{1681}-\u{1fff}\u{200e}-\u{2027}\u{202a}-\u{202e}\u{2030}-\u{205e}\u{2061}-\u{2fff}\u{3001}-\u{fefe}\u{ff00}-\u{10ffff}]* }
AnnotationName { "@" identifier? }
PropertyName { identifier }
TagDeclarator { "$" identifier }

View File

@ -389,7 +389,7 @@ export class KclManager extends EventTarget {
if (err(result)) {
const kclError: KCLError = result as KCLError
this.diagnostics = kclErrorsToDiagnostics([kclError])
this.diagnostics = kclErrorsToDiagnostics([kclError], code)
this._astParseFailed = true
await this.checkIfSwitchedFilesShouldClear()
@ -403,8 +403,8 @@ export class KclManager extends EventTarget {
this._kclErrorsCallBack([])
this._logsCallBack([])
this.addDiagnostics(compilationErrorsToDiagnostics(result.errors))
this.addDiagnostics(compilationErrorsToDiagnostics(result.warnings))
this.addDiagnostics(compilationErrorsToDiagnostics(result.errors, code))
this.addDiagnostics(compilationErrorsToDiagnostics(result.warnings, code))
if (result.errors.length > 0) {
this._astParseFailed = true
@ -486,11 +486,16 @@ export class KclManager extends EventTarget {
this.logs = logs
this.errors = errors
const code = this.singletons.codeManager.code
// Do not add the errors since the program was interrupted and the error is not a real KCL error
this.addDiagnostics(isInterrupted ? [] : kclErrorsToDiagnostics(errors))
this.addDiagnostics(
isInterrupted ? [] : kclErrorsToDiagnostics(errors, code)
)
// Add warnings and non-fatal errors
this.addDiagnostics(
isInterrupted ? [] : compilationErrorsToDiagnostics(execState.errors)
isInterrupted
? []
: compilationErrorsToDiagnostics(execState.errors, code)
)
this.execState = execState
if (!errors.length) {

View File

@ -1,8 +1,35 @@
import type { KCLError } from '@src/lang/errors'
import { kclErrorsToDiagnostics } from '@src/lang/errors'
import { kclErrorsToDiagnostics, toUtf16, toUtf8 } from '@src/lang/errors'
import { defaultArtifactGraph } from '@src/lang/std/artifactGraph'
import { topLevelRange } from '@src/lang/util'
describe('test UTF conversions', () => {
it('Converts UTF-8 to UTF-16', () => {
// This KCL program has an error. The variable `亞當` cannot be +3 because
// it holds a string. So that variable, on line 2, should be highlighted by
// a source range.
const sourceCode = "亞當 = 'adam'\nx = 亞當 + 3"
// Start with a SourceRange from the KCL interpreter,
// which is a UTF-8 range, on where the variable is used on the second line.
const utf8SourceRange = [20, 26, 0]
// JS string of the program uses UTF-16, so check we can correctly find the
// source range offset in UTF-16.
const actualStart = toUtf16(utf8SourceRange[0], sourceCode)
const actualEnd = toUtf16(utf8SourceRange[1], sourceCode)
const textInSourceRange = sourceCode.slice(actualStart, actualEnd)
expect(actualStart).toBe(16)
expect(actualEnd).toBe(18)
expect(textInSourceRange).toBe('亞當')
// Test we can convert the UTF-16 source range back to UTF-8,
// getting the original source range back.
const utf16Range: [number, number, number] = [actualStart, actualEnd, 0]
const actualUtf8Range = toUtf8(utf16Range, sourceCode)
expect(actualUtf8Range).toStrictEqual(utf8SourceRange)
})
})
describe('test kclErrToDiagnostic', () => {
it('converts KCL errors to CodeMirror diagnostics', () => {
const errors: KCLError[] = [
@ -33,7 +60,7 @@ describe('test kclErrToDiagnostic', () => {
defaultPlanes: null,
},
]
const diagnostics = kclErrorsToDiagnostics(errors)
const diagnostics = kclErrorsToDiagnostics(errors, 'TEST PROGRAM')
expect(diagnostics).toEqual([
{
from: 0,

View File

@ -290,6 +290,38 @@ export class KCLUndefinedValueError extends KCLError {
}
}
/**
Convert this UTF-16 source range offset to UTF-8 as SourceRange is always a UTF-8
*/
export function toUtf8(
utf16SourceRange: SourceRange,
sourceCode: string
): SourceRange {
const moduleId = utf16SourceRange[2]
const textEncoder = new TextEncoder()
const prefixUtf16 = sourceCode.slice(0, utf16SourceRange[0])
const prefixUtf8 = textEncoder.encode(prefixUtf16)
const prefixLen = prefixUtf8.length
const toHighlightUtf16 = sourceCode.slice(
utf16SourceRange[0],
utf16SourceRange[1]
)
const toHighlightUtf8 = textEncoder.encode(toHighlightUtf16)
const toHighlightLen = toHighlightUtf8.length
return [prefixLen, prefixLen + toHighlightLen, moduleId]
}
/**
Convert this UTF-8 source range offset to UTF-16 for display in CodeMirror,
as it relies on JS-style string encoding which is UTF-16.
*/
export function toUtf16(utf8Offset: number, sourceCode: string): number {
const sourceUtf8 = new TextEncoder().encode(sourceCode)
const prefix = sourceUtf8.slice(0, utf8Offset)
const backTo16 = new TextDecoder().decode(prefix)
return backTo16.length
}
/**
* Maps the lsp diagnostic to an array of KclErrors.
* Currently the diagnostics are all errors, but in the future they could include lints.
@ -299,20 +331,23 @@ export function lspDiagnosticsToKclErrors(
diagnostics: LspDiagnostic[]
): KCLError[] {
return diagnostics
.flatMap(
({ range, message }) =>
new KCLError(
'unexpected',
message,
[posToOffset(doc, range.start)!, posToOffset(doc, range.end)!, 0],
[],
[],
[],
defaultArtifactGraph(),
{},
null
)
)
.flatMap(({ range, message }) => {
const sourceRange = toUtf8(
[posToOffset(doc, range.start)!, posToOffset(doc, range.end)!, 0],
doc.toString()
)
return new KCLError(
'unexpected',
message,
sourceRange,
[],
[],
[],
defaultArtifactGraph(),
{},
null
)
})
.sort((a, b) => {
const c = a.sourceRange[0]
const d = b.sourceRange[0]
@ -331,7 +366,8 @@ export function lspDiagnosticsToKclErrors(
* Currently the diagnostics are all errors, but in the future they could include lints.
* */
export function kclErrorsToDiagnostics(
errors: KCLError[]
errors: KCLError[],
sourceCode: string
): CodeMirrorDiagnostic[] {
let nonFatal: CodeMirrorDiagnostic[] = []
const errs = errors
@ -350,8 +386,8 @@ export function kclErrorsToDiagnostics(
item.sourceRange[1] !== err.sourceRange[1]
) {
diagnostics.push({
from: item.sourceRange[0],
to: item.sourceRange[1],
from: toUtf16(item.sourceRange[0], sourceCode),
to: toUtf16(item.sourceRange[1], sourceCode),
message: 'Part of the error backtrace',
severity: 'hint',
})
@ -365,11 +401,13 @@ export function kclErrorsToDiagnostics(
}
}
if (err.nonFatal.length > 0) {
nonFatal = nonFatal.concat(compilationErrorsToDiagnostics(err.nonFatal))
nonFatal = nonFatal.concat(
compilationErrorsToDiagnostics(err.nonFatal, sourceCode)
)
}
diagnostics.push({
from: err.sourceRange[0],
to: err.sourceRange[1],
from: toUtf16(err.sourceRange[0], sourceCode),
to: toUtf16(err.sourceRange[1], sourceCode),
message,
severity: 'error',
})
@ -379,7 +417,8 @@ export function kclErrorsToDiagnostics(
}
export function compilationErrorsToDiagnostics(
errors: CompilationError[]
errors: CompilationError[],
sourceCode: string
): CodeMirrorDiagnostic[] {
return errors
?.filter((err) => isTopLevelModule(err.sourceRange))
@ -397,8 +436,8 @@ export function compilationErrorsToDiagnostics(
apply: (view: EditorView, from: number, to: number) => {
view.dispatch({
changes: {
from: suggestion.source_range[0],
to: suggestion.source_range[1],
from: toUtf16(suggestion.source_range[0], sourceCode),
to: toUtf16(suggestion.source_range[1], sourceCode),
insert: suggestion.insert,
},
annotations: [lspCodeActionEvent],
@ -408,8 +447,8 @@ export function compilationErrorsToDiagnostics(
]
}
return {
from: err.sourceRange[0],
to: err.sourceRange[1],
from: toUtf16(err.sourceRange[0], sourceCode),
to: toUtf16(err.sourceRange[1], sourceCode),
message: err.message,
severity,
actions,