UTF-8 vs UTF-16 mismatch in KCL diagnostic SourceRanges (#7537)
# Background The KCL interpreter (written in Rust) reports errors, lints and other diagnostics as UTF-8 source range offsets. JavaScript, including CodeMirror, represents source code as UTF-16 strings. # Problem This means the UTF-8 source ranges sent from Rust don't correspond to the same text in the JS UTF-16 GUI. At best, this means the source ranges highlight the wrong thing if you use non-ASCII characters. At worst, it can cause exceptions by trying to highlight a range that doesn't even exist. Here's the problem, on main: <img width="178" alt="Screenshot 2025-06-20 at 11 52 03 AM" src="https://github.com/user-attachments/assets/9a4e75bf-965f-49d8-b238-8868b35912a1" /> # Solution We define a KCL SourceRange as _always_ being UTF-8. This means if a source range is constructed in JS, it should be converted to UTF-8. When these ranges are converted into CodeMirror diagnostics, they should be converted to UTF-16. Here's the same code as above, but on this branch: <img width="170" alt="Screenshot 2025-06-20 at 11 50 55 AM" src="https://github.com/user-attachments/assets/a5b971c0-0b02-4acd-8fcf-5a133331682b" /> Closes https://github.com/KittyCAD/modeling-app/issues/4327
This commit is contained in:
@ -111,7 +111,8 @@ commaSep1NoTrailingComma<term> { term ("," term)* }
|
||||
|
||||
PipeSubstitution { "%" }
|
||||
|
||||
identifier { (@asciiLetter | "_") (@asciiLetter | @digit | "_")* }
|
||||
// Includes non-whitespace unicode characters.
|
||||
identifier { $[a-zA-Z_\u{a1}-\u{167f}\u{1681}-\u{1fff}\u{200e}-\u{2027}\u{202a}-\u{202e}\u{2030}-\u{205e}\u{2061}-\u{2fff}\u{3001}-\u{fefe}\u{ff00}-\u{10ffff}] $[a-zA-Z0-9_\u{a1}-\u{167f}\u{1681}-\u{1fff}\u{200e}-\u{2027}\u{202a}-\u{202e}\u{2030}-\u{205e}\u{2061}-\u{2fff}\u{3001}-\u{fefe}\u{ff00}-\u{10ffff}]* }
|
||||
AnnotationName { "@" identifier? }
|
||||
PropertyName { identifier }
|
||||
TagDeclarator { "$" identifier }
|
||||
|
@ -389,7 +389,7 @@ export class KclManager extends EventTarget {
|
||||
|
||||
if (err(result)) {
|
||||
const kclError: KCLError = result as KCLError
|
||||
this.diagnostics = kclErrorsToDiagnostics([kclError])
|
||||
this.diagnostics = kclErrorsToDiagnostics([kclError], code)
|
||||
this._astParseFailed = true
|
||||
|
||||
await this.checkIfSwitchedFilesShouldClear()
|
||||
@ -403,8 +403,8 @@ export class KclManager extends EventTarget {
|
||||
this._kclErrorsCallBack([])
|
||||
this._logsCallBack([])
|
||||
|
||||
this.addDiagnostics(compilationErrorsToDiagnostics(result.errors))
|
||||
this.addDiagnostics(compilationErrorsToDiagnostics(result.warnings))
|
||||
this.addDiagnostics(compilationErrorsToDiagnostics(result.errors, code))
|
||||
this.addDiagnostics(compilationErrorsToDiagnostics(result.warnings, code))
|
||||
if (result.errors.length > 0) {
|
||||
this._astParseFailed = true
|
||||
|
||||
@ -486,11 +486,16 @@ export class KclManager extends EventTarget {
|
||||
|
||||
this.logs = logs
|
||||
this.errors = errors
|
||||
const code = this.singletons.codeManager.code
|
||||
// Do not add the errors since the program was interrupted and the error is not a real KCL error
|
||||
this.addDiagnostics(isInterrupted ? [] : kclErrorsToDiagnostics(errors))
|
||||
this.addDiagnostics(
|
||||
isInterrupted ? [] : kclErrorsToDiagnostics(errors, code)
|
||||
)
|
||||
// Add warnings and non-fatal errors
|
||||
this.addDiagnostics(
|
||||
isInterrupted ? [] : compilationErrorsToDiagnostics(execState.errors)
|
||||
isInterrupted
|
||||
? []
|
||||
: compilationErrorsToDiagnostics(execState.errors, code)
|
||||
)
|
||||
this.execState = execState
|
||||
if (!errors.length) {
|
||||
|
@ -1,8 +1,35 @@
|
||||
import type { KCLError } from '@src/lang/errors'
|
||||
import { kclErrorsToDiagnostics } from '@src/lang/errors'
|
||||
import { kclErrorsToDiagnostics, toUtf16, toUtf8 } from '@src/lang/errors'
|
||||
import { defaultArtifactGraph } from '@src/lang/std/artifactGraph'
|
||||
import { topLevelRange } from '@src/lang/util'
|
||||
|
||||
describe('test UTF conversions', () => {
|
||||
it('Converts UTF-8 to UTF-16', () => {
|
||||
// This KCL program has an error. The variable `亞當` cannot be +3 because
|
||||
// it holds a string. So that variable, on line 2, should be highlighted by
|
||||
// a source range.
|
||||
const sourceCode = "亞當 = 'adam'\nx = 亞當 + 3"
|
||||
// Start with a SourceRange from the KCL interpreter,
|
||||
// which is a UTF-8 range, on where the variable is used on the second line.
|
||||
const utf8SourceRange = [20, 26, 0]
|
||||
|
||||
// JS string of the program uses UTF-16, so check we can correctly find the
|
||||
// source range offset in UTF-16.
|
||||
const actualStart = toUtf16(utf8SourceRange[0], sourceCode)
|
||||
const actualEnd = toUtf16(utf8SourceRange[1], sourceCode)
|
||||
const textInSourceRange = sourceCode.slice(actualStart, actualEnd)
|
||||
expect(actualStart).toBe(16)
|
||||
expect(actualEnd).toBe(18)
|
||||
expect(textInSourceRange).toBe('亞當')
|
||||
|
||||
// Test we can convert the UTF-16 source range back to UTF-8,
|
||||
// getting the original source range back.
|
||||
const utf16Range: [number, number, number] = [actualStart, actualEnd, 0]
|
||||
const actualUtf8Range = toUtf8(utf16Range, sourceCode)
|
||||
expect(actualUtf8Range).toStrictEqual(utf8SourceRange)
|
||||
})
|
||||
})
|
||||
|
||||
describe('test kclErrToDiagnostic', () => {
|
||||
it('converts KCL errors to CodeMirror diagnostics', () => {
|
||||
const errors: KCLError[] = [
|
||||
@ -33,7 +60,7 @@ describe('test kclErrToDiagnostic', () => {
|
||||
defaultPlanes: null,
|
||||
},
|
||||
]
|
||||
const diagnostics = kclErrorsToDiagnostics(errors)
|
||||
const diagnostics = kclErrorsToDiagnostics(errors, 'TEST PROGRAM')
|
||||
expect(diagnostics).toEqual([
|
||||
{
|
||||
from: 0,
|
||||
|
@ -290,6 +290,38 @@ export class KCLUndefinedValueError extends KCLError {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
Convert this UTF-16 source range offset to UTF-8 as SourceRange is always a UTF-8
|
||||
*/
|
||||
export function toUtf8(
|
||||
utf16SourceRange: SourceRange,
|
||||
sourceCode: string
|
||||
): SourceRange {
|
||||
const moduleId = utf16SourceRange[2]
|
||||
const textEncoder = new TextEncoder()
|
||||
const prefixUtf16 = sourceCode.slice(0, utf16SourceRange[0])
|
||||
const prefixUtf8 = textEncoder.encode(prefixUtf16)
|
||||
const prefixLen = prefixUtf8.length
|
||||
const toHighlightUtf16 = sourceCode.slice(
|
||||
utf16SourceRange[0],
|
||||
utf16SourceRange[1]
|
||||
)
|
||||
const toHighlightUtf8 = textEncoder.encode(toHighlightUtf16)
|
||||
const toHighlightLen = toHighlightUtf8.length
|
||||
return [prefixLen, prefixLen + toHighlightLen, moduleId]
|
||||
}
|
||||
|
||||
/**
|
||||
Convert this UTF-8 source range offset to UTF-16 for display in CodeMirror,
|
||||
as it relies on JS-style string encoding which is UTF-16.
|
||||
*/
|
||||
export function toUtf16(utf8Offset: number, sourceCode: string): number {
|
||||
const sourceUtf8 = new TextEncoder().encode(sourceCode)
|
||||
const prefix = sourceUtf8.slice(0, utf8Offset)
|
||||
const backTo16 = new TextDecoder().decode(prefix)
|
||||
return backTo16.length
|
||||
}
|
||||
|
||||
/**
|
||||
* Maps the lsp diagnostic to an array of KclErrors.
|
||||
* Currently the diagnostics are all errors, but in the future they could include lints.
|
||||
@ -299,20 +331,23 @@ export function lspDiagnosticsToKclErrors(
|
||||
diagnostics: LspDiagnostic[]
|
||||
): KCLError[] {
|
||||
return diagnostics
|
||||
.flatMap(
|
||||
({ range, message }) =>
|
||||
new KCLError(
|
||||
'unexpected',
|
||||
message,
|
||||
[posToOffset(doc, range.start)!, posToOffset(doc, range.end)!, 0],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
defaultArtifactGraph(),
|
||||
{},
|
||||
null
|
||||
)
|
||||
)
|
||||
.flatMap(({ range, message }) => {
|
||||
const sourceRange = toUtf8(
|
||||
[posToOffset(doc, range.start)!, posToOffset(doc, range.end)!, 0],
|
||||
doc.toString()
|
||||
)
|
||||
return new KCLError(
|
||||
'unexpected',
|
||||
message,
|
||||
sourceRange,
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
defaultArtifactGraph(),
|
||||
{},
|
||||
null
|
||||
)
|
||||
})
|
||||
.sort((a, b) => {
|
||||
const c = a.sourceRange[0]
|
||||
const d = b.sourceRange[0]
|
||||
@ -331,7 +366,8 @@ export function lspDiagnosticsToKclErrors(
|
||||
* Currently the diagnostics are all errors, but in the future they could include lints.
|
||||
* */
|
||||
export function kclErrorsToDiagnostics(
|
||||
errors: KCLError[]
|
||||
errors: KCLError[],
|
||||
sourceCode: string
|
||||
): CodeMirrorDiagnostic[] {
|
||||
let nonFatal: CodeMirrorDiagnostic[] = []
|
||||
const errs = errors
|
||||
@ -350,8 +386,8 @@ export function kclErrorsToDiagnostics(
|
||||
item.sourceRange[1] !== err.sourceRange[1]
|
||||
) {
|
||||
diagnostics.push({
|
||||
from: item.sourceRange[0],
|
||||
to: item.sourceRange[1],
|
||||
from: toUtf16(item.sourceRange[0], sourceCode),
|
||||
to: toUtf16(item.sourceRange[1], sourceCode),
|
||||
message: 'Part of the error backtrace',
|
||||
severity: 'hint',
|
||||
})
|
||||
@ -365,11 +401,13 @@ export function kclErrorsToDiagnostics(
|
||||
}
|
||||
}
|
||||
if (err.nonFatal.length > 0) {
|
||||
nonFatal = nonFatal.concat(compilationErrorsToDiagnostics(err.nonFatal))
|
||||
nonFatal = nonFatal.concat(
|
||||
compilationErrorsToDiagnostics(err.nonFatal, sourceCode)
|
||||
)
|
||||
}
|
||||
diagnostics.push({
|
||||
from: err.sourceRange[0],
|
||||
to: err.sourceRange[1],
|
||||
from: toUtf16(err.sourceRange[0], sourceCode),
|
||||
to: toUtf16(err.sourceRange[1], sourceCode),
|
||||
message,
|
||||
severity: 'error',
|
||||
})
|
||||
@ -379,7 +417,8 @@ export function kclErrorsToDiagnostics(
|
||||
}
|
||||
|
||||
export function compilationErrorsToDiagnostics(
|
||||
errors: CompilationError[]
|
||||
errors: CompilationError[],
|
||||
sourceCode: string
|
||||
): CodeMirrorDiagnostic[] {
|
||||
return errors
|
||||
?.filter((err) => isTopLevelModule(err.sourceRange))
|
||||
@ -397,8 +436,8 @@ export function compilationErrorsToDiagnostics(
|
||||
apply: (view: EditorView, from: number, to: number) => {
|
||||
view.dispatch({
|
||||
changes: {
|
||||
from: suggestion.source_range[0],
|
||||
to: suggestion.source_range[1],
|
||||
from: toUtf16(suggestion.source_range[0], sourceCode),
|
||||
to: toUtf16(suggestion.source_range[1], sourceCode),
|
||||
insert: suggestion.insert,
|
||||
},
|
||||
annotations: [lspCodeActionEvent],
|
||||
@ -408,8 +447,8 @@ export function compilationErrorsToDiagnostics(
|
||||
]
|
||||
}
|
||||
return {
|
||||
from: err.sourceRange[0],
|
||||
to: err.sourceRange[1],
|
||||
from: toUtf16(err.sourceRange[0], sourceCode),
|
||||
to: toUtf16(err.sourceRange[1], sourceCode),
|
||||
message: err.message,
|
||||
severity,
|
||||
actions,
|
||||
|
Reference in New Issue
Block a user