streetsidesoftware
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts
+4-4 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts
+4-4
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
+25-61 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
+25-61
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
+1-1 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
+1-1
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
-33 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
-33
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts
+5-2 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts
+5-2
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
+2-2 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
+2-2
@@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => {
         const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
         const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c));
         expect(indexes).toEqual(letters.map((c) => c.codePointAt(0)));
-        const r = charIndexBuilder.wordToUtf8Seq('abcdefghij');
-        expect(r).toEqual([...textEncoder.encode('abcdefghij')]);
-        expect(charIndexBuilder.size).toBe(11); // One extra for the empty string.
+        const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀');
+        expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]);
+        expect(charIndexBuilder.size).toBe(16); // One extra for the empty string.
 
         // Add the same letters again.
         expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0)));
 
         const charIndex = charIndexBuilder.build();
-        expect(charIndex.size).toBe(11);
+        expect(charIndex.size).toBe(16);
         expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]);
     });
 });
@@ -1,12 +1,12 @@
-import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
+import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
 
 export type Utf8Seq = Readonly<number[]>;
 
 export type CharIndexMap = Record<string, Utf8BE32>;
 
 export type RO_CharIndexMap = Readonly<CharIndexMap>;
 
-export type CharIndexSeqMap = Record<string, Utf8Seq | number>;
+export type CharIndexSeqMap = Record<string, Utf8Seq>;
 
 export type RO_CharIndexSeqMap = Readonly<CharIndexSeqMap>;
 
@@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
 Object.freeze(emptySeq);
 
 export class CharIndex {
-    readonly charToUtf8Map: RO_CharIndexMap;
-    readonly charToUtf8SeqMap: RO_CharIndexSeqMap;
+    #charToUtf8SeqMap: CharIndexSeqMap;
 
     #lastWord = '';
     #lastWordSeq: Utf8Seq = [];
+    #multiByteChars: boolean;
 
     constructor(readonly charIndex: readonly string[]) {
-        this.charToUtf8Map = buildCharIndexMap(charIndex);
-        this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map);
-    }
-
-    getUtf8Value(c: string): number {
-        return this.charToUtf8Map[c] || 0;
+        // Pre-encode every entry of `charIndex` so lookups for known characters are a plain map read.
+        this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex);
+        // Record once whether any pre-encoded entry is longer than a single element.
+        this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1);
     }
 
     getCharUtf8Seq(c: string): Utf8Seq {
-        const r = this.charToUtf8SeqMap[c] ?? emptySeq;
-        return typeof r === 'number' ? [r] : r;
-    }
-
-    __wordToUtf8Seq(word: string): Utf8Seq {
-        // Note: Array.flatMap is very slow
-        const seq: number[] = new Array(word.length);
-        let i = 0;
-        for (const c of word) {
-            const cSep = this.charToUtf8SeqMap[c];
-            if (typeof cSep === 'number') {
-                seq[i++] = cSep;
-                continue;
-            }
-            if (!cSep) {
-                seq[i++] = 0;
-                continue;
-            }
-            for (const cIdx of cSep) {
-                seq[i++] = cIdx;
-            }
-        }
-        if (seq.length !== i) seq.length = i;
-        return seq;
+        // Reuse the sequence already stored for this character, if there is one.
+        const found = this.#charToUtf8SeqMap[c];
+        if (found) return found;
+        // Not stored yet: encode it on demand and keep the result in the map for later lookups.
+        const s = encodeTextToUtf8(c);
+        this.#charToUtf8SeqMap[c] = s;
+        return s;
     }
 
     wordToUtf8Seq(word: string): Utf8Seq {
         if (this.#lastWord === word) return this.#lastWordSeq;
 
-        const seq = this.__wordToUtf8Seq(word);
+        const seq = encodeTextToUtf8(word);
 
         this.#lastWord = word;
         this.#lastWordSeq = seq;
@@ -69,7 +46,7 @@ export class CharIndex {
     }
 
     indexContainsMultiByteChars(): boolean {
-        return Object.values(this.charToUtf8Map).some((v) => v >= 0x80);
+        // The flag is computed once in the constructor from the initial map; in the code visible
+        // here, entries that getCharUtf8Seq stores later do not update it.
+        return this.#multiByteChars;
     }
 
     get size(): number {
@@ -81,22 +58,10 @@ export class CharIndex {
     }
 }
 
-function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap {
-    const map: CharIndexMap = Object.create(null);
-    for (const c of charIndex) {
-        const cn = c.normalize('NFC');
-        const utf8 = encodeUtf8N_BE(cn.codePointAt(0) || 0);
-        map[c] = utf8;
-        map[c.normalize('NFC')] = utf8;
-        map[c.normalize('NFD')] = utf8;
-    }
-    return map;
-}
-
-function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap {
+/**
+ * Build a lookup from each entry of `charIndex` to the sequence that
+ * `encodeTextToUtf8` returns for that entry.
+ *
+ * @param charIndex - the strings to pre-encode; each one becomes a key of the result.
+ * @returns a map created with a null prototype, so lookups never see inherited Object properties.
+ */
+function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
     const map: CharIndexSeqMap = Object.create(null);
-    for (const [key, value] of Object.entries(charIndexMap)) {
-        map[key] = splitUtf8IfNeeded(value);
+    for (const key of charIndex) {
+        map[key] = encodeTextToUtf8(key);
     }
     return map;
 }
@@ -106,7 +71,7 @@ export class CharIndexBuilder {
     readonly charIndexMap: CharIndexMap = Object.create(null);
     readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null);
 
-    readonly #mapIdxToSeq = new Map<number, number[] | number>();
+    readonly #mapIdxToSeq = new Map<number, number[]>();
 
     constructor() {
         this.getUtf8Value('');
@@ -126,24 +91,22 @@ export class CharIndexBuilder {
         return utf8;
     }
 
-    utf8ValueToUtf8Seq(idx: number): number[] | number {
+    utf8ValueToUtf8Seq(idx: number): number[] {
+        // Return the cached split for this packed value when one exists.
         const found = this.#mapIdxToSeq.get(idx);
         if (found !== undefined) {
             return found;
         }
-        const seq = splitUtf8IfNeeded(idx);
+        // First request for this value: split it and remember the array. Later calls return
+        // this same array instance.
+        const seq = splitUtf8(idx);
         this.#mapIdxToSeq.set(idx, seq);
         return seq;
     }
 
     charToUtf8Seq(c: string): number[] {
         const idx = this.getUtf8Value(c);
-        const s = this.utf8ValueToUtf8Seq(idx);
-        return typeof s === 'number' ? [s] : s;
+        // utf8ValueToUtf8Seq always returns an array, so its result can be returned as-is.
+        return this.utf8ValueToUtf8Seq(idx);
     }
 
     wordToUtf8Seq(word: string): number[] {
-        // word = word.normalize('NFC');
         const seq: number[] = new Array(word.length);
         let i = 0;
         for (const c of word) {
@@ -170,8 +133,9 @@ export class CharIndexBuilder {
     }
 }
 
-function splitUtf8IfNeeded(utf8: number): number | number[] {
-    if (utf8 < 0x80) return utf8;
-    const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
-    return s.length ? s : s[0];
+/**
+ * Split a packed big-endian UTF-8 value into its individual bytes.
+ *
+ * The input is first reinterpreted as an unsigned 32-bit integer. JavaScript bitwise
+ * operators yield signed 32-bit results, so a packed 4-byte sequence (top bit set) can
+ * arrive as a negative number; without the conversion it would satisfy the first range
+ * check and be returned as a single negative "byte".
+ *
+ * @param utf8 - up to four UTF-8 bytes packed into one number, most significant byte first.
+ * @returns the bytes in order, most significant first.
+ */
+function splitUtf8(utf8: number): number[] {
+    const v = utf8 >>> 0;
+    if (v <= 0xff) return [v];
+    if (v <= 0xffff) return [(v >> 8) & 0xff, v & 0xff];
+    if (v <= 0xff_ffff) return [(v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff];
+    // Zero bytes are still dropped here, as before; a valid multi-byte UTF-8 sequence has none.
+    return [(v >>> 24) & 0xff, (v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff].filter((b) => b);
+}
@@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData {
 
     static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
         return new FastTrieBlobIRoot(
-            new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted),
+            new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo),
             0,
             trie.info,
         );
 
@@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
             for (let i = 0; i < utf8Seq.length; ++i) {
                 insertCharIndexes(utf8Seq[i], pDepth);
             }
-            // dumpState({ step: 'insertChar', char });
         };
 
         /**
@@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
             const pos = s.pos;
             const node = nodes[nodeIdx];
             node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);
-
-            // dumpState({ step: 'reference', refId, refNodeIdx });
         };
 
         const backStep = (num: number) => {
@@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
                 depth = stack[depth].pDepth;
             }
             nodeIdx = stack[depth + 1].nodeIdx;
-
-            // dumpState({ step: 'backStep', num });
         };
 
-        // function dumpNode(node: number[]): string {
-        //     const n = node
-        //         .map((n, i) => {
-        //             if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`;
-        //             return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`;
-        //         })
-        //         .join(', ');
-        //     return `[${n}]`;
-        // }
-
-        // function dumpNodes(nodes: FastTrieBlobNode[]) {
-        //     return nodes.map((n, i) => `${i}: ${dumpNode(n)}`);
-        // }
-
-        // const debug = false;
-
-        // function dumpState(extra?: Record<string, unknown>) {
-        //     debug &&
-        //         console.warn('%o', {
-        //             stack: stack.slice(0, depth + 1),
-        //             nodes: dumpNodes(nodes),
-        //             nodeIdx,
-        //             depth,
-        //             refNodes,
-        //             ...extra,
-        //         });
-        // }
-
         const c: BuilderCursor = {
             insertChar,
             markEOW,
 
@@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
         readonly nodes: number[][],
         readonly charIndex: CharIndex,
         maskInfo: FastTrieBlobBitMaskInfo,
-        sorted = false,
     ) {
         const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo;
         this.NodeMaskEOW = NodeMaskEOW;
         this.NodeMaskChildCharIndex = NodeMaskChildCharIndex;
         this.NodeChildRefShift = NodeChildRefShift;
         this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars();
-        !sorted && sortNodes(nodes, this.NodeMaskChildCharIndex);
+        sortNodes(nodes, this.NodeMaskChildCharIndex);
     }
 }
 
@@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
  * @returns
  */
 export function sortNodes(nodes: number[][], mask: number): number[][] {
+    if (Object.isFrozen(nodes)) {
+        assertSorted(nodes, mask);
+        return nodes;
+    }
     for (let i = 0; i < nodes.length; ++i) {
         let node = nodes[i];
         if (node.length > 2) {
 
@@ -82,7 +82,7 @@ export class TrieBlob implements TrieData {
         this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix);
     }
 
-    public wordToNodeCharIndexSequence(word: string): Utf8Seq {
+    public wordToUtf8Seq(word: string): Utf8Seq {
+        // Thin wrapper: the conversion itself is done by this blob's char index.
         return this.charIndex.wordToUtf8Seq(word);
     }
 
@@ -159,7 +159,7 @@ export class TrieBlob implements TrieData {
         const NodeChildRefShift = TrieBlob.NodeChildRefShift;
         const nodes = this.nodes;
         const nodes8 = this.#nodes8;
-        const wordIndexes = this.wordToNodeCharIndexSequence(word);
+        const wordIndexes = this.wordToUtf8Seq(word);
         const lookup = this.#nodeIdxLookup;
         const len = wordIndexes.length;
         let p = 0;