Skip to content

Commit 077b3ba

Browse files
authored
refactor: char index (#5926)
1 parent 9986720 commit 077b3ba

11 files changed

+431
-122
lines changed

packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts

+4-4
Original file line number | Diff line number | Diff line change
@@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => {
99
const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
1010
const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c));
1111
expect(indexes).toEqual(letters.map((c) => c.codePointAt(0)));
12-
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij');
13-
expect(r).toEqual([...textEncoder.encode('abcdefghij')]);
14-
expect(charIndexBuilder.size).toBe(11); // One extra for the empty string.
12+
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀');
13+
expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]);
14+
expect(charIndexBuilder.size).toBe(16); // One extra for the empty string.
1515

1616
// Add the same letters again.
1717
expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0)));
1818

1919
const charIndex = charIndexBuilder.build();
20-
expect(charIndex.size).toBe(11);
20+
expect(charIndex.size).toBe(16);
2121
expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]);
2222
});
2323
});
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,12 @@
1-
import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
1+
import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
22

33
export type Utf8Seq = Readonly<number[]>;
44

55
export type CharIndexMap = Record<string, Utf8BE32>;
66

77
export type RO_CharIndexMap = Readonly<CharIndexMap>;
88

9-
export type CharIndexSeqMap = Record<string, Utf8Seq | number>;
9+
export type CharIndexSeqMap = Record<string, Utf8Seq>;
1010

1111
export type RO_CharIndexSeqMap = Readonly<CharIndexSeqMap>;
1212

@@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
1515
Object.freeze(emptySeq);
1616

1717
export class CharIndex {
18-
readonly charToUtf8Map: RO_CharIndexMap;
19-
readonly charToUtf8SeqMap: RO_CharIndexSeqMap;
18+
#charToUtf8SeqMap: CharIndexSeqMap;
2019

2120
#lastWord = '';
2221
#lastWordSeq: Utf8Seq = [];
22+
#multiByteChars: boolean;
2323

2424
constructor(readonly charIndex: readonly string[]) {
25-
this.charToUtf8Map = buildCharIndexMap(charIndex);
26-
this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map);
27-
}
28-
29-
getUtf8Value(c: string): number {
30-
return this.charToUtf8Map[c] || 0;
25+
this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex);
26+
this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1);
3127
}
3228

3329
getCharUtf8Seq(c: string): Utf8Seq {
34-
const r = this.charToUtf8SeqMap[c] ?? emptySeq;
35-
return typeof r === 'number' ? [r] : r;
36-
}
37-
38-
__wordToUtf8Seq(word: string): Utf8Seq {
39-
// Note: Array.flatMap is very slow
40-
const seq: number[] = new Array(word.length);
41-
let i = 0;
42-
for (const c of word) {
43-
const cSep = this.charToUtf8SeqMap[c];
44-
if (typeof cSep === 'number') {
45-
seq[i++] = cSep;
46-
continue;
47-
}
48-
if (!cSep) {
49-
seq[i++] = 0;
50-
continue;
51-
}
52-
for (const cIdx of cSep) {
53-
seq[i++] = cIdx;
54-
}
55-
}
56-
if (seq.length !== i) seq.length = i;
57-
return seq;
30+
const found = this.#charToUtf8SeqMap[c];
31+
if (found) return found;
32+
const s = encodeTextToUtf8(c);
33+
this.#charToUtf8SeqMap[c] = s;
34+
return s;
5835
}
5936

6037
wordToUtf8Seq(word: string): Utf8Seq {
6138
if (this.#lastWord === word) return this.#lastWordSeq;
6239

63-
const seq = this.__wordToUtf8Seq(word);
40+
const seq = encodeTextToUtf8(word);
6441

6542
this.#lastWord = word;
6643
this.#lastWordSeq = seq;
@@ -69,7 +46,7 @@ export class CharIndex {
6946
}
7047

7148
indexContainsMultiByteChars(): boolean {
72-
return Object.values(this.charToUtf8Map).some((v) => v >= 0x80);
49+
return this.#multiByteChars;
7350
}
7451

7552
get size(): number {
@@ -81,22 +58,10 @@ export class CharIndex {
8158
}
8259
}
8360

84-
function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap {
85-
const map: CharIndexMap = Object.create(null);
86-
for (const c of charIndex) {
87-
const cn = c.normalize('NFC');
88-
const utf8 = encodeUtf8N_BE(cn.codePointAt(0) || 0);
89-
map[c] = utf8;
90-
map[c.normalize('NFC')] = utf8;
91-
map[c.normalize('NFD')] = utf8;
92-
}
93-
return map;
94-
}
95-
96-
function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap {
61+
function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
9762
const map: CharIndexSeqMap = Object.create(null);
98-
for (const [key, value] of Object.entries(charIndexMap)) {
99-
map[key] = splitUtf8IfNeeded(value);
63+
for (const key of charIndex) {
64+
map[key] = encodeTextToUtf8(key);
10065
}
10166
return map;
10267
}
@@ -106,7 +71,7 @@ export class CharIndexBuilder {
10671
readonly charIndexMap: CharIndexMap = Object.create(null);
10772
readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null);
10873

109-
readonly #mapIdxToSeq = new Map<number, number[] | number>();
74+
readonly #mapIdxToSeq = new Map<number, number[]>();
11075

11176
constructor() {
11277
this.getUtf8Value('');
@@ -126,24 +91,22 @@ export class CharIndexBuilder {
12691
return utf8;
12792
}
12893

129-
utf8ValueToUtf8Seq(idx: number): number[] | number {
94+
utf8ValueToUtf8Seq(idx: number): number[] {
13095
const found = this.#mapIdxToSeq.get(idx);
13196
if (found !== undefined) {
13297
return found;
13398
}
134-
const seq = splitUtf8IfNeeded(idx);
99+
const seq = splitUtf8(idx);
135100
this.#mapIdxToSeq.set(idx, seq);
136101
return seq;
137102
}
138103

139104
charToUtf8Seq(c: string): number[] {
140105
const idx = this.getUtf8Value(c);
141-
const s = this.utf8ValueToUtf8Seq(idx);
142-
return typeof s === 'number' ? [s] : s;
106+
return this.utf8ValueToUtf8Seq(idx);
143107
}
144108

145109
wordToUtf8Seq(word: string): number[] {
146-
// word = word.normalize('NFC');
147110
const seq: number[] = new Array(word.length);
148111
let i = 0;
149112
for (const c of word) {
@@ -170,8 +133,9 @@ export class CharIndexBuilder {
170133
}
171134
}
172135

173-
function splitUtf8IfNeeded(utf8: number): number | number[] {
174-
if (utf8 < 0x80) return utf8;
175-
const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
176-
return s.length ? s : s[0];
136+
function splitUtf8(utf8: number): number[] {
137+
if (utf8 <= 0xff) return [utf8];
138+
if (utf8 <= 0xffff) return [(utf8 >> 8) & 0xff, utf8 & 0xff];
139+
if (utf8 <= 0xff_ffff) return [(utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff];
140+
return [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
177141
}

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData {
214214

215215
static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
216216
return new FastTrieBlobIRoot(
217-
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted),
217+
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo),
218218
0,
219219
trie.info,
220220
);

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts

-33
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
111111
for (let i = 0; i < utf8Seq.length; ++i) {
112112
insertCharIndexes(utf8Seq[i], pDepth);
113113
}
114-
// dumpState({ step: 'insertChar', char });
115114
};
116115

117116
/**
@@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
174173
const pos = s.pos;
175174
const node = nodes[nodeIdx];
176175
node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);
177-
178-
// dumpState({ step: 'reference', refId, refNodeIdx });
179176
};
180177

181178
const backStep = (num: number) => {
@@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
186183
depth = stack[depth].pDepth;
187184
}
188185
nodeIdx = stack[depth + 1].nodeIdx;
189-
190-
// dumpState({ step: 'backStep', num });
191186
};
192187

193-
// function dumpNode(node: number[]): string {
194-
// const n = node
195-
// .map((n, i) => {
196-
// if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`;
197-
// return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`;
198-
// })
199-
// .join(', ');
200-
// return `[${n}]`;
201-
// }
202-
203-
// function dumpNodes(nodes: FastTrieBlobNode[]) {
204-
// return nodes.map((n, i) => `${i}: ${dumpNode(n)}`);
205-
// }
206-
207-
// const debug = false;
208-
209-
// function dumpState(extra?: Record<string, unknown>) {
210-
// debug &&
211-
// console.warn('%o', {
212-
// stack: stack.slice(0, depth + 1),
213-
// nodes: dumpNodes(nodes),
214-
// nodeIdx,
215-
// depth,
216-
// refNodes,
217-
// ...extra,
218-
// });
219-
// }
220-
221188
const c: BuilderCursor = {
222189
insertChar,
223190
markEOW,

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts

+5-2
Original file line number | Diff line number | Diff line change
@@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
1212
readonly nodes: number[][],
1313
readonly charIndex: CharIndex,
1414
maskInfo: FastTrieBlobBitMaskInfo,
15-
sorted = false,
1615
) {
1716
const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo;
1817
this.NodeMaskEOW = NodeMaskEOW;
1918
this.NodeMaskChildCharIndex = NodeMaskChildCharIndex;
2019
this.NodeChildRefShift = NodeChildRefShift;
2120
this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars();
22-
!sorted && sortNodes(nodes, this.NodeMaskChildCharIndex);
21+
sortNodes(nodes, this.NodeMaskChildCharIndex);
2322
}
2423
}
2524

@@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
3029
* @returns
3130
*/
3231
export function sortNodes(nodes: number[][], mask: number): number[][] {
32+
if (Object.isFrozen(nodes)) {
33+
assertSorted(nodes, mask);
34+
return nodes;
35+
}
3336
for (let i = 0; i < nodes.length; ++i) {
3437
let node = nodes[i];
3538
if (node.length > 2) {

packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts

+2-2
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ export class TrieBlob implements TrieData {
8282
this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix);
8383
}
8484

85-
public wordToNodeCharIndexSequence(word: string): Utf8Seq {
85+
public wordToUtf8Seq(word: string): Utf8Seq {
8686
return this.charIndex.wordToUtf8Seq(word);
8787
}
8888

@@ -159,7 +159,7 @@ export class TrieBlob implements TrieData {
159159
const NodeChildRefShift = TrieBlob.NodeChildRefShift;
160160
const nodes = this.nodes;
161161
const nodes8 = this.#nodes8;
162-
const wordIndexes = this.wordToNodeCharIndexSequence(word);
162+
const wordIndexes = this.wordToUtf8Seq(word);
163163
const lookup = this.#nodeIdxLookup;
164164
const len = wordIndexes.length;
165165
let p = 0;

0 commit comments

Comments
 (0)