1
- import { encodeUtf8N_BE , type Utf8BE32 } from './Utf8.js' ;
1
+ import { encodeTextToUtf8 , encodeUtf8N_BE , type Utf8BE32 } from './Utf8.js' ;
2
2
3
3
export type Utf8Seq = Readonly < number [ ] > ;
4
4
5
5
export type CharIndexMap = Record < string , Utf8BE32 > ;
6
6
7
7
export type RO_CharIndexMap = Readonly < CharIndexMap > ;
8
8
9
- export type CharIndexSeqMap = Record < string , Utf8Seq | number > ;
9
+ export type CharIndexSeqMap = Record < string , Utf8Seq > ;
10
10
11
11
export type RO_CharIndexSeqMap = Readonly < CharIndexSeqMap > ;
12
12
@@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
15
15
Object . freeze ( emptySeq ) ;
16
16
17
17
export class CharIndex {
18
- readonly charToUtf8Map : RO_CharIndexMap ;
19
- readonly charToUtf8SeqMap : RO_CharIndexSeqMap ;
18
+ #charToUtf8SeqMap: CharIndexSeqMap ;
20
19
21
20
#lastWord = '' ;
22
21
#lastWordSeq: Utf8Seq = [ ] ;
22
+ #multiByteChars: boolean ;
23
23
24
24
constructor ( readonly charIndex : readonly string [ ] ) {
25
- this . charToUtf8Map = buildCharIndexMap ( charIndex ) ;
26
- this . charToUtf8SeqMap = buildCharIndexSequenceMap ( this . charToUtf8Map ) ;
27
- }
28
-
29
- getUtf8Value ( c : string ) : number {
30
- return this . charToUtf8Map [ c ] || 0 ;
25
+ this . #charToUtf8SeqMap = buildCharIndexSequenceMap ( charIndex ) ;
26
+ this . #multiByteChars = Object . values ( this . #charToUtf8SeqMap) . some ( ( c ) => c . length > 1 ) ;
31
27
}
32
28
33
29
getCharUtf8Seq ( c : string ) : Utf8Seq {
34
- const r = this . charToUtf8SeqMap [ c ] ?? emptySeq ;
35
- return typeof r === 'number' ? [ r ] : r ;
36
- }
37
-
38
- __wordToUtf8Seq ( word : string ) : Utf8Seq {
39
- // Note: Array.flatMap is very slow
40
- const seq : number [ ] = new Array ( word . length ) ;
41
- let i = 0 ;
42
- for ( const c of word ) {
43
- const cSep = this . charToUtf8SeqMap [ c ] ;
44
- if ( typeof cSep === 'number' ) {
45
- seq [ i ++ ] = cSep ;
46
- continue ;
47
- }
48
- if ( ! cSep ) {
49
- seq [ i ++ ] = 0 ;
50
- continue ;
51
- }
52
- for ( const cIdx of cSep ) {
53
- seq [ i ++ ] = cIdx ;
54
- }
55
- }
56
- if ( seq . length !== i ) seq . length = i ;
57
- return seq ;
30
+ const found = this . #charToUtf8SeqMap[ c ] ;
31
+ if ( found ) return found ;
32
+ const s = encodeTextToUtf8 ( c ) ;
33
+ this . #charToUtf8SeqMap[ c ] = s ;
34
+ return s ;
58
35
}
59
36
60
37
wordToUtf8Seq ( word : string ) : Utf8Seq {
61
38
if ( this . #lastWord === word ) return this . #lastWordSeq;
62
39
63
- const seq = this . __wordToUtf8Seq ( word ) ;
40
+ const seq = encodeTextToUtf8 ( word ) ;
64
41
65
42
this . #lastWord = word ;
66
43
this . #lastWordSeq = seq ;
@@ -69,7 +46,7 @@ export class CharIndex {
69
46
}
70
47
71
48
indexContainsMultiByteChars ( ) : boolean {
72
- return Object . values ( this . charToUtf8Map ) . some ( ( v ) => v >= 0x80 ) ;
49
+ return this . #multiByteChars ;
73
50
}
74
51
75
52
get size ( ) : number {
@@ -81,22 +58,10 @@ export class CharIndex {
81
58
}
82
59
}
83
60
84
- function buildCharIndexMap ( charIndex : readonly string [ ] ) : CharIndexMap {
85
- const map : CharIndexMap = Object . create ( null ) ;
86
- for ( const c of charIndex ) {
87
- const cn = c . normalize ( 'NFC' ) ;
88
- const utf8 = encodeUtf8N_BE ( cn . codePointAt ( 0 ) || 0 ) ;
89
- map [ c ] = utf8 ;
90
- map [ c . normalize ( 'NFC' ) ] = utf8 ;
91
- map [ c . normalize ( 'NFD' ) ] = utf8 ;
92
- }
93
- return map ;
94
- }
95
-
96
- function buildCharIndexSequenceMap ( charIndexMap : RO_CharIndexMap ) : CharIndexSeqMap {
61
/**
 * Build the lookup table that maps every entry of the character index to the
 * sequence produced by `encodeTextToUtf8` for that entry.
 *
 * The table is created with a `null` prototype, so only the keys written here
 * are present on it (no inherited `Object.prototype` members).
 *
 * @param charIndex - the entries to encode; each entry is used verbatim as a key.
 * @returns a new table keyed by the entries of `charIndex`.
 */
function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
    const seqByChar: CharIndexSeqMap = Object.create(null);
    for (const entry of charIndex) {
        const encoded = encodeTextToUtf8(entry);
        seqByChar[entry] = encoded;
    }
    return seqByChar;
}
@@ -106,7 +71,7 @@ export class CharIndexBuilder {
106
71
readonly charIndexMap : CharIndexMap = Object . create ( null ) ;
107
72
readonly charIndexSeqMap : CharIndexSeqMap = Object . create ( null ) ;
108
73
109
- readonly #mapIdxToSeq = new Map < number , number [ ] | number > ( ) ;
74
+ readonly #mapIdxToSeq = new Map < number , number [ ] > ( ) ;
110
75
111
76
constructor ( ) {
112
77
this . getUtf8Value ( '' ) ;
@@ -126,24 +91,22 @@ export class CharIndexBuilder {
126
91
return utf8 ;
127
92
}
128
93
129
- utf8ValueToUtf8Seq ( idx : number ) : number [ ] | number {
94
+ utf8ValueToUtf8Seq ( idx : number ) : number [ ] {
130
95
const found = this . #mapIdxToSeq. get ( idx ) ;
131
96
if ( found !== undefined ) {
132
97
return found ;
133
98
}
134
- const seq = splitUtf8IfNeeded ( idx ) ;
99
+ const seq = splitUtf8 ( idx ) ;
135
100
this . #mapIdxToSeq. set ( idx , seq ) ;
136
101
return seq ;
137
102
}
138
103
139
104
charToUtf8Seq ( c : string ) : number [ ] {
140
105
const idx = this . getUtf8Value ( c ) ;
141
- const s = this . utf8ValueToUtf8Seq ( idx ) ;
142
- return typeof s === 'number' ? [ s ] : s ;
106
+ return this . utf8ValueToUtf8Seq ( idx ) ;
143
107
}
144
108
145
109
wordToUtf8Seq ( word : string ) : number [ ] {
146
- // word = word.normalize('NFC');
147
110
const seq : number [ ] = new Array ( word . length ) ;
148
111
let i = 0 ;
149
112
for ( const c of word ) {
@@ -170,8 +133,9 @@ export class CharIndexBuilder {
170
133
}
171
134
}
172
135
173
- function splitUtf8IfNeeded ( utf8 : number ) : number | number [ ] {
174
- if ( utf8 < 0x80 ) return utf8 ;
175
- const s = [ ( utf8 >> 24 ) & 0xff , ( utf8 >> 16 ) & 0xff , ( utf8 >> 8 ) & 0xff , utf8 & 0xff ] . filter ( ( v ) => v ) ;
176
- return s . length ? s : s [ 0 ] ;
136
/**
 * Split a UTF-8 sequence that has been packed big-endian into a single 32-bit
 * number into its individual bytes, most significant byte first.
 *
 * The number of bytes is derived from the magnitude of the packed value:
 * up to `0xff` gives one byte, up to `0xffff` two, up to `0xff_ffff` three,
 * and anything larger four.
 *
 * @param utf8 - the packed value. A negative value is reinterpreted as its
 *   unsigned 32-bit equivalent before splitting.
 * @returns the bytes of the sequence, in order.
 */
function splitUtf8(utf8: number): number[] {
    // JavaScript bitwise operators produce *signed* 32-bit results, so a packed
    // 4-byte sequence built with `|` / `<<` has its top bit set and is negative.
    // Without this, such a value would satisfy `<= 0xff` and be returned as a
    // single negative "byte". Non-negative inputs are left untouched.
    const packed = utf8 < 0 ? utf8 >>> 0 : utf8;

    if (packed <= 0xff) return [packed];
    if (packed <= 0xffff) return [(packed >> 8) & 0xff, packed & 0xff];
    if (packed <= 0xff_ffff) return [(packed >> 16) & 0xff, (packed >> 8) & 0xff, packed & 0xff];

    // The zero-byte filter is kept for parity with the previous behavior; a
    // well-formed 4-byte UTF-8 sequence never contains a zero byte.
    return [(packed >>> 24) & 0xff, (packed >> 16) & 0xff, (packed >> 8) & 0xff, packed & 0xff].filter((v) => v);
}
0 commit comments