 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.BytesRef;
-import org.opensearch.common.hash.MurmurHash3;
+import org.elasticsearch.common.hash.MurmurHash3;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -34,14 +34,14 @@
  * have a minimum length - 6 is a good heuristic as it avoids filtering common
  * idioms/phrases but detects longer sections that are typical of cut+paste
  * copies of text.
- *
+ *
  * <p>
  * Internally each token is hashed/moduloed into a single byte (so 256 possible
  * values for each token) and then recorded in a trie of seen byte sequences
  * using a {@link DuplicateByteSequenceSpotter}. This trie is passed into the
  * TokenFilter constructor so a single object can be reused across multiple
  * documents.
- *
+ *
  * <p>
  * The emitDuplicates setting controls if duplicate tokens are filtered from
  * results or are output (the {@link DuplicateSequenceAttribute} attribute can
@@ -57,7 +57,7 @@ public DeDuplicatingTokenFilter(TokenStream in, DuplicateByteSequenceSpotter byt
     }
 
     /**
-     *
+     *
      * @param in
      *            The input token stream
      * @param byteStreamDuplicateSpotter
@@ -110,9 +110,9 @@ public final boolean incrementToken() throws IOException {
         }
 
         public void loadAllTokens() throws IOException {
-            // TODO consider changing this implementation to emit tokens as-we-go
-            // rather than buffering all. However this array is perhaps not the
-            // bulk of memory usage (in practice the dupSequenceSpotter requires
+            // TODO consider changing this implementation to emit tokens as-we-go
+            // rather than buffering all. However this array is perhaps not the
+            // bulk of memory usage (in practice the dupSequenceSpotter requires
             // ~5x the original content size in its internal tree).
             allTokens = new ArrayList<State>(256);
 
@@ -198,4 +198,4 @@ private void recordLengthInfoState(short[] maxNumSightings, State[] tokenStates,
         }
 
     }
-}
+}
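
The class javadoc above describes the intended usage: a single DuplicateByteSequenceSpotter holds the trie of seen byte sequences and is reused across documents, while each document's token stream is wrapped in a DeDuplicatingTokenFilter. A minimal sketch of that pattern follows, built around the two-argument public constructor visible in the @@ -57,7 hunk header. Everything else in it is an illustrative assumption rather than something taken from this diff: the org.elasticsearch.analysis.common package for the two classes, the StandardAnalyzer, the "body" field name, the sample documents, the no-argument DuplicateByteSequenceSpotter constructor, and the default behaviour of dropping (rather than emitting) tokens that fall inside previously seen sequences.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
// The package of the filter and the spotter is not shown in the hunks above;
// org.elasticsearch.analysis.common is assumed here.
import org.elasticsearch.analysis.common.DeDuplicatingTokenFilter;
import org.elasticsearch.analysis.common.DuplicateByteSequenceSpotter;

public class DeDuplicatingFilterSketch {
    public static void main(String[] args) throws Exception {
        // One spotter shared across all documents, so duplicate sequences are
        // detected across document boundaries, as the class javadoc describes.
        // A no-argument constructor is assumed.
        DuplicateByteSequenceSpotter spotter = new DuplicateByteSequenceSpotter();
        String[] docs = {
            "some boilerplate paragraph pasted into document one",
            "some boilerplate paragraph pasted into document two"
        };
        try (Analyzer analyzer = new StandardAnalyzer()) {
            for (String doc : docs) {
                TokenStream source = analyzer.tokenStream("body", doc);
                // Two-argument constructor from the @@ -57,7 hunk header; assumed
                // to filter out tokens belonging to already-seen byte sequences.
                try (TokenStream deduped = new DeDuplicatingTokenFilter(source, spotter)) {
                    CharTermAttribute term = deduped.addAttribute(CharTermAttribute.class);
                    deduped.reset();
                    while (deduped.incrementToken()) {
                        System.out.println(term.toString());
                    }
                    deduped.end();
                }
            }
        }
    }
}

In real use the filter would more likely be installed inside a custom Analyzer's token stream chain; the sketch only illustrates reusing a single spotter object across multiple token streams, which is the reuse pattern the class javadoc calls out.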