Skip to content

Commit 7ad1a6a

Browse files
committed
Use float instead of double for the taxi data's floating-point fields; fix failing tests; avoid converting UTF-8 bytes to a String and back to UTF-8 again for atom fields
1 parent b838e09 commit 7ad1a6a

File tree

3 files changed

+42
-43
lines changed

3 files changed

+42
-43
lines changed

scripts/indexTaxis.py

+21-19
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,7 @@ def main():
210210
refreshSec = 1.0
211211
else:
212212
# Turn off refreshes to maximize indexing throughput:
213-
#refreshSec = 100000.0
214-
refreshSec = 1.0
213+
refreshSec = 100000.0
215214
send(LOCALHOST, primaryPorts[0], "liveSettings", {'indexName': 'index', 'index.ramBufferSizeMB': 1024., 'maxRefreshSec': refreshSec})
216215

217216
fields = {'indexName': 'index',
@@ -223,23 +222,23 @@ def main():
223222
'pick_up_date_time': {'type': 'long', 'search': True, 'sort': True},
224223
'drop_off_date_time': {'type': 'long', 'search': True, 'sort': True},
225224
'passenger_count': {'type': 'int', 'search': True, 'sort': True},
226-
'trip_distance': {'type': 'double', 'search': True, 'sort': True},
227-
'pick_up_lat': {'type': 'double', 'search': True, 'sort': True},
228-
'pick_up_lon': {'type': 'double', 'search': True, 'sort': True},
229-
'drop_off_lat': {'type': 'double', 'search': True, 'sort': True},
230-
'drop_off_lon': {'type': 'double', 'search': True, 'sort': True},
225+
'trip_distance': {'type': 'float', 'search': True, 'sort': True},
226+
'pick_up_lat': {'type': 'float', 'search': True, 'sort': True},
227+
'pick_up_lon': {'type': 'float', 'search': True, 'sort': True},
228+
'drop_off_lat': {'type': 'float', 'search': True, 'sort': True},
229+
'drop_off_lon': {'type': 'float', 'search': True, 'sort': True},
231230
'payment_type': {'type': 'atom', 'sort': True},
232231
'trip_type': {'type': 'atom', 'sort': True},
233232
'rate_code': {'type': 'atom', 'sort': True},
234-
'fare_amount': {'type': 'double', 'search': True, 'sort': True},
235-
'surcharge': {'type': 'double', 'search': True, 'sort': True},
236-
'mta_tax': {'type': 'double', 'search': True, 'sort': True},
237-
'extra': {'type': 'double', 'search': True, 'sort': True},
238-
'ehail_fee': {'type': 'double', 'search': True, 'sort': True},
239-
'improvement_surcharge': {'type': 'double', 'search': True, 'sort': True},
240-
'tip_amount': {'type': 'double', 'search': True, 'sort': True},
241-
'tolls_amount': {'type': 'double', 'search': True, 'sort': True},
242-
'total_amount': {'type': 'double', 'search': True, 'sort': True},
233+
'fare_amount': {'type': 'float', 'search': True, 'sort': True},
234+
'surcharge': {'type': 'float', 'search': True, 'sort': True},
235+
'mta_tax': {'type': 'float', 'search': True, 'sort': True},
236+
'extra': {'type': 'float', 'search': True, 'sort': True},
237+
'ehail_fee': {'type': 'float', 'search': True, 'sort': True},
238+
'improvement_surcharge': {'type': 'float', 'search': True, 'sort': True},
239+
'tip_amount': {'type': 'float', 'search': True, 'sort': True},
240+
'tolls_amount': {'type': 'float', 'search': True, 'sort': True},
241+
'total_amount': {'type': 'float', 'search': True, 'sort': True},
243242
'store_and_fwd_flag': {'type': 'atom', 'sort': True}}}
244243

245244
send(LOCALHOST, primaryPorts[0], 'registerFields', fields)
@@ -248,10 +247,12 @@ def main():
248247

249248
send(LOCALHOST, primaryPorts[0], "settings", {'indexName': 'index',
250249
#'indexSort': [{'field': 'pick_up_lon'}],
251-
'index.verbose': True,
250+
'index.verbose': False,
252251
'directory': 'MMapDirectory',
253252
'nrtCachingDirectory.maxSizeMB': 0.0,
254-
#'index.merge.scheduler.auto_throttle': False,
253+
'concurrentMergeScheduler.maxThreadCount': 4,
254+
'concurrentMergeScheduler.maxMergeCount': 9,
255+
'index.merge.scheduler.auto_throttle': False,
255256
})
256257

257258
for id, host, installPath, port, binaryPort in replicaPorts:
@@ -277,7 +278,8 @@ def main():
277278
replicaStarted = False
278279

279280
#docSource = '/lucenedata/nyc-taxi-data/alltaxis.csv.blocks'
280-
docSource = '/b/alltaxis.csv.blocks'
281+
#docSource = '/b/alltaxis.csv.blocks'
282+
docSource = '/l/data/alltaxis.csv.blocks'
281283
if not os.path.exists(docSource):
282284
# Not Mike's home computer!
283285
docSource = 'data/alltaxis.1M.csv.blocks'

src/java/org/apache/lucene/server/handlers/CSVParser.java

+16-19
Original file line numberDiff line numberDiff line change
@@ -94,16 +94,23 @@ private void initReuseFields() {
9494
switch(fd.valueType) {
9595
case "atom":
9696
{
97-
reuseFields[i] = new StringField(fd.name, "", stored ? Field.Store.YES : Field.Store.NO);
97+
BytesRef br;
9898
if (fd.usePoints) {
9999
reusePoints[i] = new BinaryPoint(fd.name, new byte[0]);
100+
// little bit sneaky sharing of a single BytesRef across all Lucene
101+
// fields we add for this user's field:
102+
br = reusePoints[i].binaryValue();
103+
assert br != null;
104+
} else {
105+
br = new BytesRef();
100106
}
107+
reuseFields[i] = new StringField(fd.name, br, stored ? Field.Store.YES : Field.Store.NO);
101108
if (dvType == DocValuesType.SORTED) {
102-
reuseDVs[i] = new SortedDocValuesField(fd.name, new BytesRef());
109+
reuseDVs[i] = new SortedDocValuesField(fd.name, br);
103110
} else if (dvType == DocValuesType.SORTED_SET) {
104-
reuseDVs[i] = new SortedSetDocValuesField(fd.name, new BytesRef());
111+
reuseDVs[i] = new SortedSetDocValuesField(fd.name, br);
105112
} else if (dvType == DocValuesType.BINARY) {
106-
reuseDVs[i] = new BinaryDocValuesField(fd.name, new BytesRef());
113+
reuseDVs[i] = new BinaryDocValuesField(fd.name, br);
107114
}
108115
break;
109116
}
@@ -193,29 +200,19 @@ private void addOneField(int i, int lastFieldStart) {
193200
switch(fields[i].valueType) {
194201
case "atom":
195202
{
196-
String s = new String(bytes, lastFieldStart, len, StandardCharsets.UTF_8);
197203
Field field = reuseFields[i];
198-
field.setStringValue(s);
204+
BytesRef br = field.binaryValue();
205+
assert br != null;
206+
br.bytes = bytes;
207+
br.offset = lastFieldStart;
208+
br.length = len;
199209
reuseDoc.add(field);
200210
Field point = reusePoints[i];
201-
BytesRef br;
202211
if (point != null) {
203-
br = point.binaryValue();
204-
br.bytes = bytes;
205-
br.offset = lastFieldStart;
206-
br.length = len;
207212
reuseDoc.add(point);
208-
} else {
209-
br = null;
210213
}
211214
Field dv = reuseDVs[i];
212215
if (dv != null) {
213-
if (br == null) {
214-
br = dv.binaryValue();
215-
br.bytes = bytes;
216-
br.offset = lastFieldStart;
217-
br.length = len;
218-
}
219216
dv.setBytesValue(br);
220217
reuseDoc.add(dv);
221218
}

src/test/org/apache/lucene/server/TestIndexing.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ public void testIllegalIndexCSVBadLong() throws Exception {
419419
server.sendBinary("bulkCSVAddDocument",
420420
"csv\ncount,id2,body\n0,1,some text\n118371391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
421421
});
422-
assertTrue(t.getMessage().contains("doc at offset 66: could not parse field \"count\" as long: overflow: \"118371391723487213472\""));
422+
assertContains(t.getMessage(), "doc at offset 66: could not parse field \"count\" as long: overflow: \"118371391723487213472\"");
423423
send("stopIndex");
424424
send("deleteIndex");
425425
}
@@ -430,9 +430,9 @@ public void testIllegalIndexCSVBadFloat() throws Exception {
430430
send("startIndex");
431431
Throwable t = expectThrows(IOException.class, () -> {
432432
server.sendBinary("bulkCSVAddDocument",
433-
"csv\ncount,id2,body\n0,1,some text\n118371391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
433+
"csv\ncount,id2,body\n0,1,some text\n1183x71391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
434434
});
435-
assertContains(t.getMessage(), "doc at offset 66: could not parse field \"count\" as float: overflow: \"118371391723487213472\"");
435+
assertContains(t.getMessage(), "doc at offset 67: could not parse field \"count\" as float: extra characters: \"1183x71391723487213472\"");
436436
send("stopIndex");
437437
send("deleteIndex");
438438
}
@@ -443,9 +443,9 @@ public void testIllegalIndexCSVBadDouble() throws Exception {
443443
send("startIndex");
444444
Throwable t = expectThrows(IOException.class, () -> {
445445
server.sendBinary("bulkCSVAddDocument",
446-
"csv\ncount,id2,body\n0,1,some text\n118371391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
446+
"csv\ncount,id2,body\n0,1,some text\n1183x71391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
447447
});
448-
assertContains(t.getMessage(), "doc at offset 66: could not parse field \"count\" as double: overflow: \"118371391723487213472\"");
448+
assertContains(t.getMessage(), "doc at offset 67: could not parse field \"count\" as double: extra characters: \"1183x71391723487213472\"");
449449
send("stopIndex");
450450
send("deleteIndex");
451451
}

0 commit comments

Comments
 (0)