Use float instead of double for the taxis floating-point fields; fix silly failing tests; avoid converting UTF-8 bytes to String and back to UTF-8 again for atom fields

mikemccand · mikemccand · commit 7ad1a6a867a2 · 2016-08-05T15:06:25.000-04:00
diff --git a/scripts/indexTaxis.py b/scripts/indexTaxis.py
@@ -210,8 +210,7 @@ def main():
       refreshSec = 1.0
     else:
       # Turn off refreshes to maximize indexing throughput:
-      #refreshSec = 100000.0
-      refreshSec = 1.0
+      refreshSec = 100000.0
     send(LOCALHOST, primaryPorts[0], "liveSettings", {'indexName': 'index', 'index.ramBufferSizeMB': 1024., 'maxRefreshSec': refreshSec})
 
     fields = {'indexName': 'index',
@@ -223,23 +222,23 @@ def main():
                 'pick_up_date_time': {'type': 'long', 'search': True, 'sort': True},
                 'drop_off_date_time': {'type': 'long', 'search': True, 'sort': True},
                 'passenger_count': {'type': 'int', 'search': True, 'sort': True},
-                'trip_distance': {'type': 'double', 'search': True, 'sort': True},
-                'pick_up_lat': {'type': 'double', 'search': True, 'sort': True},
-                'pick_up_lon': {'type': 'double', 'search': True, 'sort': True},
-                'drop_off_lat': {'type': 'double', 'search': True, 'sort': True},
-                'drop_off_lon': {'type': 'double', 'search': True, 'sort': True},
+                'trip_distance': {'type': 'float', 'search': True, 'sort': True},
+                'pick_up_lat': {'type': 'float', 'search': True, 'sort': True},
+                'pick_up_lon': {'type': 'float', 'search': True, 'sort': True},
+                'drop_off_lat': {'type': 'float', 'search': True, 'sort': True},
+                'drop_off_lon': {'type': 'float', 'search': True, 'sort': True},
                 'payment_type': {'type': 'atom', 'sort': True},
                 'trip_type': {'type': 'atom', 'sort': True},
                 'rate_code': {'type': 'atom', 'sort': True},
-                'fare_amount': {'type': 'double', 'search': True, 'sort': True},
-                'surcharge': {'type': 'double', 'search': True, 'sort': True},
-                'mta_tax': {'type': 'double', 'search': True, 'sort': True},
-                'extra': {'type': 'double', 'search': True, 'sort': True},
-                'ehail_fee': {'type': 'double', 'search': True, 'sort': True},
-                'improvement_surcharge': {'type': 'double', 'search': True, 'sort': True},
-                'tip_amount': {'type': 'double', 'search': True, 'sort': True},
-                'tolls_amount': {'type': 'double', 'search': True, 'sort': True},
-                'total_amount': {'type': 'double', 'search': True, 'sort': True},
+                'fare_amount': {'type': 'float', 'search': True, 'sort': True},
+                'surcharge': {'type': 'float', 'search': True, 'sort': True},
+                'mta_tax': {'type': 'float', 'search': True, 'sort': True},
+                'extra': {'type': 'float', 'search': True, 'sort': True},
+                'ehail_fee': {'type': 'float', 'search': True, 'sort': True},
+                'improvement_surcharge': {'type': 'float', 'search': True, 'sort': True},
+                'tip_amount': {'type': 'float', 'search': True, 'sort': True},
+                'tolls_amount': {'type': 'float', 'search': True, 'sort': True},
+                'total_amount': {'type': 'float', 'search': True, 'sort': True},
                 'store_and_fwd_flag': {'type': 'atom', 'sort': True}}}
 
     send(LOCALHOST, primaryPorts[0], 'registerFields', fields)
@@ -248,10 +247,12 @@ def main():
 
     send(LOCALHOST, primaryPorts[0], "settings", {'indexName': 'index',
                                     #'indexSort': [{'field': 'pick_up_lon'}],
-                                    'index.verbose': True,
+                                    'index.verbose': False,
                                     'directory': 'MMapDirectory',
                                     'nrtCachingDirectory.maxSizeMB': 0.0,
-                                    #'index.merge.scheduler.auto_throttle': False,
+                                    'concurrentMergeScheduler.maxThreadCount': 4,
+                                    'concurrentMergeScheduler.maxMergeCount': 9,
+                                    'index.merge.scheduler.auto_throttle': False,
                                     })
 
     for id, host, installPath, port, binaryPort in replicaPorts:
@@ -277,7 +278,8 @@ def main():
     replicaStarted = False
 
     #docSource = '/lucenedata/nyc-taxi-data/alltaxis.csv.blocks'
-    docSource = '/b/alltaxis.csv.blocks'
+    #docSource = '/b/alltaxis.csv.blocks'
+    docSource = '/l/data/alltaxis.csv.blocks'
     if not os.path.exists(docSource):
       # Not Mike's home computer!
       docSource = 'data/alltaxis.1M.csv.blocks'
diff --git a/src/java/org/apache/lucene/server/handlers/CSVParser.java b/src/java/org/apache/lucene/server/handlers/CSVParser.java
@@ -94,16 +94,23 @@ private void initReuseFields() {
       switch(fd.valueType) {
       case "atom":
         {
-          reuseFields[i] = new StringField(fd.name, "", stored ? Field.Store.YES : Field.Store.NO);
+          BytesRef br;
           if (fd.usePoints) {
             reusePoints[i] = new BinaryPoint(fd.name, new byte[0]);
+            // little bit sneaky sharing of a single BytesRef across all Lucene
+            // fields we add for this user's field:
+            br = reusePoints[i].binaryValue();
+            assert br != null;
+          } else {
+            br = new BytesRef();
           }
+          reuseFields[i] = new StringField(fd.name, br, stored ? Field.Store.YES : Field.Store.NO);
           if (dvType == DocValuesType.SORTED) {
-            reuseDVs[i] = new SortedDocValuesField(fd.name, new BytesRef());
+            reuseDVs[i] = new SortedDocValuesField(fd.name, br);
           } else if (dvType == DocValuesType.SORTED_SET) {
-            reuseDVs[i] = new SortedSetDocValuesField(fd.name, new BytesRef());
+            reuseDVs[i] = new SortedSetDocValuesField(fd.name, br);
           } else if (dvType == DocValuesType.BINARY) {
-            reuseDVs[i] = new BinaryDocValuesField(fd.name, new BytesRef());
+            reuseDVs[i] = new BinaryDocValuesField(fd.name, br);
           }
           break;
         }
@@ -193,29 +200,19 @@ private void addOneField(int i, int lastFieldStart) {
     switch(fields[i].valueType) {
     case "atom":
       {
-        String s = new String(bytes, lastFieldStart, len, StandardCharsets.UTF_8);
         Field field = reuseFields[i];
-        field.setStringValue(s);
+        BytesRef br = field.binaryValue();
+        assert br != null;
+        br.bytes = bytes;
+        br.offset = lastFieldStart;
+        br.length = len;
         reuseDoc.add(field);
         Field point = reusePoints[i];
-        BytesRef br;
         if (point != null) {
-          br = point.binaryValue();
-          br.bytes = bytes;
-          br.offset = lastFieldStart;
-          br.length = len;
           reuseDoc.add(point);
-        } else {
-          br = null;
         }
         Field dv = reuseDVs[i];
         if (dv != null) {
-          if (br == null) {
-            br = dv.binaryValue();
-            br.bytes = bytes;
-            br.offset = lastFieldStart;
-            br.length = len;
-          }
           dv.setBytesValue(br);
           reuseDoc.add(dv);
         }
diff --git a/src/test/org/apache/lucene/server/TestIndexing.java b/src/test/org/apache/lucene/server/TestIndexing.java
@@ -419,7 +419,7 @@ public void testIllegalIndexCSVBadLong() throws Exception {
         server.sendBinary("bulkCSVAddDocument",
                           "csv\ncount,id2,body\n0,1,some text\n118371391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
       });
-    assertTrue(t.getMessage().contains("doc at offset 66: could not parse field \"count\" as long: overflow: \"118371391723487213472\""));
+    assertContains(t.getMessage(), "doc at offset 66: could not parse field \"count\" as long: overflow: \"118371391723487213472\"");
     send("stopIndex");
     send("deleteIndex");
   }
@@ -430,9 +430,9 @@ public void testIllegalIndexCSVBadFloat() throws Exception {
     send("startIndex");
     Throwable t = expectThrows(IOException.class, () -> {
         server.sendBinary("bulkCSVAddDocument",
-                          "csv\ncount,id2,body\n0,1,some text\n118371391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
+                          "csv\ncount,id2,body\n0,1,some text\n1183x71391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
       });
-    assertContains(t.getMessage(), "doc at offset 66: could not parse field \"count\" as float: overflow: \"118371391723487213472\"");
+    assertContains(t.getMessage(), "doc at offset 67: could not parse field \"count\" as float: extra characters: \"1183x71391723487213472\"");
     send("stopIndex");
     send("deleteIndex");
   }
@@ -443,9 +443,9 @@ public void testIllegalIndexCSVBadDouble() throws Exception {
     send("startIndex");
     Throwable t = expectThrows(IOException.class, () -> {
         server.sendBinary("bulkCSVAddDocument",
-                          "csv\ncount,id2,body\n0,1,some text\n118371391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
+                          "csv\ncount,id2,body\n0,1,some text\n1183x71391723487213472,2,some more text".getBytes(StandardCharsets.UTF_8));
       });
-    assertContains(t.getMessage(), "doc at offset 66: could not parse field \"count\" as double: overflow: \"118371391723487213472\"");
+    assertContains(t.getMessage(), "doc at offset 67: could not parse field \"count\" as double: extra characters: \"1183x71391723487213472\"");
     send("stopIndex");
     send("deleteIndex");
   }