Merge branch 'fix_overflow_largepredict'

siboehm · siboehm · commit fa140135c445 · 2021-11-21T17:45:49.000+01:00
diff --git a/.github/ci.sh b/.github/ci.sh
@@ -8,7 +8,7 @@ python -m pip install --no-use-pep517 --no-deps --disable-pip-version-check -e .
 pytest -v tests
 
 # Check documentation build only in one job, also do releases
-if [ "${PYTHON_VERSION}" = "3.6" ]; then
+if [ "${PYTHON_VERSION}" = "3.7" ]; then
   pushd docs
   make html
   popd
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -34,14 +34,11 @@ jobs:
         with:
           path: ./.hypothesis
           key: hypothesisDB ${{ matrix.PYTHON_VERSION }}
-      - if: matrix.PYTHON_VERSION == '3.6'
-        shell: bash -x -l {0}
-        run: pip install dataclasses
       - name: Run the unittests
         shell: bash -x -l {0}
         run: ./.github/ci.sh ${{ matrix.PYTHON_VERSION }}
       - name: Publish a Python distribution to PyPI
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && matrix.PYTHON_VERSION == '3.6'
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && matrix.PYTHON_VERSION == '3.7'
         uses: pypa/gh-action-pypi-publish@v1.4.2
         with:
           user: __token__
diff --git a/environment.yml b/environment.yml
@@ -4,7 +4,7 @@ channels:
   - nodefaults
 dependencies:
   # runtime deps
-  - python>=3.6
+  - python>=3.7
   - llvmlite>=0.36
   - numpy
   # testing
diff --git a/lleaves/compiler/codegen/codegen.py b/lleaves/compiler/codegen/codegen.py
@@ -9,6 +9,7 @@
 FLOAT = ir.FloatType()
 INT_CAT = ir.IntType(bits=32)
 INT = ir.IntType(bits=32)
+LONG = ir.IntType(bits=64)
 ZERO_V = ir.Constant(BOOL, 0)
 FLOAT_POINTER = ir.PointerType(FLOAT)
 DOUBLE_PTR = ir.PointerType(DOUBLE)
@@ -18,6 +19,10 @@ def iconst(value):
     return ir.Constant(INT, value)
 
 
+def lconst(value):
+    return ir.Constant(LONG, value)
+
+
 def fconst(value):
     return ir.Constant(FLOAT, value)
 
@@ -168,7 +173,9 @@ def _populate_instruction_block(
 
     # -- SETUP BLOCK
     builder = ir.IRBuilder(setup_block)
-    loop_iter = builder.alloca(INT, 1, "loop-idx")
+    start_index = builder.zext(start_index, LONG)
+    end_index = builder.zext(end_index, LONG)
+    loop_iter = builder.alloca(LONG, 1, "loop-idx")
     builder.store(start_index, loop_iter)
     condition_block = root_func.append_basic_block("loop-condition")
     builder.branch(condition_block)
@@ -187,9 +194,9 @@ def _populate_instruction_block(
     args = []
     loop_iter_reg = builder.load(loop_iter)
 
-    n_args = ir.Constant(INT, forest.n_args)
+    n_args = ir.Constant(LONG, forest.n_args)
     iter_mul_nargs = builder.mul(loop_iter_reg, n_args)
-    idx = (builder.add(iter_mul_nargs, iconst(i)) for i in range(forest.n_args))
+    idx = (builder.add(iter_mul_nargs, lconst(i)) for i in range(forest.n_args))
     raw_ptrs = [builder.gep(root_func.args[0], (c,)) for c in idx]
     # cast the categorical inputs to integer
     for feature, ptr in zip(forest.features, raw_ptrs):
@@ -203,9 +210,9 @@ def _populate_instruction_block(
     for func in tree_funcs:
         tree_res = builder.call(func.llvm_function, args)
         results[func.class_id] = builder.fadd(tree_res, results[func.class_id])
-    res_idx = builder.mul(iconst(forest.n_classes), loop_iter_reg)
+    res_idx = builder.mul(lconst(forest.n_classes), loop_iter_reg)
     results_ptr = [
-        builder.gep(out_arr, (builder.add(res_idx, iconst(class_idx)),))
+        builder.gep(out_arr, (builder.add(res_idx, lconst(class_idx)),))
         for class_idx in range(forest.n_classes)
     ]
 
@@ -224,8 +231,7 @@ def _populate_instruction_block(
     for result, result_ptr in zip(results, results_ptr):
         builder.store(result, result_ptr)
 
-    tmpp1 = builder.add(loop_iter_reg, iconst(1))
-    builder.store(tmpp1, loop_iter)
+    builder.store(builder.add(loop_iter_reg, lconst(1)), loop_iter)
     builder.branch(condition_block)
     # -- END CORE LOOP BLOCK
 
diff --git a/lleaves/data_processing.py b/lleaves/data_processing.py
@@ -15,7 +15,7 @@ class pd_DataFrame:
         pass
 
 
-def _dataframe_to_ndarray(data, pd_traintime_categories: List[List]):
+def _dataframe_to_ndarray(data: pd_DataFrame, pd_traintime_categories: List[List]):
     """
     Converts the given dataframe into a 2D numpy array and converts categorical columns to float.
 
@@ -94,7 +94,7 @@ def data_to_ndarray(data, pd_traintime_categories: Optional[List[List]] = None):
     return data
 
 
-def ndarray_to_ptr(data):
+def ndarray_to_ptr(data: np.ndarray):
     """
     Takes a 2D numpy array, converts to float64 if necessary and returns a pointer
 
diff --git a/lleaves/lleaves.py b/lleaves/lleaves.py
@@ -1,7 +1,7 @@
 import concurrent.futures
 import math
 import os
-from ctypes import CFUNCTYPE, POINTER, c_double, c_int
+from ctypes import CFUNCTYPE, POINTER, c_double, c_int32
 from pathlib import Path
 
 import llvmlite.binding
@@ -20,8 +20,8 @@
     None,  # return void
     POINTER(c_double),  # pointer to data array
     POINTER(c_double),  # pointer to results array
-    c_int,  # start index
-    c_int,  # end index
+    c_int32,  # start index
+    c_int32,  # end index
 )
 
 
@@ -89,12 +89,10 @@ def compile(
         """
         Generate the LLVM IR for this model and compile it to ASM.
 
-        For most users tweaking the compilation flags (fcodemodel, fblocksize) will be unnecessary as the default
-        configuration is already very fast.
+        For most users tweaking the compilation flags (fcodemodel, fblocksize, finline) will be unnecessary
+        as the default configuration is already very fast.
         Modifying the flags is useful only if you're trying to squeeze out the last few percent of performance.
 
-        The compile() method is generally not thread-safe.
-
         :param cache: Path to a cache file. If this path doesn't exist, binary will be dumped at path after compilation.
             If path exists, binary will be loaded and compilation skipped.
             No effort is made to check staleness / consistency.
@@ -160,6 +158,12 @@ def predict(self, data, n_jobs=os.cpu_count()):
             raise ValueError(
                 f"Data must be of dimension (N, {self.num_feature()}), is {data.shape}."
             )
+        # protect against `ctypes.c_int32` silently overflowing and causing SIGSEGV
+        if n_predictions >= 2 ** 31 - 1:
+            raise ValueError(
+                "Prediction is not supported for datasets with >=2^31-1 rows. "
+                "Split the dataset into smaller chunks first."
+            )
 
         # setup input data and predictions array
         ptr_data = ndarray_to_ptr(data)
diff --git a/setup.py b/setup.py
@@ -24,6 +24,6 @@
     description="LLVM-based compiler for LightGBM models",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    python_requires=">=3.6",
-    install_requires=["llvmlite>=0.36", "numpy", "dataclasses; python_version < '3.7'"],
+    python_requires=">=3.7",
+    install_requires=["llvmlite>=0.36", "numpy"],
 )
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,23 @@
+import pytest
+from lightgbm import Booster
+
+from lleaves import Model
+
+
+@pytest.fixture(scope="session")
+def NYC_llvm():
+    llvm_model = Model(model_file="tests/models/NYC_taxi/model.txt")
+    llvm_model.compile()
+    return llvm_model
+
+
+@pytest.fixture(scope="session")
+def NYC_lgbm():
+    return Booster(model_file="tests/models/NYC_taxi/model.txt")
+
+
+@pytest.fixture(scope="session")
+def mtpl2_llvm():
+    llvm_model = Model(model_file="tests/models/mtpl2/model.txt")
+    llvm_model.compile()
+    return llvm_model
diff --git a/tests/test_dataprocessing.py b/tests/test_dataprocessing.py
@@ -3,7 +3,9 @@
 import numpy as np
 import pandas as pd
 import pytest
+from lightgbm import Booster
 
+from lleaves import Model
 from lleaves.data_processing import (
     data_to_ndarray,
     extract_model_global_features,
@@ -87,3 +89,20 @@ def test_no_data_modification():
     pred = pd.DataFrame(data).astype("category")
     ndarray_to_ptr(data_to_ndarray(pred, data))
     pd.testing.assert_frame_equal(pred, orig)
+
+
+def test_sliced_arrays():
+    # predictions should be correct when passed a sliced array
+    llvm_model = Model(model_file="tests/models/single_tree/model.txt")
+    llvm_model.compile()
+    lgbm_model = Booster(model_file="tests/models/single_tree/model.txt")
+
+    n_feature = lgbm_model.num_feature()
+    data = np.array(list(range(-5 * n_feature, 5 * n_feature)), dtype=np.float64)
+    data = data.reshape((5, 2 * n_feature))
+    sliced = data[:, ::2]
+    assert not sliced.flags.c_contiguous
+    np.testing.assert_almost_equal(
+        llvm_model.predict(sliced, n_jobs=4), lgbm_model.predict(sliced), decimal=13
+    )
+    return
diff --git a/tests/test_parallel.py b/tests/test_parallel.py
@@ -1,30 +1,37 @@
 from ctypes import POINTER, c_double
 
 import numpy as np
-from lightgbm import Booster
 
-from lleaves import Model
 
+def test_parallel_edgecases(NYC_llvm, NYC_lgbm):
+    # single row, multiple threads
+    data = np.array(1 * [NYC_lgbm.num_feature() * [1.0]], dtype=np.float64)
+    np.testing.assert_almost_equal(
+        NYC_llvm.predict(data, n_jobs=4), NYC_lgbm.predict(data), decimal=14
+    )
+
+    # last thread has only one prediction (batchsize is ceil(19/7)=3)
+    data = np.array(19 * [NYC_lgbm.num_feature() * [1.0]], dtype=np.float64)
+    np.testing.assert_almost_equal(
+        NYC_llvm.predict(data, n_jobs=7), NYC_lgbm.predict(data), decimal=14
+    )
 
-def test_parallel_iteration():
-    llvm_model = Model(model_file="tests/models/NYC_taxi/model.txt")
-    lgbm_model = Booster(model_file="tests/models/NYC_taxi/model.txt")
-    llvm_model.compile()
 
-    data = np.array(4 * [5 * [1.0]], dtype=np.float64)
+def test_parallel_iteration(NYC_llvm, NYC_lgbm):
+    data = np.array(4 * [NYC_lgbm.num_feature() * [1.0]], dtype=np.float64)
     data_flat = np.array(data.reshape(data.size), dtype=np.float64)
     np.testing.assert_almost_equal(
-        llvm_model.predict(data, n_jobs=4), lgbm_model.predict(data), decimal=14
+        NYC_llvm.predict(data, n_jobs=4), NYC_lgbm.predict(data), decimal=14
     )
 
     ptr_data = data_flat.ctypes.data_as(POINTER(c_double))
     preds = np.zeros(4, dtype=np.float64)
     ptr_preds = preds.ctypes.data_as(POINTER(c_double))
 
-    llvm_model._c_entry_func(ptr_data, ptr_preds, 2, 4)
+    NYC_llvm._c_entry_func(ptr_data, ptr_preds, 2, 4)
     preds_l = list(preds)
     assert preds_l[0] == 0.0 and preds_l[1] == 0.0
     assert preds_l[2] != 0.0 and preds_l[3] != 0.0
-    llvm_model._c_entry_func(ptr_data, ptr_preds, 0, 2)
+    NYC_llvm._c_entry_func(ptr_data, ptr_preds, 0, 2)
     preds_l = list(preds)
     assert preds_l[0] != 0.0 and preds_l[1] != 0.0

Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,6 @@`
`24`	`24`	`description="LLVM-based compiler for LightGBM models",`
`25`	`25`	`long_description=long_description,`
`26`	`26`	`long_description_content_type="text/markdown",`
`27`		`- python_requires=">=3.6",`
`28`		`- install_requires=["llvmlite>=0.36", "numpy", "dataclasses; python_version < '3.7'"],`
	`27`	`+ python_requires=">=3.7",`
	`28`	`+ install_requires=["llvmlite>=0.36", "numpy"],`
`29`	`29`	`)`