Merge pull request #59 from siboehm/siboehm/rf

siboehm · web-flow · commit b5adfae95f25 · 2023-11-11T07:16:37.000-08:00
Implement random forest
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -3,6 +3,7 @@ on:
   push:
     branches: [ master ]
   pull_request:
+    branches: ['**']  # '*' would skip base branches whose name contains '/'
   workflow_dispatch:
 
 jobs:
diff --git a/lleaves/compiler/ast/nodes.py b/lleaves/compiler/ast/nodes.py
@@ -29,6 +29,8 @@ class Forest:
     objective_func: str
     objective_func_config: str
     raw_score: bool = False
+    # average output over trees instead of just accumulating
+    average_output: bool = False
 
     @property
     def n_args(self):
diff --git a/lleaves/compiler/ast/parser.py b/lleaves/compiler/ast/parser.py
@@ -101,6 +101,7 @@ def parse_to_ast(model_path):
     objective = scanned_model["general_info"]["objective"]
     objective_func = objective[0]
     objective_func_config = objective[1] if len(objective) > 1 else None
+    average_output = "average_output" in scanned_model["general_info"]
     features = [
         Feature(is_categorical_feature(x))
         for x in scanned_model["general_info"]["feature_infos"]
@@ -114,7 +115,14 @@ def parse_to_ast(model_path):
         )
     ]
     assert len(trees) % n_classes == 0, "Ill formed model file"
-    return Forest(trees, features, n_classes, objective_func, objective_func_config)
+    return Forest(
+        trees,
+        features,
+        n_classes,
+        objective_func,
+        objective_func_config,
+        average_output=average_output,
+    )
 
 
 def is_categorical_feature(feature_info: str):
diff --git a/lleaves/compiler/ast/scanner.py b/lleaves/compiler/ast/scanner.py
@@ -76,6 +76,7 @@ def __init__(self, type: type, is_list=False, null_ok=False):
     "version": ScannedValue(str),
     "feature_infos": ScannedValue(str, True),
     "objective": ScannedValue(str, True),
+    "average_output": ScannedValue(bool, null_ok=True),
 }
 TREE_SCAN_KEYS = {
     "Tree": ScannedValue(int),
@@ -106,7 +107,13 @@ def _scan_block(lines: list, items_to_scan: dict):
         if line == "tree":
             continue
 
-        scanned_key, scanned_value = line.split("=")
+        line_split = line.split("=")
+        if len(line_split) == 2:
+            scanned_key, scanned_value = line.split("=")
+        else:
+            assert len(line_split) == 1, f"Unexpected line {line}"
+            scanned_key, scanned_value = line_split[0], True
+
         target_type = items_to_scan.get(scanned_key)
         if target_type is None:
             continue
diff --git a/lleaves/compiler/codegen/codegen.py b/lleaves/compiler/codegen/codegen.py
@@ -241,6 +241,8 @@ def _populate_instruction_block(
             forest.objective_func,
             forest.objective_func_config,
             forest.raw_score,
+            forest.average_output,
+            len(forest.trees),
         )
     for result, result_ptr in zip(results, results_ptr):
         builder.store(result, result_ptr)
@@ -279,7 +281,13 @@ def _populate_forest_func(forest, root_func, tree_funcs, fblocksize):
 
 
 def _populate_objective_func_block(
-    builder, args, objective: str, objective_config: str, raw_score: bool
+    builder,
+    args,
+    objective: str,
+    objective_config: str,
+    raw_score: bool,
+    average_output: bool,
+    num_trees: int,
 ):
     """
     Takes the objective function specification and generates the code for it into the builder
@@ -290,6 +298,9 @@ def _populate_objective_func_block(
         "llvm.copysign", (DOUBLE, DOUBLE), ir.FunctionType(DOUBLE, (DOUBLE, DOUBLE))
     )
 
+    if average_output:
+        args[0] = builder.fdiv(args[0], dconst(num_trees))
+
     def _populate_sigmoid(alpha):
         if alpha <= 0:
             raise ValueError(f"Sigmoid parameter needs to be >0, is {alpha}")
diff --git a/tests/test_tree_output.py b/tests/test_tree_output.py
@@ -3,7 +3,7 @@
 import pytest
 from hypothesis import given, settings
 from hypothesis import strategies as st
-from sklearn.datasets import make_classification
+from sklearn.datasets import make_blobs, make_classification, make_regression
 
 import lleaves
 
@@ -156,3 +156,65 @@ def test_multiclass_generated(tmpdir):
         lgbm.predict(X, n_jobs=2), llvm.predict(X, n_jobs=2), decimal=10
     )
     assert lgbm.num_model_per_iteration() == llvm.num_model_per_iteration()
+
+
+def test_random_forest_classifier(tmpdir):
+    """Check that a compiled rf classifier predicts the same as LightGBM."""
+    centers = [[-4, -4], [4, 4]]
+    X, y = make_blobs(n_samples=100, centers=centers, random_state=42)
+
+    # rf = random forest (outputs are averaged over all trees)
+    params = {
+        "boosting_type": "rf",
+        "n_estimators": 7,
+        "bagging_freq": 1,
+        "bagging_fraction": 0.8,
+    }
+    clf = lightgbm.LGBMClassifier(**params).fit(X, y)
+    model_file = str(tmpdir / "model.txt")
+    clf.booster_.save_model(model_file)
+
+    lgbm = lightgbm.Booster(model_file=model_file)
+    llvm = lleaves.Model(model_file=model_file)
+    llvm.compile()
+
+    # check predictions equal on the whole dataset
+    np.testing.assert_almost_equal(
+        lgbm.predict(X, n_jobs=2), llvm.predict(X, n_jobs=2), decimal=10
+    )
+    assert lgbm.num_model_per_iteration() == llvm.num_model_per_iteration()
+
+
+@pytest.mark.parametrize("num_trees", [34, 35])
+def test_random_forest_regressor(tmpdir, num_trees):
+    """Check that a compiled rf regressor predicts the same as LightGBM."""
+    n_samples = 1000
+    # fixed seed so that a failing run can be reproduced
+    X, y = make_regression(
+        n_samples=n_samples, n_features=5, noise=10.0, random_state=42
+    )
+
+    params = {
+        "objective": "regression",
+        "n_jobs": 1,
+        "boosting_type": "rf",
+        "subsample_freq": 1,
+        "subsample": 0.9,
+        "colsample_bytree": 0.9,
+        "num_leaves": 25,
+        "n_estimators": num_trees,
+        "min_child_samples": 100,
+        "verbose": 0,
+    }
+
+    model = lightgbm.LGBMRegressor(**params).fit(X, y)
+    model_file = str(tmpdir / "model.txt")
+    model.booster_.save_model(model_file)
+
+    lgbm = lightgbm.Booster(model_file=model_file)
+    llvm = lleaves.Model(model_file=model_file)
+    llvm.compile()
+
+    np.testing.assert_almost_equal(
+        lgbm.predict(X, n_jobs=2), llvm.predict(X, n_jobs=2), decimal=10
+    )