use sklearn for ml algorithms
We are dropping the dependencies on `rlr`, `fastcluster`, and `dedupe-hcluster`.

This PR also applies Black formatting to some other code.
fgregg authored Jun 2, 2022
1 parent 1d01808 commit 7e24af2
Showing 7 changed files with 68 additions and 45 deletions.
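For orientation, a minimal sketch (not part of the commit itself) of the import-level swap: the clustering and classification routines now come from SciPy and scikit-learn rather than the dropped packages.

```python
# Equivalents used throughout this commit; SciPy and scikit-learn stand in for
# the dropped rlr, fastcluster, and dedupe-hcluster packages.
import scipy.cluster.hierarchy   # takes over from fastcluster.linkage / hcluster.fcluster
import sklearn.linear_model      # LogisticRegression takes over from rlr.RegularizedLogisticRegression
import sklearn.model_selection   # GridSearchCV cross-validates the regularization strength C
```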
33 changes: 18 additions & 15 deletions .github/workflows/pythonpackage.yml
@@ -8,28 +8,31 @@ jobs:
# so run this before anything else.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: "3.10"
- name: Install Black
run: pip install black
- name: Run black --check .
run: black --check .
- uses: actions/checkout@v2
with: # https://github.com/stefanzweifel/git-auto-commit-action#checkout-the-correct-branch
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- run: pip install black
- run: black --check .
- name: If needed, commit black changes to the pull request
if: failure()
run: |
black .
git config --global user.name 'autoblack'
git config --global user.email 'cclauss@users.noreply.github.com'
printenv | grep GITHUB
git config --global user.name 'fgregg'
git config --global user.email 'fgregg@users.noreply.github.com'
git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY
git checkout $GITHUB_HEAD_REF
git commit -am "fixup: Format Python code with Black"
git remote -v
git branch
git status
black .
git status
echo ready to commit
git commit -am "fixup! Format Python code with psf/black pull_request"
echo ready to push
git push
test:
needs: format
timeout-minutes: 30
timeout-minutes: 40
runs-on: ${{ matrix.os }}
strategy:
matrix:
24 changes: 22 additions & 2 deletions dedupe/api.py
@@ -15,7 +15,8 @@
import tempfile

import numpy
import rlr
import sklearn.linear_model
import sklearn.model_selection

import dedupe.core as core
import dedupe.serializer as serializer
@@ -1016,6 +1017,19 @@ def __init__(
"the current version of dedupe. This can happen "
"if you have recently upgraded dedupe."
)
except ModuleNotFoundError as exc:
if "No module named 'rlr'" in str(exc):
raise SettingsFileLoadingException(
"This settings file was created with a previous "
"version of dedupe that used the 'rlr' library. "
"To continue to use this settings file, you need "
"install that library: `pip install rlr`"
)
else:
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
"Try deleting the file"
) from exc
except: # noqa: E722
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
@@ -1034,7 +1048,13 @@ class ActiveMatching(Matching):
Class for training a matcher.
"""

classifier = rlr.RegularizedLogisticRegression()
classifier = sklearn.model_selection.GridSearchCV(
estimator=sklearn.linear_model.LogisticRegression(),
param_grid={"C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
scoring="f1",
verbose=3,
n_jobs=-1,
)

def __init__(
self,
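A minimal sketch (made-up data, not from the commit) of how the new default classifier defined above behaves: GridSearchCV cross-validates the inverse regularization strength C of a logistic regression with f1 scoring, roughly standing in for the regularization tuning that `rlr` handled internally.

```python
import numpy
import sklearn.linear_model
import sklearn.model_selection

classifier = sklearn.model_selection.GridSearchCV(
    estimator=sklearn.linear_model.LogisticRegression(),
    param_grid={"C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
    scoring="f1",
    n_jobs=-1,
)

# Hypothetical 1-D distance features for labeled pairs: 1 = duplicate, 0 = distinct.
# Ten samples, five per class, so the default 5-fold stratified CV can run.
X = numpy.array([[0.05], [0.10], [0.15], [0.20], [0.25],
                 [0.75], [0.80], [0.85], [0.90], [0.95]])
y = numpy.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

classifier.fit(X, y)                        # grid search over C, scored by f1
scores = classifier.predict_proba(X)[:, 1]  # probability of the duplicate class
print(classifier.best_params_, scores)
```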
9 changes: 4 additions & 5 deletions dedupe/clustering.py
@@ -8,8 +8,7 @@
import tempfile

import numpy
import fastcluster
import hcluster
import scipy.cluster.hierarchy

from typing import Iterable, Dict, cast, List, Set, Generator, Sequence, Tuple
from dedupe._typing import Clusters, RecordID, Links
@@ -238,11 +237,11 @@ def cluster(

i_to_id, condensed_distances, N = condensedDistance(sub_graph)

linkage = fastcluster.linkage(
condensed_distances, method="centroid", preserve_input=True
linkage = scipy.cluster.hierarchy.linkage(
condensed_distances, method="centroid"
)

partition = hcluster.fcluster(
partition = scipy.cluster.hierarchy.fcluster(
linkage, distance_threshold, criterion="distance"
)

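A small sketch (hypothetical distances, not from the commit) of the replacement calls in clustering.py. Note that `scipy.cluster.hierarchy.linkage` has no `preserve_input` flag, which is why that fastcluster-specific keyword disappears from the new call.

```python
import numpy
import scipy.cluster.hierarchy

# Condensed (upper-triangle) distance matrix for 4 hypothetical records:
# distances for pairs (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
condensed_distances = numpy.array([0.1, 0.7, 0.8, 0.65, 0.75, 0.2])

# Build the hierarchical clustering, then cut the dendrogram at a threshold.
linkage = scipy.cluster.hierarchy.linkage(condensed_distances, method="centroid")
partition = scipy.cluster.hierarchy.fcluster(linkage, 0.5, criterion="distance")
print(partition)  # e.g. [1, 1, 2, 2]: records 0/1 and 2/3 cluster together
```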
10 changes: 5 additions & 5 deletions dedupe/labeler.py
@@ -3,9 +3,9 @@
import logging

import numpy
import rlr
from typing import List
from typing_extensions import Protocol
import sklearn.linear_model

import dedupe.core as core
import dedupe.training as training
@@ -38,9 +38,9 @@ class HasDataModel(Protocol):
data_model: datamodel.DataModel


class RLRLearner(ActiveLearner, rlr.RegularizedLogisticRegression):
class RLRLearner(sklearn.linear_model.LogisticRegression, ActiveLearner):
def __init__(self, data_model):
super().__init__(alpha=1)
super().__init__()
self.data_model = data_model
self._candidates: List[TrainingExample]

@@ -66,7 +66,7 @@ def fit(self, X, y):
self.y = numpy.array(y)
self.X = X

super().fit(self.X, self.y, cv=False)
super().fit(self.X, self.y)

def fit_transform(self, pairs, y):
self.fit(self.transform(pairs), y)
@@ -118,7 +118,7 @@ def _bias(self):
return weighted_bias

def candidate_scores(self):
return self.predict_proba(self.distances)
return self.predict_proba(self.distances)[:, 1].reshape(-1, 1)

def __len__(self):
return len(self.candidates)
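A brief sketch (made-up data) of why `candidate_scores` now slices the result: scikit-learn's `predict_proba` returns one probability column per class, so only the positive-class column is kept and reshaped into a column vector.

```python
import numpy
import sklearn.linear_model

# Illustrative distances and labels for a handful of pairs.
X = numpy.array([[0.1], [0.2], [0.8], [0.9]])
y = numpy.array([1, 1, 0, 0])

learner = sklearn.linear_model.LogisticRegression().fit(X, y)

proba = learner.predict_proba(X)       # shape (4, 2): one column per class
scores = proba[:, 1].reshape(-1, 1)    # keep only the positive class, as (4, 1)
print(proba.shape, scores.shape)       # (4, 2) (4, 1)
```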
4 changes: 1 addition & 3 deletions setup.py
@@ -11,12 +11,10 @@
from Cython.Build import cythonize

install_requires = [
"fastcluster",
"dedupe-hcluster",
"scikit-learn",
"affinegap>=1.3",
"categorical-distance>=1.9",
"dedupe-variable-datetime",
"rlr>=2.4.3",
"numpy>=1.13",
"doublemetaphone",
"highered>=0.2.0",
18 changes: 15 additions & 3 deletions tests/test_core.py
@@ -2,10 +2,21 @@
import random

import numpy
import scipy.special

import dedupe


class MockClassifier:
def __init__(self):

self.weight = 0
self.bias = 0

def predict_proba(self, examples):
return scipy.special.expit(examples * self.weight + self.bias)


class ScoreDuplicates(unittest.TestCase):
def setUp(self):
random.seed(123)
@@ -39,8 +50,9 @@ def setUp(self):

deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
self.data_model = deduper.data_model
self.classifier = deduper.classifier
self.classifier.weights = [-1.0302742719650269]
self.classifier = MockClassifier()

self.classifier.weight = -1.0302742719650269
self.classifier.bias = 4.76

score_dtype = [("pairs", "<U192", 2), ("score", "f4")]
@@ -68,7 +80,7 @@ def test_score_duplicates(self):
)

def test_score_duplicates_with_zeros(self):
self.classifier.weights = [-1000]
self.classifier.weight = -1000
self.classifier.bias = 1000
self.records = iter(
[
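The new `MockClassifier` simply pushes a linear score through the logistic function via `scipy.special.expit`. A tiny illustration of what that computes, using the test's weight and bias with a few assumed distance values:

```python
import numpy
import scipy.special

weight = -1.0302742719650269
bias = 4.76

# A few hypothetical string-distance values for record pairs.
examples = numpy.array([0.0, 2.5, 5.0])

# expit(x) = 1 / (1 + exp(-x)); smaller distances map to higher match scores.
scores = scipy.special.expit(examples * weight + bias)
print(scores)  # roughly [0.99, 0.90, 0.40], decreasing as distance grows
```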
15 changes: 3 additions & 12 deletions tests/test_labeler.py
@@ -23,23 +23,14 @@ def test_AL(self):
active_learner = dedupe.labeler.RLRLearner(self.data_model)
active_learner.candidates = SAMPLE
assert len(active_learner) == original_N
pair = active_learner.pop()
print(pair)
assert pair == (
{"name": "Willy", "age": "35"},
{"name": "William", "age": "35"},
)

active_learner.pop()
assert len(active_learner) == original_N - 1

pair = active_learner.pop()
print(pair)
assert pair == ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"})
active_learner.pop()
assert len(active_learner) == original_N - 2

pair = active_learner.pop()
assert pair == ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"})

active_learner.pop()
assert len(active_learner) == original_N - 3

active_learner.pop()
