use sklearn for ml algorithms
We are dropping the dependencies on `rlr`, `fastcluster`, and `dedupe-hcluster`.

This PR also applies Black formatting to some other code.
fgregg authored Jun 2, 2022
1 parent 1d01808 commit 7e24af2
Showing 7 changed files with 68 additions and 45 deletions.
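For orientation, a minimal sketch (not part of the commit itself) of the import-level swap: the clustering and classification routines now come from SciPy and scikit-learn rather than the dropped packages.

```python
# Equivalents used throughout this commit; SciPy and scikit-learn stand in for
# the dropped rlr, fastcluster, and dedupe-hcluster packages.
import scipy.cluster.hierarchy   # takes over from fastcluster.linkage / hcluster.fcluster
import sklearn.linear_model      # LogisticRegression takes over from rlr.RegularizedLogisticRegression
import sklearn.model_selection   # GridSearchCV cross-validates the regularization strength C
```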
33 changes: 18 additions & 15 deletions .github/workflows/pythonpackage.yml
@@ -8,28 +8,31 @@ jobs:
# so run this before anything else.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: "3.10"
- name: Install Black
run: pip install black
- name: Run black --check .
run: black --check .
- uses: actions/checkout@v2
with: # https://github.com/stefanzweifel/git-auto-commit-action#checkout-the-correct-branch
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- run: pip install black
- run: black --check .
- name: If needed, commit black changes to the pull request
if: failure()
run: |
black .
git config --global user.name 'autoblack'
git config --global user.email 'cclauss@users.noreply.github.com'
printenv | grep GITHUB
git config --global user.name 'fgregg'
git config --global user.email 'fgregg@users.noreply.github.com'
git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY
git checkout $GITHUB_HEAD_REF
git commit -am "fixup: Format Python code with Black"
git remote -v
git branch
git status
black .
git status
echo ready to commit
git commit -am "fixup! Format Python code with psf/black pull_request"
echo ready to push
git push
test:
needs: format
timeout-minutes: 30
timeout-minutes: 40
runs-on: ${{ matrix.os }}
strategy:
matrix:
24 changes: 22 additions & 2 deletions dedupe/api.py
@@ -15,7 +15,8 @@
import tempfile

import numpy
import rlr
import sklearn.linear_model
import sklearn.model_selection

import dedupe.core as core
import dedupe.serializer as serializer
@@ -1016,6 +1017,19 @@ def __init__(
"the current version of dedupe. This can happen "
"if you have recently upgraded dedupe."
)
except ModuleNotFoundError as exc:
if "No module named 'rlr'" in str(exc):
raise SettingsFileLoadingException(
"This settings file was created with a previous "
"version of dedupe that used the 'rlr' library. "
"To continue to use this settings file, you need "
"install that library: `pip install rlr`"
)
else:
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
"Try deleting the file"
) from exc
except: # noqa: E722
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
@@ -1034,7 +1048,13 @@ class ActiveMatching(Matching):
Class for training a matcher.
"""

classifier = rlr.RegularizedLogisticRegression()
classifier = sklearn.model_selection.GridSearchCV(
estimator=sklearn.linear_model.LogisticRegression(),
param_grid={"C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
scoring="f1",
verbose=3,
n_jobs=-1,
)

def __init__(
self,
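A minimal sketch (made-up data, not from the commit) of how the new default classifier defined above behaves: GridSearchCV cross-validates the inverse regularization strength C of a logistic regression with f1 scoring, roughly standing in for the regularization tuning that `rlr` handled internally.

```python
import numpy
import sklearn.linear_model
import sklearn.model_selection

classifier = sklearn.model_selection.GridSearchCV(
    estimator=sklearn.linear_model.LogisticRegression(),
    param_grid={"C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
    scoring="f1",
    n_jobs=-1,
)

# Hypothetical 1-D distance features for labeled pairs: 1 = duplicate, 0 = distinct.
# Ten samples, five per class, so the default 5-fold stratified CV can run.
X = numpy.array([[0.05], [0.10], [0.15], [0.20], [0.25],
                 [0.75], [0.80], [0.85], [0.90], [0.95]])
y = numpy.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

classifier.fit(X, y)                        # grid search over C, scored by f1
scores = classifier.predict_proba(X)[:, 1]  # probability of the duplicate class
print(classifier.best_params_, scores)
```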
9 changes: 4 additions & 5 deletions dedupe/clustering.py
@@ -8,8 +8,7 @@
import tempfile

import numpy
import fastcluster
import hcluster
import scipy.cluster.hierarchy

from typing import Iterable, Dict, cast, List, Set, Generator, Sequence, Tuple
from dedupe._typing import Clusters, RecordID, Links
@@ -238,11 +237,11 @@ def cluster(

i_to_id, condensed_distances, N = condensedDistance(sub_graph)

linkage = fastcluster.linkage(
condensed_distances, method="centroid", preserve_input=True
linkage = scipy.cluster.hierarchy.linkage(
condensed_distances, method="centroid"
)

partition = hcluster.fcluster(
partition = scipy.cluster.hierarchy.fcluster(
linkage, distance_threshold, criterion="distance"
)

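A small sketch (hypothetical distances, not from the commit) of the replacement calls in clustering.py. Note that `scipy.cluster.hierarchy.linkage` has no `preserve_input` flag, which is why that fastcluster-specific keyword disappears from the new call.

```python
import numpy
import scipy.cluster.hierarchy

# Condensed (upper-triangle) distance matrix for 4 hypothetical records:
# distances for pairs (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
condensed_distances = numpy.array([0.1, 0.7, 0.8, 0.65, 0.75, 0.2])

# Build the hierarchical clustering, then cut the dendrogram at a threshold.
linkage = scipy.cluster.hierarchy.linkage(condensed_distances, method="centroid")
partition = scipy.cluster.hierarchy.fcluster(linkage, 0.5, criterion="distance")
print(partition)  # e.g. [1, 1, 2, 2]: records 0/1 and 2/3 cluster together
```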
10 changes: 5 additions & 5 deletions dedupe/labeler.py
@@ -3,9 +3,9 @@
import logging

import numpy
import rlr
from typing import List
from typing_extensions import Protocol
import sklearn.linear_model

import dedupe.core as core
import dedupe.training as training
@@ -38,9 +38,9 @@ class HasDataModel(Protocol):
data_model: datamodel.DataModel


class RLRLearner(ActiveLearner, rlr.RegularizedLogisticRegression):
class RLRLearner(sklearn.linear_model.LogisticRegression, ActiveLearner):
def __init__(self, data_model):
super().__init__(alpha=1)
super().__init__()
self.data_model = data_model
self._candidates: List[TrainingExample]

@@ -66,7 +66,7 @@ def fit(self, X, y):
self.y = numpy.array(y)
self.X = X

super().fit(self.X, self.y, cv=False)
super().fit(self.X, self.y)

def fit_transform(self, pairs, y):
self.fit(self.transform(pairs), y)
@@ -118,7 +118,7 @@ def _bias(self):
return weighted_bias

def candidate_scores(self):
return self.predict_proba(self.distances)
return self.predict_proba(self.distances)[:, 1].reshape(-1, 1)

def __len__(self):
return len(self.candidates)
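A brief sketch (made-up data) of why `candidate_scores` now slices the result: scikit-learn's `predict_proba` returns one probability column per class, so only the positive-class column is kept and reshaped into a column vector.

```python
import numpy
import sklearn.linear_model

# Illustrative distances and labels for a handful of pairs.
X = numpy.array([[0.1], [0.2], [0.8], [0.9]])
y = numpy.array([1, 1, 0, 0])

learner = sklearn.linear_model.LogisticRegression().fit(X, y)

proba = learner.predict_proba(X)       # shape (4, 2): one column per class
scores = proba[:, 1].reshape(-1, 1)    # keep only the positive class, as (4, 1)
print(proba.shape, scores.shape)       # (4, 2) (4, 1)
```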
4 changes: 1 addition & 3 deletions setup.py
@@ -11,12 +11,10 @@
from Cython.Build import cythonize

install_requires = [
"fastcluster",
"dedupe-hcluster",
"scikit-learn",
"affinegap>=1.3",
"categorical-distance>=1.9",
"dedupe-variable-datetime",
"rlr>=2.4.3",
"numpy>=1.13",
"doublemetaphone",
"highered>=0.2.0",
18 changes: 15 additions & 3 deletions tests/test_core.py
@@ -2,10 +2,21 @@
import random

import numpy
import scipy.special

import dedupe


class MockClassifier:
def __init__(self):

self.weight = 0
self.bias = 0

def predict_proba(self, examples):
return scipy.special.expit(examples * self.weight + self.bias)


class ScoreDuplicates(unittest.TestCase):
def setUp(self):
random.seed(123)
@@ -39,8 +50,9 @@ def setUp(self):

deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
self.data_model = deduper.data_model
self.classifier = deduper.classifier
self.classifier.weights = [-1.0302742719650269]
self.classifier = MockClassifier()

self.classifier.weight = -1.0302742719650269
self.classifier.bias = 4.76

score_dtype = [("pairs", "<U192", 2), ("score", "f4")]
@@ -68,7 +80,7 @@ def test_score_duplicates(self):
)

def test_score_duplicates_with_zeros(self):
self.classifier.weights = [-1000]
self.classifier.weight = -1000
self.classifier.bias = 1000
self.records = iter(
[
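The new `MockClassifier` simply pushes a linear score through the logistic function via `scipy.special.expit`. A tiny illustration of what that computes, using the test's weight and bias with a few assumed distance values:

```python
import numpy
import scipy.special

weight = -1.0302742719650269
bias = 4.76

# A few hypothetical string-distance values for record pairs.
examples = numpy.array([0.0, 2.5, 5.0])

# expit(x) = 1 / (1 + exp(-x)); smaller distances map to higher match scores.
scores = scipy.special.expit(examples * weight + bias)
print(scores)  # roughly [0.99, 0.90, 0.40], decreasing as distance grows
```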
15 changes: 3 additions & 12 deletions tests/test_labeler.py
@@ -23,23 +23,14 @@ def test_AL(self):
active_learner = dedupe.labeler.RLRLearner(self.data_model)
active_learner.candidates = SAMPLE
assert len(active_learner) == original_N
pair = active_learner.pop()
print(pair)
assert pair == (
{"name": "Willy", "age": "35"},
{"name": "William", "age": "35"},
)

active_learner.pop()
assert len(active_learner) == original_N - 1

pair = active_learner.pop()
print(pair)
assert pair == ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"})
active_learner.pop()
assert len(active_learner) == original_N - 2

pair = active_learner.pop()
assert pair == ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"})

active_learner.pop()
assert len(active_learner) == original_N - 3

active_learner.pop()
