fix and format by ruff

YoshitakaNaraoka committed Dec 27, 2023
1 parent 7233a75 commit 88f839e
Showing 31 changed files with 106 additions and 189 deletions.
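For context, this is the kind of combined autofix-and-format diff that ruff produces when run over the whole repository. The exact command, configuration, and ruff version behind this commit are not recorded on this page, so the following is only a minimal sketch of a plausible reproduction, assuming ruff is installed:

    # Hedged sketch of a possible reproduction; not taken from the commit itself.
    import subprocess

    subprocess.run(["ruff", "check", "--fix", "."], check=False)  # lint autofixes, e.g. `obj is not X` and lambda-to-def rewrites
    subprocess.run(["ruff", "format", "."], check=False)          # layout changes, e.g. collapsing short multi-line call signatures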
4 changes: 3 additions & 1 deletion build.py
@@ -1,6 +1,9 @@
from __future__ import annotations

import platform
from distutils.command.build_ext import build_ext
from distutils.errors import CCompilerError, DistutilsExecError, DistutilsPlatformError

import setuptools
from setuptools_rust import Binding, RustExtension

@@ -9,7 +12,6 @@
from numpy import get_include
except ImportError:
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy"])
from numpy import __version__ as numpy_version
from numpy import get_include

try:
16 changes: 11 additions & 5 deletions docs/parse/__main__.py
@@ -3,6 +3,8 @@
the __doc__ of each object and formats it so that MkDocs can process it in turn.
"""
from __future__ import annotations

import argparse
import collections
import doctest
@@ -360,14 +362,16 @@ def print_docstring(obj, file):
# Methods
if inspect.isclass(obj) and doc["Methods"]:
printf(h2("Methods"))
printf_indent = lambda x, **kwargs: printf(f" {x}", **kwargs)

def printf_indent(x, **kwargs):
return printf(f" {x}", **kwargs)

for meth in doc["Methods"]:
base_method_names = {"clone", "mutate"}

if (
issubclass(obj, river.base.Base)
and not obj is river.base.Base
and obj is not river.base.Base
and meth.name in base_method_names
):
continue
@@ -395,8 +399,8 @@ def print_docstring(obj, file):
}

if (
issubclass(obj, (collections.UserList, collections.UserDict))
and not obj is river.base.Ensemble
issubclass(obj, collections.UserList | collections.UserDict)
and obj is not river.base.Ensemble
and meth.name in container_method_names
):
continue
@@ -488,7 +492,9 @@ def print_module(mod, path, overview, depth=0, verbose=False):
print(md_line(mod.__doc__), file=overview)

# Extract all public classes and functions
ispublic = lambda x: x.__name__ in mod.__all__ and not x.__name__.startswith("_")
def ispublic(x):
return x.__name__ in mod.__all__ and not x.__name__.startswith("_")

classes = inspect.getmembers(mod, lambda x: inspect.isclass(x) and ispublic(x))
funcs = inspect.getmembers(mod, lambda x: inspect.isfunction(x) and ispublic(x))

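A side note on the issubclass change above: the collections.UserList | collections.UserDict form relies on PEP 604, so it is only valid at runtime on Python 3.10 or newer, where | between two classes builds a types.UnionType that issubclass accepts. A minimal illustration of that behaviour (my own example, not part of the commit):

    # Illustration only (assumes Python >= 3.10); not taken from the commit.
    import collections

    combined = collections.UserList | collections.UserDict  # a types.UnionType instance
    assert issubclass(collections.UserList, combined)       # UserList is one of the union members
    assert not issubclass(dict, combined)                    # dict is neither UserList nor UserDict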
4 changes: 1 addition & 3 deletions river/active/entropy.py
@@ -63,9 +63,7 @@ class EntropySampler(ActiveLearningClassifier):
"""

def __init__(
self, classifier: base.Classifier, discount_factor: float = 3, seed=None
):
def __init__(self, classifier: base.Classifier, discount_factor: float = 3, seed=None):
super().__init__(classifier, seed=seed)
self.discount_factor = discount_factor

7 changes: 2 additions & 5 deletions river/anomaly/filter.py
@@ -86,9 +86,7 @@ class ThresholdFilter(anomaly.base.AnomalyFilter):
"""

def __init__(
self, anomaly_detector, threshold: float, protect_anomaly_detector=True
):
def __init__(self, anomaly_detector, threshold: float, protect_anomaly_detector=True):
super().__init__(
anomaly_detector=anomaly_detector,
protect_anomaly_detector=protect_anomaly_detector,
@@ -188,7 +186,6 @@ def _unit_test_params(cls):
from river import preprocessing

yield {
"anomaly_detector": preprocessing.StandardScaler()
| anomaly.OneClassSVM(nu=0.2),
"anomaly_detector": preprocessing.StandardScaler() | anomaly.OneClassSVM(nu=0.2),
"q": 0.995,
}
4 changes: 1 addition & 3 deletions river/bandit/base.py
@@ -10,9 +10,7 @@
__all__ = ["ArmID", "Policy", "ContextualPolicy", "RewardObj"]

ArmID = typing.Union[int, str] # noqa: UP007
RewardObj = typing.Union[ # noqa: UP007
stats.base.Statistic, metrics.base.Metric, proba.base.Distribution
]
RewardObj = typing.Union[stats.base.Statistic, metrics.base.Metric, proba.base.Distribution] # noqa: UP007


class Policy(base.Base, abc.ABC):
8 changes: 2 additions & 6 deletions river/bandit/envs/candy_cane.py
@@ -58,12 +58,8 @@ def __init__(self, n_machines=100, reward_decay=0.03):
self.action_space = gym.spaces.Discrete(n_machines)
self.observation_space = gym.spaces.Dict(
{
"attempts": gym.spaces.Tuple(
[gym.spaces.Discrete(self.n_steps)] * n_machines
),
"successes": gym.spaces.Tuple(
[gym.spaces.Discrete(self.n_steps)] * n_machines
),
"attempts": gym.spaces.Tuple([gym.spaces.Discrete(self.n_steps)] * n_machines),
"successes": gym.spaces.Tuple([gym.spaces.Discrete(self.n_steps)] * n_machines),
}
)
self.reward_range = (0.0, 1.0)
7 changes: 2 additions & 5 deletions river/bandit/exp3.py
@@ -77,9 +77,7 @@ def __init__(
burn_in=0,
seed: int | None = None,
):
super().__init__(
reward_obj=reward_obj, reward_scaler=reward_scaler, burn_in=burn_in
)
super().__init__(reward_obj=reward_obj, reward_scaler=reward_scaler, burn_in=burn_in)
self.seed = seed
self.gamma = gamma
self._rng = random.Random(seed)
@@ -91,8 +89,7 @@
def _pull(self, arm_ids):
total = sum(self._weights[arm_id] for arm_id in arm_ids)
self._probabilities = {
arm_id: (1 - self.gamma) * (self._weights[arm_id] / total)
+ self.gamma / len(arm_ids)
arm_id: (1 - self.gamma) * (self._weights[arm_id] / total) + self.gamma / len(arm_ids)
for arm_id in arm_ids
}
return self._rng.choices(arm_ids, weights=self._probabilities.values())[0]
4 changes: 1 addition & 3 deletions river/bandit/ucb.py
@@ -77,9 +77,7 @@ def __init__(
burn_in=0,
seed: int = None,
):
super().__init__(
reward_obj=reward_obj, reward_scaler=reward_scaler, burn_in=burn_in
)
super().__init__(reward_obj=reward_obj, reward_scaler=reward_scaler, burn_in=burn_in)
self.delta = delta
self.seed = seed
self._rng = random.Random(seed)
4 changes: 1 addition & 3 deletions river/cluster/clustream.py
@@ -349,7 +349,5 @@ def inverse_error(x):

def __iadd__(self, other: CluStreamMicroCluster):
self.var_time += other.var_time
self.var_x = {
k: self.var_x[k] + other.var_x.get(k, stats.Var()) for k in self.var_x
}
self.var_x = {k: self.var_x[k] + other.var_x.get(k, stats.Var()) for k in self.var_x}
return self
32 changes: 16 additions & 16 deletions river/cluster/test_dbstream.py
@@ -134,8 +134,7 @@ def test_density_graph_with_three_micro_clusters():


def test_density_graph_with_removed_microcluster():
dbstream = build_dbstream(fading_factor=0.1,
intersection_factor=0.3)
dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3)

add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
@@ -162,21 +161,17 @@ def test_density_graph_with_removed_microcluster():

dbstream._recluster()
assert len(dbstream.clusters) == 1
assert_micro_cluster_properties(
dbstream.clusters[0], center={1: 2.560647, 2: 2.560647}
)
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.560647, 2: 2.560647})


def test_dbstream_synthetic_sklearn():
centers = [(-10, -10), (-5, -5), (0, 0), (5, 5), (10, 10)]
cluster_std = [0.6] * 5

# Create a dataset with 15000 data points with 5 centers and cluster SD of 0.6 each
X, y = make_blobs(n_samples=15_000,
cluster_std=cluster_std,
centers=centers,
n_features=2,
random_state=42)
X, y = make_blobs(
n_samples=15_000, cluster_std=cluster_std, centers=centers, n_features=2, random_state=42
)

dbstream = DBSTREAM(
clustering_threshold=2,
@@ -202,11 +197,16 @@ def test_dbstream_synthetic_sklearn():
dbstream._recluster()

# Check that the resulted cluster centers are close to the expected centers
dbstream_expected_centers = {0: {0: 10, 1: 10},
1: {0: -5, 1: -5},
2: {0: 0, 1: 0},
3: {0: 5, 1: 5},
4: {0: -10, 1: -10}}
dbstream_expected_centers = {
0: {0: 10, 1: 10},
1: {0: -5, 1: -5},
2: {0: 0, 1: 0},
3: {0: 5, 1: 5},
4: {0: -10, 1: -10},
}

for i in dbstream.centers.keys():
assert utils.math.minkowski_distance(dbstream.centers[i], dbstream_expected_centers[i], 2) < 0.2
assert (
utils.math.minkowski_distance(dbstream.centers[i], dbstream_expected_centers[i], 2)
< 0.2
)
43 changes: 10 additions & 33 deletions river/cluster/textclust.py
@@ -260,9 +260,7 @@ def _get_closest_mc(self, mc, idf, distance):
if counter > 1:
## our threshold
mu = (sumdist - min_dist) / (counter - 1)
threshold = mu - self.sigma * math.sqrt(
squaresum / (counter - 1) - mu**2
)
threshold = mu - self.sigma * math.sqrt(squaresum / (counter - 1) - mu**2)

if min_dist < threshold:
clusterId = smallest_key
@@ -288,9 +286,7 @@ def _calculateIDF(self, micro_clusters):
# update weights according to the fading factor
def _updateweights(self):
for micro in self.micro_clusters.values():
micro.fade(
self.t, self.omega, self.fading_factor, self.term_fading, self.realtime
)
micro.fade(self.t, self.omega, self.fading_factor, self.term_fading, self.realtime)

# delete micro clusters with a weight smaller omega
for key in list(self.micro_clusters.keys()):
@@ -373,9 +369,7 @@ def _get_distance_matrix(self, clusters):
ids = list(clusters.keys())

# initialize all distances to 0
distances = pd.DataFrame(
np.zeros((numClusters, numClusters)), columns=ids, index=ids
)
distances = pd.DataFrame(np.zeros((numClusters, numClusters)), columns=ids, index=ids)

for idx, row in enumerate(ids):
for col in ids[idx + 1 :]:
@@ -455,10 +449,7 @@ def get_macroclusters(self):
numClusters = min([self.num_macro, len(self.micro_clusters)])

# create empty clusters
macros = {
x: self.microcluster({}, self.t, 0, self.realtime, x)
for x in range(numClusters)
}
macros = {x: self.microcluster({}, self.t, 0, self.realtime, x) for x in range(numClusters)}

# merge micro clusters to macro clusters
for key, value in self.microToMacro.items():
@@ -478,9 +469,7 @@ def get_macroclusters(self):
def showclusters(self, topn, num, type="micro"):
# first clusters are sorted according to their respective weights
if type == "micro":
sortedmicro = sorted(
self.micro_clusters.values(), key=lambda x: x.weight, reverse=True
)
sortedmicro = sorted(self.micro_clusters.values(), key=lambda x: x.weight, reverse=True)
else:
sortedmicro = sorted(
self.get_macroclusters().values(), key=lambda x: x.weight, reverse=True
@@ -510,10 +499,7 @@ def showclusters(self, topn, num, type="micro"):
]
for rep in representatives:
print(
"weight: "
+ str(round(rep[1], 2))
+ "\t token: "
+ str(rep[0]).expandtabs(10)
"weight: " + str(round(rep[1], 2)) + "\t token: " + str(rep[0]).expandtabs(10)
)
print("-------------------------------------------")

@@ -539,9 +525,7 @@ def get_assignment(self, x, type):
# identify the closest micro cluster using the predefined distance measure
for key in self.micro_clusters.keys():
if self.micro_clusters[key].weight > self.min_weight:
cur_dist = self.micro_distance.dist(
mc, self.micro_clusters[key], idf
)
cur_dist = self.micro_distance.dist(mc, self.micro_clusters[key], idf)
if cur_dist < dist:
dist = cur_dist
closest = key
@@ -616,9 +600,7 @@ def __init__(self, type):

## generic method that is called for each distance
def dist(self, m1, m2, idf):
return getattr(self, self.type, lambda: "Invalid distance measure")(
m1, m2, idf
)
return getattr(self, self.type, lambda: "Invalid distance measure")(m1, m2, idf)

##calculate cosine similarity directly and fast
def tfidf_cosine_distance(self, mc, microcluster, idf):
@@ -628,17 +610,12 @@ def tfidf_cosine_distance(self, mc, microcluster, idf):
for k in list(mc.tf.keys()):
if k in idf:
if k in microcluster.tf:
sum += (mc.tf[k]["tf"] * idf[k]) * (
microcluster.tf[k]["tf"] * idf[k]
)
sum += (mc.tf[k]["tf"] * idf[k]) * (microcluster.tf[k]["tf"] * idf[k])
tfidflen += mc.tf[k]["tf"] * idf[k] * mc.tf[k]["tf"] * idf[k]
tfidflen = math.sqrt(tfidflen)
for k in list(microcluster.tf.keys()):
microtfidflen += (
microcluster.tf[k]["tf"]
* idf[k]
* microcluster.tf[k]["tf"]
* idf[k]
microcluster.tf[k]["tf"] * idf[k] * microcluster.tf[k]["tf"] * idf[k]
)
microtfidflen = math.sqrt(microtfidflen)
if tfidflen == 0 or microtfidflen == 0:
10 changes: 2 additions & 8 deletions river/conf/jackknife.py
@@ -91,9 +91,7 @@ def __init__(

alpha = (1 - confidence_level) / 2
self._lower = (
stats.RollingQuantile(alpha, window_size)
if window_size
else stats.Quantile(alpha)
stats.RollingQuantile(alpha, window_size) if window_size else stats.Quantile(alpha)
)
self._upper = (
stats.RollingQuantile(1 - alpha, window_size)
@@ -109,11 +107,7 @@ def _wrapped_model(self):
def _unit_test_params(cls):
from river import linear_model, preprocessing

yield {
"regressor": (
preprocessing.StandardScaler() | linear_model.LinearRegression()
)
}
yield {"regressor": (preprocessing.StandardScaler() | linear_model.LinearRegression())}

def learn_one(self, x, y, **kwargs):
# Update the quantiles