
Commit

modernize
fgregg committed Aug 10, 2024
1 parent 46de586 commit 48916bf
Showing 12 changed files with 14 additions and 24 deletions.
2 changes: 1 addition & 1 deletion csv_example/csv_evaluation.py
@@ -28,7 +28,7 @@ def dupePairs(filename, rowname):
     if "x" in dupe_d:
         del dupe_d["x"]
 
-    dupe_s = set([])
+    dupe_s = set()
     for unique_id, cluster in dupe_d.items():
         if len(cluster) > 1:
             for pair in itertools.combinations(cluster, 2):
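
The hunk above swaps set([]) for a bare set() call: the results are identical, but set() avoids building a throwaway list only to discard it. A minimal sketch of the pattern, with made-up values:

    dupe_s = set()  # idiomatic empty set; set([]) constructs a list first
    dupe_s.add(frozenset(("a", "b")))
    assert frozenset(("b", "a")) in dupe_s  # frozensets compare order-independently
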
3 changes: 1 addition & 2 deletions csv_example/csv_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use dedupe with a comma separated values
 (CSV) file. All operations are performed in memory, so will run very
@@ -79,7 +78,7 @@ def readData(filename):
         log_level = logging.INFO
     elif opts.verbose >= 2:
         log_level = logging.DEBUG
-logging.getLogger().setLevel(log_level)
+logging.basicConfig(level=log_level)
 
 # ## Setup
 
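
Two modernizations in this file: the coding cookie is dropped because UTF-8 has been the default source encoding since Python 3.0 (PEP 3120), and logging.getLogger().setLevel(log_level) becomes logging.basicConfig(level=log_level). The second change is more than spelling: setLevel alone raises or lowers the threshold but attaches no handler, while basicConfig installs a stream handler on the root logger and sets its level in one call. A minimal sketch:

    import logging

    # basicConfig attaches a StreamHandler to the root logger and sets the level;
    # a bare setLevel would change the threshold but configure no handler.
    logging.basicConfig(level=logging.DEBUG)
    logging.debug("now visible on stderr")
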
3 changes: 1 addition & 2 deletions extended-variables/officers.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use some extended dedupe variables
 """
@@ -169,7 +168,7 @@ def readData(filename):
 # 'Cluster ID' which indicates which records refer to each other.
 
 cluster_membership = {}
-for (cluster_id, cluster) in enumerate(clustered_dupes):
+for cluster_id, cluster in enumerate(clustered_dupes):
     id_set, scores = cluster
     for record_id, score in zip(id_set, scores):
         cluster_membership[record_id] = {
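
The parentheses around the loop targets were always optional; for cluster_id, cluster in ... unpacks exactly the same way. For illustration only:

    pairs = [(0, "alpha"), (1, "beta")]
    for idx, name in pairs:  # same unpacking as: for (idx, name) in pairs:
        print(idx, name)
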
7 changes: 3 additions & 4 deletions gazetteer_example/gazetteer_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates the Gazetteer.
@@ -49,7 +48,7 @@ def readData(filename):
     with open(filename) as f:
         reader = csv.DictReader(f)
         for i, row in enumerate(reader):
-            clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
+            clean_row = {k: preProcess(v) for (k, v) in row.items()}
             if clean_row["price"]:
                 clean_row["price"] = float(clean_row["price"][1:])
             data_d[filename + str(i)] = dict(clean_row)
@@ -92,10 +91,10 @@ def readData(filename):
 
 print("importing data ...")
 messy = readData(messy_file)
-print("N data 1 records: {}".format(len(messy)))
+print(f"N data 1 records: {len(messy)}")
 
 canonical = readData(canon_file)
-print("N data 2 records: {}".format(len(canonical)))
+print(f"N data 2 records: {len(canonical)}")
 
 def descriptions():
     for dataset in (messy, canonical):
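
This file gets two idiom upgrades: dict([(k, v) ...]) becomes a dict comprehension, and "...{}".format(...) becomes an f-string; both are behavior-preserving. A small sketch with invented data, where strip/lower stands in for the example's own preProcess helper:

    row = {"title": "  iPod ", "price": "$9.99"}
    clean_row = {k: v.strip().lower() for k, v in row.items()}  # no intermediate list of tuples
    print(f"N fields: {len(clean_row)}")  # instead of "N fields: {}".format(len(clean_row))
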
3 changes: 1 addition & 2 deletions gazetteer_example/gazetteer_postgres_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates Gazetteer matching backed by a Postgres database.
@@ -160,7 +159,7 @@ def read_data_for_postgres(filename):
         writer.writeheader()
 
         for idx, row in enumerate(reader):
-            clean_row = dict([(k, preProcess(v)) for k, v in row.items()])
+            clean_row = {k: preProcess(v) for k, v in row.items()}
             if clean_row["price"]:
                 clean_row["price"] = float(clean_row["price"][1:])
             if clean_row["unique_id"]:
1 change: 0 additions & 1 deletion mysql_example/mysql_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 
 """
 This is an example of working with very large data. There are about
1 change: 0 additions & 1 deletion mysql_example/mysql_init_db.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This is a setup script for mysql_example. It downloads a zip file of
 Illinois campaign contributions and loads them into a MySQL database
4 changes: 2 additions & 2 deletions patent_example/patent_evaluation.py
@@ -31,8 +31,8 @@ def dupePairs(filename, colname):
     if "x" in dupe_d:
         del dupe_d["x"]
 
-    dupe_s = set([])
-    for (unique_id, cluster) in dupe_d.items():
+    dupe_s = set()
+    for unique_id, cluster in dupe_d.items():
         if len(cluster) > 1:
             for pair in itertools.combinations(cluster, 2):
                 dupe_s.add(frozenset(pair))
3 changes: 1 addition & 2 deletions patent_example/patent_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use dedupe to disambiguate patent
 authors and demonstrates the Set and LatLong data types.
@@ -26,7 +25,7 @@ def readData(filename, set_delim="**"):
     with open(filename) as f:
         reader = csv.DictReader(f)
         for idx, row in enumerate(reader):
-            row = dict((k, v.lower()) for k, v in row.items())
+            row = {k: v.lower() for k, v in row.items()}
             if row["Lat"] == row["Lng"] == "0.0":
                 row["LatLong"] = None
             else:
3 changes: 1 addition & 2 deletions pgsql_big_dedupe_example/pgsql_big_dedupe_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 
 """
 This is an example of working with very large data. There are about
@@ -38,7 +37,7 @@
 register_adapter(numpy.float64, AsIs)
 
 
-class Readable(object):
+class Readable:
     def __init__(self, iterator):
 
         self.output = io.StringIO()
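
Under Python 3 every class is new-style, so class Readable: defines exactly the same type as class Readable(object):; the explicit object base is Python 2 residue. An illustrative check:

    class Readable:
        pass  # same MRO as class Readable(object): on Python 3

    assert Readable.__mro__ == (Readable, object)
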
5 changes: 2 additions & 3 deletions pgsql_big_dedupe_example/pgsql_big_dedupe_example_init_db.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 """
 This is a setup script for mysql_example. It downloads a zip file of
 Illinois campaign contributions and loads them into a MySQL database
@@ -51,7 +50,7 @@
 # Postgres COPY doesn't handle "ragged" files very well
 if not os.path.exists(contributions_csv_file):
     print("converting tab-delimited raw file to csv...")
-    with open(contributions_txt_file, "rU") as txt_file, open(
+    with open(contributions_txt_file) as txt_file, open(
         contributions_csv_file, "w"
     ) as csv_file:
         csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
@@ -109,7 +108,7 @@
 
 conn.commit()
 
-with open(contributions_csv_file, "rU") as csv_file:
+with open(contributions_csv_file) as csv_file:
     c.copy_expert(
         "COPY raw_table "
         "(reciept_id, last_name, first_name, "
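
The "rU" flag was Python 2's universal-newlines mode. Python 3 text mode already translates \r\n and \r to \n by default, and "U" was deprecated and then removed in Python 3.11, where it raises ValueError, so a bare open(path) is the modern spelling. A sketch with an illustrative path:

    # Text mode performs universal newline translation on its own in Python 3,
    # so open(path, "rU") is redundant (and an error from 3.11 onward).
    with open("contributions.txt") as txt_file:
        first_line = txt_file.readline()
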
3 changes: 1 addition & 2 deletions record_linkage_example/record_linkage_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use RecordLink with two comma separated
 values (CSV) files. We have listings of products from two different
@@ -49,7 +48,7 @@ def readData(filename):
     with open(filename) as f:
         reader = csv.DictReader(f)
         for i, row in enumerate(reader):
-            clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
+            clean_row = {k: preProcess(v) for (k, v) in row.items()}
             if clean_row["price"]:
                 clean_row["price"] = float(clean_row["price"][1:])
             data_d[filename + str(i)] = dict(clean_row)
