
Commit

modernize
fgregg committed Aug 10, 2024
1 parent 46de586 commit 48916bf
Showing 12 changed files with 14 additions and 24 deletions.
2 changes: 1 addition & 1 deletion csv_example/csv_evaluation.py
@@ -28,7 +28,7 @@ def dupePairs(filename, rowname):
     if "x" in dupe_d:
         del dupe_d["x"]
 
-    dupe_s = set([])
+    dupe_s = set()
     for unique_id, cluster in dupe_d.items():
         if len(cluster) > 1:
             for pair in itertools.combinations(cluster, 2):
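
The hunk above swaps set([]) for a bare set() call: the results are identical, but set() avoids building a throwaway list only to discard it. A minimal sketch of the pattern, with made-up values:

    dupe_s = set()  # idiomatic empty set; set([]) constructs a list first
    dupe_s.add(frozenset(("a", "b")))
    assert frozenset(("b", "a")) in dupe_s  # frozensets compare order-independently
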
3 changes: 1 addition & 2 deletions csv_example/csv_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use dedupe with a comma separated values
 (CSV) file. All operations are performed in memory, so will run very
@@ -79,7 +78,7 @@ def readData(filename):
         log_level = logging.INFO
     elif opts.verbose >= 2:
         log_level = logging.DEBUG
-logging.getLogger().setLevel(log_level)
+logging.basicConfig(level=log_level)
 
 # ## Setup
 
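
Two modernizations in this file: the coding cookie is dropped because UTF-8 has been the default source encoding since Python 3.0 (PEP 3120), and logging.getLogger().setLevel(log_level) becomes logging.basicConfig(level=log_level). The second change is more than spelling: setLevel alone raises or lowers the threshold but attaches no handler, while basicConfig installs a stream handler on the root logger and sets its level in one call. A minimal sketch:

    import logging

    # basicConfig attaches a StreamHandler to the root logger and sets the level;
    # a bare setLevel would change the threshold but configure no handler.
    logging.basicConfig(level=logging.DEBUG)
    logging.debug("now visible on stderr")
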
3 changes: 1 addition & 2 deletions extended-variables/officers.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use some extended dedupe variables
 """
@@ -169,7 +168,7 @@ def readData(filename):
 # 'Cluster ID' which indicates which records refer to each other.
 
 cluster_membership = {}
-for (cluster_id, cluster) in enumerate(clustered_dupes):
+for cluster_id, cluster in enumerate(clustered_dupes):
     id_set, scores = cluster
     for record_id, score in zip(id_set, scores):
         cluster_membership[record_id] = {
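
The parentheses around the loop targets were always optional; for cluster_id, cluster in ... unpacks exactly the same way. For illustration only:

    pairs = [(0, "alpha"), (1, "beta")]
    for idx, name in pairs:  # same unpacking as: for (idx, name) in pairs:
        print(idx, name)
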
7 changes: 3 additions & 4 deletions gazetteer_example/gazetteer_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates the Gazetteer.
@@ -49,7 +48,7 @@ def readData(filename):
     with open(filename) as f:
         reader = csv.DictReader(f)
         for i, row in enumerate(reader):
-            clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
+            clean_row = {k: preProcess(v) for (k, v) in row.items()}
             if clean_row["price"]:
                 clean_row["price"] = float(clean_row["price"][1:])
             data_d[filename + str(i)] = dict(clean_row)
@@ -92,10 +91,10 @@ def readData(filename):
 
 print("importing data ...")
 messy = readData(messy_file)
-print("N data 1 records: {}".format(len(messy)))
+print(f"N data 1 records: {len(messy)}")
 
 canonical = readData(canon_file)
-print("N data 2 records: {}".format(len(canonical)))
+print(f"N data 2 records: {len(canonical)}")
 
 def descriptions():
     for dataset in (messy, canonical):
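
This file gets two idiom upgrades: dict([(k, v) ...]) becomes a dict comprehension, and "...{}".format(...) becomes an f-string; both are behavior-preserving. A small sketch with invented data, where strip/lower stands in for the example's own preProcess helper:

    row = {"title": "  iPod ", "price": "$9.99"}
    clean_row = {k: v.strip().lower() for k, v in row.items()}  # no intermediate list of tuples
    print(f"N fields: {len(clean_row)}")  # instead of "N fields: {}".format(len(clean_row))
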
3 changes: 1 addition & 2 deletions gazetteer_example/gazetteer_postgres_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates Gazetteer matching backed by a Postgres database.
@@ -160,7 +159,7 @@ def read_data_for_postgres(filename):
         writer.writeheader()
 
         for idx, row in enumerate(reader):
-            clean_row = dict([(k, preProcess(v)) for k, v in row.items()])
+            clean_row = {k: preProcess(v) for k, v in row.items()}
             if clean_row["price"]:
                 clean_row["price"] = float(clean_row["price"][1:])
             if clean_row["unique_id"]:
1 change: 0 additions & 1 deletion mysql_example/mysql_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 
 """
 This is an example of working with very large data. There are about
1 change: 0 additions & 1 deletion mysql_example/mysql_init_db.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This is a setup script for mysql_example. It downloads a zip file of
 Illinois campaign contributions and loads them into a MySQL database
4 changes: 2 additions & 2 deletions patent_example/patent_evaluation.py
@@ -31,8 +31,8 @@ def dupePairs(filename, colname):
     if "x" in dupe_d:
         del dupe_d["x"]
 
-    dupe_s = set([])
-    for (unique_id, cluster) in dupe_d.items():
+    dupe_s = set()
+    for unique_id, cluster in dupe_d.items():
         if len(cluster) > 1:
             for pair in itertools.combinations(cluster, 2):
                 dupe_s.add(frozenset(pair))
3 changes: 1 addition & 2 deletions patent_example/patent_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use dedupe to disambiguate patent
 authors and demonstrates the Set and LatLong data types.
@@ -26,7 +25,7 @@ def readData(filename, set_delim="**"):
     with open(filename) as f:
         reader = csv.DictReader(f)
         for idx, row in enumerate(reader):
-            row = dict((k, v.lower()) for k, v in row.items())
+            row = {k: v.lower() for k, v in row.items()}
             if row["Lat"] == row["Lng"] == "0.0":
                 row["LatLong"] = None
             else:
3 changes: 1 addition & 2 deletions pgsql_big_dedupe_example/pgsql_big_dedupe_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 
 """
 This is an example of working with very large data. There are about
@@ -38,7 +37,7 @@
 register_adapter(numpy.float64, AsIs)
 
 
-class Readable(object):
+class Readable:
     def __init__(self, iterator):
 
         self.output = io.StringIO()
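
Under Python 3 every class is new-style, so class Readable: defines exactly the same type as class Readable(object):; the explicit object base is Python 2 residue. An illustrative check:

    class Readable:
        pass  # same MRO as class Readable(object): on Python 3

    assert Readable.__mro__ == (Readable, object)
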
5 changes: 2 additions & 3 deletions pgsql_big_dedupe_example/pgsql_big_dedupe_example_init_db.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 """
 This is a setup script for mysql_example. It downloads a zip file of
 Illinois campaign contributions and loads them into a MySQL database
@@ -51,7 +50,7 @@
 # Postgres COPY doesn't handle "ragged" files very well
 if not os.path.exists(contributions_csv_file):
     print("converting tab-delimited raw file to csv...")
-    with open(contributions_txt_file, "rU") as txt_file, open(
+    with open(contributions_txt_file) as txt_file, open(
         contributions_csv_file, "w"
     ) as csv_file:
         csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
@@ -109,7 +108,7 @@
 
 conn.commit()
 
-with open(contributions_csv_file, "rU") as csv_file:
+with open(contributions_csv_file) as csv_file:
     c.copy_expert(
         "COPY raw_table "
         "(reciept_id, last_name, first_name, "
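
The "rU" flag was Python 2's universal-newlines mode. Python 3 text mode already translates \r\n and \r to \n by default, and "U" was deprecated and then removed in Python 3.11, where it raises ValueError, so a bare open(path) is the modern spelling. A sketch with an illustrative path:

    # Text mode performs universal newline translation on its own in Python 3,
    # so open(path, "rU") is redundant (and an error from 3.11 onward).
    with open("contributions.txt") as txt_file:
        first_line = txt_file.readline()
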
3 changes: 1 addition & 2 deletions record_linkage_example/record_linkage_example.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 This code demonstrates how to use RecordLink with two comma separated
 values (CSV) files. We have listings of products from two different
@@ -49,7 +48,7 @@ def readData(filename):
     with open(filename) as f:
         reader = csv.DictReader(f)
         for i, row in enumerate(reader):
-            clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
+            clean_row = {k: preProcess(v) for (k, v) in row.items()}
             if clean_row["price"]:
                 clean_row["price"] = float(clean_row["price"][1:])
             data_d[filename + str(i)] = dict(clean_row)
