# MatchExtractor.py
#%%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from StringUtil import StringUtil
from tqdm import tqdm
import pandas as pd
import subprocess
class MatchExtractor:
    """Match extractor class.

    Initialize with two pandas DataFrames (Woodruff journal entries and
    scripture verses), the phrase length used to split each row of text into
    phrases, and the cosine similarity threshold for keeping matches.
    See the usage sketch at the bottom of the file.
    """

    def __init__(self, data_woodruff, data_scriptures, phrase_length, threshold):
        self.matches_total = pd.DataFrame()
        self.matches_current = pd.DataFrame()
        self.phrase_length = phrase_length
        self.threshold = threshold
        self.__load_woodruff_data(data_woodruff)
        self.__load_scripture_data(data_scriptures)
        self.__load_vectorizer()
    def __load_woodruff_data(self, data_woodruff):
        """Store the Woodruff data as pandas DataFrames: the full original
        and a phrase-level expansion of each journal entry.
        """
        self.data_woodruff_full = data_woodruff.copy()
        # split each journal entry into a list of phrases, then explode into one row per phrase
        self.data_woodruff = StringUtil.expand_dataframe_of_text(data_woodruff, 'text_woodruff', self.phrase_length)
    def __load_scripture_data(self, data_scriptures):
        """Store the scripture data as pandas DataFrames: the full original
        and a phrase-level expansion of each verse.
        """
        self.data_scriptures_full = data_scriptures.copy()
        # split each verse into a list of phrases, then explode into one row per phrase
        self.data_scriptures = StringUtil.expand_dataframe_of_text(data_scriptures, 'text_scriptures', self.phrase_length)
    def __load_vectorizer(self):
        self.vectorizer = TfidfVectorizer()
        # fit once on all Woodruff phrases; scripture phrases are transformed against this vocabulary later
        self.tfidf_matrix_woodruff = self.vectorizer.fit_transform(self.data_woodruff['text_woodruff'])
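    # Note: because the vectorizer is fit only on the Woodruff phrases,
    # scripture terms that never appear in the Woodruff text fall outside the
    # fitted vocabulary and are ignored by transform(); only shared vocabulary
    # contributes to the cosine scores computed in run_extractor below.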
    def run_extractor(self, path_matches, git_push=False, quarto_publish=False):
        """Run the first extraction algorithm using the already-fitted TF-IDF model.

        Loops through each row of the expanded scriptures DataFrame, computes
        the TF-IDF vector of that scripture phrase, computes cosine similarity
        against every Woodruff phrase vector, filters by the threshold, and
        appends the surviving matches to a running dataset.
        """
        self.data_woodruff_copy = self.data_woodruff.copy()
        progress_bar = tqdm(total=len(self.data_scriptures))
        # iterate through each row of the expanded scriptures DataFrame and vectorize the scripture text
        for index, row_scriptures in self.data_scriptures.iterrows():
            progress_bar.update(1)
            description = f"{row_scriptures['verse_title']} total match count: {len(self.matches_total)}"
            progress_bar.set_description(description)
            # transform the verse phrase into a (1, vocab) TF-IDF vector
            tfidf_matrix_scriptures = self.vectorizer.transform([row_scriptures['text_scriptures']])
            # cosine similarity of every Woodruff phrase against this verse phrase: shape (n_phrases, 1)
            cosine_scores = cosine_similarity(self.tfidf_matrix_woodruff, tfidf_matrix_scriptures)
            self.data_woodruff_copy['cosine_score'] = cosine_scores.flatten()
            self.data_woodruff_copy['text_scriptures'] = row_scriptures['text_scriptures']
            self.matches_current = self.data_woodruff_copy.rename_axis('index_woodruff').reset_index()
            self.matches_current['index_scriptures'] = index
            self.matches_current['verse_title'] = row_scriptures['verse_title']
            self.matches_current['volume_title'] = row_scriptures['volume_title']
            self.matches_current['book_title'] = row_scriptures['book_title']
            # filter matches by threshold
            self.matches_current = self.matches_current.query("cosine_score > @self.threshold")
            self.matches_total = pd.concat([self.matches_total, self.matches_current])
        # merge consecutive phrase matches, then save to file
        self.resolve_extensions()
        self.matches_total.to_csv(path_matches, index=False)
        progress_bar.close()
        if git_push:
            self.git_push()
        if quarto_publish:
            self.quarto_publish()
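    # Illustration (hypothetical numbers): with threshold = 0.7 and cosine
    # scores [0.12, 0.85, 0.43, 0.71] for four Woodruff phrases against one
    # verse phrase, only the phrases scoring 0.85 and 0.71 survive the
    # query() filter above and are appended as matches.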
    def resolve_extensions(self):
        """Merge runs of consecutive phrase matches into single extended matches."""
        self.matches_total.sort_values(['index_woodruff', 'index_scriptures'], inplace=True)
        # mask marks rows where either index is not exactly 1 greater than the previous row's
        mask = (self.matches_total['index_woodruff'].diff() != 1) | (self.matches_total['index_scriptures'].diff() != 1)
        # the cumulative sum of the mask assigns a group id to each run of consecutive matches
        self.matches_total['group'] = mask.cumsum()
        self.matches_total['match_count'] = 1
        self.matches_total = self.matches_total.groupby('group').agg({
            'index_woodruff': 'last',
            'index_scriptures': 'last',
            'cosine_score': 'mean',
            'verse_title': 'first',
            'volume_title': 'first',
            'internal_id': 'first',
            'parent_id': 'first',
            'order': 'first',
            'website_url': 'first',
            'match_count': 'count',
            'text_woodruff': ' '.join,
            'text_scriptures': ' '.join,
            'dates': 'first',
        })
        self.matches_total['cosine_score'] = self.matches_total['cosine_score'].round(5)
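    # Worked example of the grouping above: matches at (index_woodruff,
    # index_scriptures) pairs (5, 12), (6, 13), (7, 14), (20, 40) produce
    # mask = [True, False, False, True] and group ids [1, 1, 1, 2], so the
    # first three rows collapse into one extended match (match_count = 3,
    # text columns joined) and the last row remains its own match.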
    @staticmethod
    def git_push():
        commands = ['git add .', 'git commit -m "new matches"', 'git pull', 'git push']
        for command in commands:
            subprocess.run(command, shell=True, encoding='utf-8')
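    # Note: the git commands above do not stop on failure; passing check=True
    # to subprocess.run would raise CalledProcessError if, for example, the
    # commit or push fails.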
    @staticmethod
    def quarto_publish():
        command = 'quarto publish'
        subprocess.run(command, shell=True, input='y\n', encoding='utf-8')
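
# Illustrative usage sketch only, not part of the extractor itself. It assumes
# DataFrames with the columns referenced above ('text_woodruff', 'internal_id',
# 'parent_id', 'order', 'website_url', 'dates' for the journal data;
# 'text_scriptures', 'verse_title', 'volume_title', 'book_title' for the
# scripture data); the file paths and parameter values here are hypothetical.
if __name__ == '__main__':
    data_woodruff = pd.read_csv('data/data_woodruff.csv')      # hypothetical path
    data_scriptures = pd.read_csv('data/data_scriptures.csv')  # hypothetical path
    extractor = MatchExtractor(data_woodruff,
                               data_scriptures,
                               phrase_length=15,  # assumed value
                               threshold=0.7)     # assumed value
    extractor.run_extractor(path_matches='matches.csv')        # hypothetical output path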