# run_extractor.py
#%%
import pandas as pd
from StringUtil import StringUtil
from MatchExtractor import MatchExtractor
import warnings
pd.set_option('display.max_colwidth', None)
# warnings.filterwarnings('ignore')
replacements_woodruff = {
    r'\[\[(.*?)\|(.*?)\]\]': r'\1',
}
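# quick sanity check of the pattern above (the sample string is made up):
# import re
# re.sub(r'\[\[(.*?)\|(.*?)\]\]', r'\1', 'visited [[Salt Lake City|the city]]')
# # -> 'visited Salt Lake City'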
#%%
path_root = '../data/matches/all_books_10_words/'
# path_data_woodruff_raw = '../data/raw/data_woodruff_raw.csv'  # superseded by derived_data.csv below
path_data_woodruff_raw = '../data/raw/derived_data.csv'
# path_data_woodruff_clean = path_root + 'data_woodruff_clean.csv'
path_data_scriptures = '../data/raw/data_scriptures.csv'
path_matches = '../data/matches/data_matches2.csv'
# url paths
url_woodruff = "https://github.com/wilfordwoodruff/Main-Data/raw/main/data/derived/derived_data.csv"
url_scriptures = 'https://github.com/wilfordwoodruff/wilford_woodruff_hack23/raw/main/data/lds-scriptures.csv'
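# either CSV can also be read directly from GitHub, e.g.:
# data_woodruff = pd.read_csv(url_woodruff)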
# load data
data_scriptures = pd.read_csv(path_data_scriptures)
data_woodruff = pd.read_csv(path_data_woodruff_raw)
# clean woodruff data
# original column names, for reference; the selection below uses new_columns.values()
columns = ['Internal ID', 'Parent ID', 'Order', 'Document Type', 'Website URL', 'Dates', 'Text Only Transcript']
new_columns = {'Internal ID': 'internal_id',
               'Parent ID': 'parent_id',
               'Order': 'order',
               'Document Type': 'document_type',
               'Website URL': 'website_url',
               'Dates': 'dates',
               'Text Only Transcript': 'text_woodruff',
               }
data_woodruff = data_woodruff.rename(columns=new_columns)[list(new_columns.values())]
data_woodruff = data_woodruff.query("document_type=='Journals'")
# text = StringUtil.combine_rows(data_woodruff['text'])
data_woodruff['text_woodruff'] = StringUtil.str_replace_column(data_woodruff['text_woodruff'], replacements_woodruff)
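# str_replace_column is assumed here to loop the replacement dict over a
# pandas Series; a minimal sketch of that behavior (not StringUtil's actual
# implementation) would be:
# def str_replace_column(column, replacements):
#     for pattern, replacement in replacements.items():
#         column = column.str.replace(pattern, replacement, regex=True)
#     return column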
# data_woodruff.info()
#%%
# clean scripture data
data_scriptures = data_scriptures.rename(columns={'text':'text_scriptures'})
# data_scriptures['text_scriptures'] = StringUtil.str_replace_column(data_scriptures['text_scriptures'], scripture_replacements)
# filter to certain volumes
volume_titles = [
    'Old Testament',
    'New Testament',
    'Book of Mormon',
    'Doctrine and Covenants',
    'Pearl of Great Price',
]
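# .query resolves @volume_titles against the local list defined above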
data_scriptures = data_scriptures.query("volume_title in @volume_titles")
# query = "verse_title == 'Doctrine and Covenants 136:11'|verse_title == 'Doctrine and Covenants 136:12'|verse_title == 'Doctrine and Covenants 136:13'|verse_title == 'Doctrine and Covenants 136:14'|verse_title == 'Doctrine and Covenants 136:15'|verse_title == 'Doctrine and Covenants 136:16'|verse_title == 'Doctrine and Covenants 136:17'"
# data_scriptures = data_scriptures.query(query)
data_scriptures
#%%
phrase_length = 10
threshold = .7
print('volumes:', volume_titles)
print('phrase length:', phrase_length)
print('threshold:', threshold)
match_extractor = MatchExtractor(data_woodruff.copy(),
                                 data_scriptures.copy(),
                                 phrase_length,
                                 threshold=threshold)
# iterate through each row of the scripture phrases dataset, scoring phrases with the TF-IDF model and cosine similarity
match_extractor.run_extractor(path_matches=path_matches, git_push=True, quarto_publish=False)
match_extractor.matches_total
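#%%
# a rough sketch of the matching approach named in the comment above
# (MatchExtractor's actual internals may differ): vectorize both text sets
# with TF-IDF, then score pairs by cosine similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# vectorizer = TfidfVectorizer()
# tfidf_woodruff = vectorizer.fit_transform(data_woodruff['text_woodruff'])
# tfidf_scriptures = vectorizer.transform(data_scriptures['text_scriptures'])
# scores = cosine_similarity(tfidf_woodruff, tfidf_scriptures)
# (scores >= threshold).sum()  # pairs at or above the threshold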
#%%
# git add .;git commit -m 'changes';git push;
# import pandas as pd
# cool_dict = {
#     'index_woodruff': [1, 2, 3, 10, 20],
#     'index_scriptures': [1, 2, 3, 20, 32],
#     'text_woodruff': ['hello', 'my', 'name', 'poop', 'banana'],
# }
# data = pd.DataFrame(cool_dict)
# data
# #%%
# data.sort_values(['index_woodruff', 'index_scriptures'], inplace=True)
# # Create a mask to identify rows where the indices are not 1 apart
# mask = (data['index_woodruff'].diff() != 1) | (data['index_scriptures'].diff() != 1)
# mask
# data['group'] = mask.cumsum()
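# # diff() != 1 is True at every break between consecutive indices, so
# # cumsum() over the mask gives each consecutive run its own group id
# # (e.g. 1,2,3,10,20 -> groups 1,1,1,2,3)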
# data
# #%%
# # Create a new column to identify groups based on the mask
# data = data.groupby('group').agg({
#     'index_woodruff': 'last',
#     'index_scriptures': 'last',
#     # 'match_count': 'sum',
#     # 'cosine_score': 'mean',
#     # 'verse_title': 'first',
#     # 'volume_title': 'first',
#     # 'internal_id': 'first',
#     # 'parent_id': 'first',
#     # 'order': 'first',
#     # 'website_url': 'first',
#     'text_woodruff': ' '.join,
#     # 'text_scriptures': ' '.join,
#     # 'dates': 'first',
# })
# # data['cosine_score'] = data['cosine_score'].apply(lambda x: round(x, 5))
# data
# #%%
# data = pd.concat([data, pd.DataFrame([{'index_woodruff': 4, 'index_scriptures': 4, 'text_woodruff': 'is porter'}])],
#                  ignore_index=True)  # DataFrame.append was removed in pandas 2.0
# data
# # %%
# data.sort_values(['index_woodruff', 'index_scriptures'], inplace=True)
# # Create a mask to identify rows where the indices are not 1 apart
# mask = (data['index_woodruff'].diff() != 1) | (data['index_scriptures'].diff() != 1)
# mask
# data['group'] = mask.cumsum()
# data
# data.groupby('group').agg({
#     'index_woodruff': 'last',
#     'index_scriptures': 'last',
#     # 'match_count': 'sum',
#     # 'cosine_score': 'mean',
#     # 'verse_title': 'first',
#     # 'volume_title': 'first',
#     # 'internal_id': 'first',
#     # 'parent_id': 'first',
#     # 'order': 'first',
#     # 'website_url': 'first',
#     'text_woodruff': ' '.join,
#     # 'text_scriptures': ' '.join,
#     # 'dates': 'first',
# })
#%%
# import pandas as pd
# import numpy as np
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# from IPython.display import display
# documentA = 'the man went out for a walk'
# documentB = 'the children sat around the fire'
# corpus = [documentA, documentB]
# bagOfWordsA = documentA.split(' ')
# bagOfWordsB = documentB.split(' ')
# bagOfWordsA
# bagOfWordsB
# #%%
# uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
# uniqueWords
# #%%
# print('----------- compare word count -------------------')
# numOfWordsA = dict.fromkeys(uniqueWords, 0)
# for word in bagOfWordsA:
#     numOfWordsA[word] += 1
# numOfWordsB = dict.fromkeys(uniqueWords, 0)
# for word in bagOfWordsB:
#     numOfWordsB[word] += 1
# numOfWordsB
# #%%
# series_A = pd.Series(numOfWordsA)
# series_B = pd.Series(numOfWordsB)
# df = pd.concat([series_A, series_B], axis=1).T
# df = df.reindex(sorted(df.columns), axis=1)
# display(df)
# #%%
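# # term frequency: divide each row by that document's total word count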
# tf_df = df.divide(df.sum(axis=1), axis='index')
# tf_df
# #%%
# n_d = 1 + tf_df.shape[0]
# df_d_t = 1 + (tf_df.values > 0).sum(0)
# idf = np.log(n_d / df_d_t) + 1
# idf
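# # smoothed idf: log((1 + n_docs) / (1 + doc_freq)) + 1, which stays
# # positive even for terms appearing in every document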
# #%%
# pd.DataFrame(df.values * idf,
#              columns=df.columns)
# # %%
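# #%%
# # for comparison (a sketch, not part of the pipeline): with smooth_idf=True
# # and norm=None, sklearn's TfidfVectorizer applies the same
# # log((1 + n) / (1 + df)) + 1 weighting to raw counts as the cells above
# # (note its default tokenizer drops one-letter words such as 'a')
# sk_vectorizer = TfidfVectorizer(smooth_idf=True, norm=None)
# sk_vectorizer.fit_transform(corpus).toarray()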