hard_negative_mining.py
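
"""Hard negative mining for the Zalo legal text retrieval (zalo_ltr_2021) pipeline.

For every training question, all gold articles are kept as positive pairs, and
the top-k corpus articles most cosine-similar to the question under the round 1
sentence-bert model, excluding the gold ones, are saved as hard negatives.
"""
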
import pickle
import os
import numpy as np
import json
import torch
from tqdm import tqdm
from rank_bm25 import *  # wildcard keeps the pickled BM25 class (e.g. BM25Plus) importable for pickle.load
import argparse
import warnings
from sentence_transformers import SentenceTransformer, util
warnings.filterwarnings("ignore")
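
# Example invocation (paths are illustrative; point --sentence_bert_path at
# your own round 1 model):
#   python hard_negative_mining.py \
#       --model_path saved_model/bm25_Plus_04_06_model_full_manual_stopword \
#       --sentence_bert_path saved_model/sbert_round1 \
#       --data_path zac2021-ltr-data --save_path pair_data --top_k 20
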
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default="saved_model/bm25_Plus_04_06_model_full_manual_stopword", type=str, help="path to saved bm25 model")
    parser.add_argument("--sentence_bert_path", default="", type=str, help="path to round 1 sentence bert model")
    parser.add_argument("--data_path", default="zac2021-ltr-data", type=str, help="path to input data")
    parser.add_argument("--save_path", default="pair_data", type=str, help="output directory for mined pairs")
    parser.add_argument("--top_k", default=20, type=int, help="number of top candidates for hard negative mining")
    parser.add_argument("--path_doc_refer", default="generated_data/doc_refers_saved.pkl", type=str, help="path to doc refers")
    parser.add_argument("--path_legal", default="generated_data/legal_dict.json", type=str, help="path to legal dict")
    args = parser.parse_args()

    # load training data from json
    with open(os.path.join(args.data_path, "train_question_answer.json")) as f:
        data = json.load(f)
    training_data = data["items"]
    print(f"{len(training_data)} training questions")

    # load bm25 model (kept for pipeline parity; it is not used below, where
    # candidates are ranked by sentence-bert cosine similarity instead)
    with open(args.model_path, "rb") as bm_file:
        bm25 = pickle.load(bm_file)

    with open(args.path_doc_refer, "rb") as doc_refer_file:
        doc_refers = pickle.load(doc_refer_file)

    with open(args.path_legal) as df:
        doc_data = json.load(df)
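
    # Judging from the lookups below, legal_dict.json is expected to map
    # "{law_id}_{article_id}" keys to entries with "title" and "text" fields,
    # and doc_refers is a parallel sequence of (law_id, article_id, ...)
    # records whose order matches the corpus order of doc_data.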

    # load hard negative model (round 1 sentence bert)
    model = SentenceTransformer(args.sentence_bert_path)

    # encode every corpus article once and cache the embeddings;
    # if you already have the cache from a previous run, the encoding loop
    # below can be skipped and the pickle loaded directly
    for k, v in tqdm(doc_data.items()):
        doc_data[k]['embedding'] = model.encode(v['title'] + ' ' + v['text'])

    with open('legal_corpus_vibert_embedding.pkl', 'wb') as pkl:
        pickle.dump(doc_data, pkl)

    with open('legal_corpus_vibert_embedding.pkl', 'rb') as pkl:
        data = pickle.load(pkl)
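
    # A minimal caching guard (a sketch, using the same cache filename as
    # above) would avoid re-encoding the corpus on repeated runs:
    #
    #   cache_path = 'legal_corpus_vibert_embedding.pkl'
    #   if not os.path.exists(cache_path):
    #       for k, v in tqdm(doc_data.items()):
    #           doc_data[k]['embedding'] = model.encode(v['title'] + ' ' + v['text'])
    #       with open(cache_path, 'wb') as pkl:
    #           pickle.dump(doc_data, pkl)
    #   with open(cache_path, 'rb') as pkl:
    #       data = pickle.load(pkl)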

    top_k = args.top_k
    save_pairs = []

    # the document embedding matrix is identical for every question, so build
    # it once up front rather than inside the mining loop; row order must
    # match doc_refers
    list_embs = [torch.tensor(v['embedding']).unsqueeze(0) for v in data.values()]
    matrix_emb = torch.cat(list_embs, dim=0)
    for item in tqdm(training_data):
        question = item["question"]
        relevant_articles = item["relevant_articles"]

        # positive pairs: every gold article for this question
        for article in relevant_articles:
            concat_id = article["law_id"] + "_" + article["article_id"]
            save_pairs.append({
                "question": question,
                "document": doc_data[concat_id]["title"] + " " + doc_data[concat_id]["text"],
                "relevant": 1,
            })

        # hard negatives: top-k most cosine-similar articles that are not gold
        encoded_question = model.encode(question)
        all_cosine = util.cos_sim(encoded_question, matrix_emb).numpy().squeeze(0)
        # argpartition yields the indices of the top_k largest scores (unordered)
        predictions = np.argpartition(all_cosine, len(all_cosine) - top_k)[-top_k:]

        for idx_pred in predictions:
            pred = doc_refers[idx_pred]
            is_gold = any(
                pred[0] == article["law_id"] and pred[1] == article["article_id"]
                for article in relevant_articles
            )
            if not is_gold:
                concat_id = pred[0] + "_" + pred[1]
                save_pairs.append({
                    "question": question,
                    "document": doc_data[concat_id]["title"] + " " + doc_data[concat_id]["text"],
                    "relevant": 0,
                })

    os.makedirs(args.save_path, exist_ok=True)
    with open(os.path.join(args.save_path, f"save_pairs_vibert_top{top_k}.pkl"), "wb") as pair_file:
        pickle.dump(save_pairs, pair_file)
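
# Each mined pair in the saved pickle is a plain dict:
#   {"question": <question text>, "document": <title + article text>, "relevant": 1 or 0}
# positives come from the gold annotations, negatives from the retrieved
# top-k articles that are not gold.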