This repository has been archived by the owner on Dec 9, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathCPT.py
205 lines (131 loc) · 5.88 KB
/
CPT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from PredictionTree import *
import pandas as pd
from tqdm import tqdm
class CPT():
    """
    Compact Prediction Tree (CPT) sequence predictor.

    Training builds three structures:
      - a Prediction Tree storing every training sequence as a root-to-leaf path,
      - an Inverted Index (II) mapping each item to the set of sequence ids
        that contain it,
      - a Lookup Table (LT) mapping each sequence id to its leaf node in the tree.

    Prediction finds training sequences similar to a target sequence, walks
    them back from their leaves, and scores the items that follow the last
    occurrence of the target's final element.
    """

    # Class-level defaults; each instance gets fresh copies in __init__.
    alphabet = None  # set of all unique items in the entire data file
    root = None      # root node of the Prediction Tree
    II = None        # Inverted Index: item -> set of sequence ids containing it
    LT = None        # Lookup Table: sequence id -> leaf node of the Prediction Tree

    def __init__(self):
        self.alphabet = set()
        self.root = PredictionTree()
        self.II = {}
        self.LT = {}

    def load_files(self, train_file, test_file, merge=False):
        """
        Read wide CSV files of comma-separated sequences into lists of lists.

        Example: rows "A,B,C,D" and "B,C,E" become [[A,B,C,D], [B,C,E]].

        Parameters
        ----------
        train_file : str or None -- path to the training CSV.
        test_file  : str or None -- path to the test CSV.
        merge      : bool -- when True, test rows are also appended to the
                     training data so the tree covers them as well.

        Returns
        -------
        (train, test) lists of per-row value arrays. When a path is None the
        None is propagated in its place (original API preserved).
        """
        train = []  # training sequences used to build the model
        test = []   # test sequences whose next items are to be predicted

        if train_file is None:
            # NOTE: returns None (the missing path) rather than the empty
            # list, preserving the original contract for existing callers.
            return train_file

        training = pd.read_csv(train_file)
        for _, row in training.iterrows():
            train.append(row.values)

        if test_file is None:
            return train, test_file

        testing = pd.read_csv(test_file)
        for _, row in testing.iterrows():
            if merge:
                train.append(row.values)
            test.append(row.values)

        return train, test

    def train(self, train):
        """
        Populate the Prediction Tree, Inverted Index and Lookup Table.

        Input : the list-of-list training data.
        Output: True on completion.
        """
        cursornode = self.root
        for seqid, row in enumerate(train):
            for element in row:
                # NaN != NaN, so this guard skips the NaN padding that a wide
                # CSV adds to shorter rows (different-length sequence support).
                if element == element:
                    # Extend the Prediction Tree along this sequence.
                    if not cursornode.hasChild(element):
                        cursornode.addChild(element)
                    cursornode = cursornode.getChild(element)
                    # Record this sequence id in the Inverted Index.
                    if self.II.get(element) is None:
                        self.II[element] = set()
                    self.II[element].add(seqid)
                    self.alphabet.add(element)
            # The leaf reached for this sequence goes into the Lookup Table.
            self.LT[seqid] = cursornode
            cursornode = self.root
        return True

    def score(self, counttable, key, length, target_size, number_of_similar_sequences, number_items_counttable):
        """
        Update the score of *key* in *counttable* and return the table.

        The counttable is specific to a single target sequence and is
        rebuilt for every prediction; repeated hits for the same key are
        combined multiplicatively.

        NOTE: *length* and *target_size* are accepted for interface
        compatibility but do not participate in the formula.
        """
        weight_level = 1 / number_of_similar_sequences
        weight_distance = 1 / number_items_counttable
        score = 1 + weight_level + weight_distance * 0.001
        if counttable.get(key) is None:
            counttable[key] = score
        else:
            counttable[key] = score * counttable.get(key)
        return counttable

    def predict(self, train, test, k, n=1):
        """
        Predict up to *n* next items for every sequence in *test*.

        Parameters
        ----------
        train : list of list -- training data; its length defines the
                universe of sequence ids for the Inverted-Index intersection.
        test  : list of list -- target sequences.
        k     : int -- how many trailing elements of each target are used to
                find similar training sequences.
        n     : int -- maximum number of predictions per target.

        Returns
        -------
        list of lists: the top-n predicted items for each target sequence.
        """
        predictions = []

        for each_target in tqdm(test):
            # Trim the target to its last non-NaN elements; NaN != NaN marks
            # the padded tail of shorter rows (different-size support).
            i = 0
            while i < len(each_target) and each_target[i] == each_target[i]:
                i = i + 1
            l = i - k - 1
            if l < 0:
                l = 0
            each_target = each_target[l:i]

            # Sequence ids whose rows contain every known element of the target.
            intersection = set(range(0, len(train)))
            for element in each_target:
                if self.II.get(element) is None:
                    continue  # unseen item: skip it instead of emptying the set
                intersection = intersection & self.II.get(element)

            # Rebuild each similar sequence by walking its leaf back to root.
            similar_sequences = []
            for element in intersection:
                currentnode = self.LT.get(element)
                tmp = []
                while currentnode.Item is not None:
                    tmp.append(currentnode.Item)
                    currentnode = currentnode.Parent
                similar_sequences.append(tmp)
            for sequence in similar_sequences:
                sequence.reverse()

            # Score every item that follows the LAST occurrence of the
            # target's final element in each similar sequence.
            counttable = {}
            for sequence in similar_sequences:
                try:
                    # BUGFIX: range end is -1 (was 0) so index 0 is paired
                    # with the final element of reversed(sequence); the old
                    # bound silently ignored a match at the first position.
                    index = next(
                        i for i, v in zip(range(len(sequence) - 1, -1, -1),
                                          reversed(sequence))
                        if v == each_target[-1]
                    )
                except StopIteration:  # BUGFIX: was a bare except
                    index = None
                if index is not None:
                    count = 1
                    for element in sequence[index + 1:]:
                        if element in each_target:
                            continue  # never re-predict an item already in the target
                        counttable = self.score(counttable, element,
                                                len(each_target), len(each_target),
                                                len(similar_sequences), count)
                        count += 1

            predictions.append(self.get_n_largest(counttable, n))

        return predictions

    def get_n_largest(self, dictionary, n):
        """
        Return the top-n keys of *dictionary* ranked by their values.
        """
        largest = sorted(dictionary.items(), key=lambda t: t[1], reverse=True)[:n]
        return [key for key, _ in largest]