############################################################################################
#
# The MIT License (MIT)
#
# Acute Myeloid Leukemia Detection System
# Copyright (C) 2018 Adam Milton-Barker (AdamMiltonBarker.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# Title: Acute Myeloid Leukemia Detection System Data Tools
# Description: Data functions for the Acute Myeloid Leukemia Detection System.
# Configuration: required/confs.json
# Last Modified: 2018-12-22
#
############################################################################################
# Standard library
import json
import random

# Third party
import nltk
import numpy as np
from nltk.stem.lancaster import LancasterStemmer

# Local
from Classes.Helpers import Helpers
class Data():
    """Data preparation utilities for the NLU / NER training pipeline."""

    def __init__(self):
        """Sets up all default requirements and placeholders needed for
        the NLU engine to run.

        - Helpers: useful global functions (config loading, logging)
        - LogFile: log file path built from required/confs.json
        - LancasterStemmer: word stemmer used by extract()
        """

        # Punctuation tokens stripped out of sentences before stemming.
        self.ignore = [',', '.', '!', '?']

        self.Helpers = Helpers()
        self.confs = self.Helpers.loadConfigs()
        self.LogFile = self.Helpers.setLogFile(
            self.confs["aiCore"]["Logs"] + "JumpWay/")

        self.LancasterStemmer = LancasterStemmer()

    def loadTrainingData(self):
        """Loads the NLU and NER training data from Model/Data/training.json.

        Returns the parsed JSON object.
        """

        with open("Model/Data/training.json") as jsonData:
            trainingData = json.load(jsonData)

        self.Helpers.logMessage(
            self.LogFile,
            "Data",
            "INFO",
            "Training Data Ready")

        return trainingData

    def loadTrainedData(self):
        """Loads the saved training configuration from Model/model.json.

        Returns the parsed JSON object.
        """

        with open("Model/model.json") as jsonData:
            modelData = json.load(jsonData)

        self.Helpers.logMessage(
            self.LogFile,
            "Data",
            "INFO",
            "Model Data Ready")

        return modelData

    def sortList(self, listToSort):
        """Returns a sorted copy of listToSort with duplicates removed."""

        # set() drops duplicates; sorted() accepts any iterable, so the
        # intermediate list() conversion is unnecessary.
        return sorted(set(listToSort))

    def extract(self, data=None, splitIt=False):
        """Stems the words in data, skipping tokens in self.ignore.

        data may be an iterable of tokens, or a raw sentence string
        with splitIt=True to tokenise on whitespace first.
        """

        tokens = data.split() if splitIt else data
        return [self.LancasterStemmer.stem(word)
                for word in tokens if word not in self.ignore]

    def makeBagOfWords(self, sInput, words):
        """Makes a bag of words used by the inference and training features.

        If makeBagOfWords is called during training, sInput is a list of
        tokens and a binary presence list aligned with words is returned.
        Otherwise sInput is a sentence string and a NumPy count vector
        over the vocabulary is returned.
        """

        if isinstance(sInput, list):
            # Training path: binary membership vector.
            return [1 if word in sInput else 0 for word in words]

        # Inference path: count occurrences of each vocabulary word in
        # the stemmed, tokenised input sentence.
        bagOfWords = np.zeros(len(words))
        for cword in self.extract(sInput, True):
            for i, word in enumerate(words):
                if word == cword:
                    bagOfWords[i] += 1
        return bagOfWords

    def prepareClasses(self, intent, classes):
        """Adds intent to classes if it does not already exist.

        Returns the (possibly mutated) classes list.
        """

        if intent not in classes:
            classes.append(intent)
        return classes

    def prepareData(self, trainingData=None, wordsHldr=None,
                    dataCorpusHldr=None, classesHldr=None):
        """Prepares the NLU and NER training data.

        Loops through the intents from the dataset and converts any
        entities / synonyms.

        Returns a 4-tuple of (sorted stemmed vocabulary, sorted class
        list, data corpus of (tokens, intent) pairs, intent->index map).

        The holder arguments default to fresh empty lists on every call;
        the previous mutable-default-argument signature leaked corpus
        state between successive calls.
        """

        wordsHldr = [] if wordsHldr is None else wordsHldr
        dataCorpusHldr = [] if dataCorpusHldr is None else dataCorpusHldr
        classesHldr = [] if classesHldr is None else classesHldr

        counter = 0
        intentMap = {}

        for intent in trainingData['intents']:

            theIntent = intent['intent']
            for text in intent['text']:

                if 'entities' in intent and len(intent['entities']):
                    i = 0
                    for entity in intent['entities']:
                        # Replace the annotated span with an <entity>
                        # placeholder before tokenising.
                        # NOTE(review): indexing ["text"][i] by the entity
                        # counter looks suspect (entity index used as a
                        # text index) — confirm against the training.json
                        # schema before changing.
                        tokens = text.replace(
                            trainingData['intents'][counter]["text"][i],
                            "<" + entity["entity"] + ">").lower().split()
                        wordsHldr.extend(tokens)
                        dataCorpusHldr.append((tokens, theIntent))
                        i = i + 1
                else:
                    tokens = text.lower().split()
                    wordsHldr.extend(tokens)
                    dataCorpusHldr.append((tokens, theIntent))

            intentMap[theIntent] = counter
            classesHldr = self.prepareClasses(theIntent, classesHldr)
            counter = counter + 1

        return (self.sortList(self.extract(wordsHldr, False)),
                self.sortList(classesHldr),
                dataCorpusHldr,
                intentMap)

    def finaliseData(self, classes, dataCorpus, words):
        """Finalises the NLU training data.

        Returns (bags, labels): the shuffled bag-of-words vectors and
        their matching one-hot intent labels, as parallel lists.
        """

        trainData = []
        out = np.zeros(len(classes))

        for document in dataCorpus:
            # One-hot label for this document's intent.
            output = list(out)
            output[classes.index(document[1])] = 1
            trainData.append([
                self.makeBagOfWords(
                    self.extract(document[0], False), words),
                output])

        random.shuffle(trainData)

        self.Helpers.logMessage(
            self.LogFile,
            "Data",
            "INFO",
            "Finalised Training Data Ready")

        # Unzip with plain list comprehensions instead of np.array(...)[:, k]:
        # modern NumPy raises on ragged/inhomogeneous object arrays, and the
        # returned values are identical.
        return ([data[0] for data in trainData],
                [data[1] for data in trainData])