-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfullprocess.py
78 lines (59 loc) · 2.53 KB
/
fullprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import json
import os
import pickle
import subprocess
import sys

import pandas as pd
from sklearn import metrics

import training
import scoring
import deployment
import diagnostics
import reporting
# Load the pipeline settings; 'config.json' is resolved relative to the
# current working directory.
with open('config.json', 'r') as config_file:
    config = json.loads(config_file.read())

# Folder locations used by the rest of the script, taken from the config.
# os.path.join with a single argument returns that argument unchanged.
input_data_path, dataset_csv_path, model_path, prod_folder_path = (
    os.path.join(config[key])
    for key in (
        'input_folder_path',
        'output_folder_path',
        'output_model_path',
        'prod_deployment_path',
    )
)
##################Check and read new data
# Read the record of files that earlier ingestion runs already processed.
ingested_record_path = os.path.join(
    os.getcwd(), prod_folder_path, 'ingestedfiles.txt'
)
with open(ingested_record_path, 'r') as f:
    read_data_files = f.read().rstrip()

# Compare whole file names rather than substrings of the raw text: a plain
# `name in text` test would treat 'data.csv' as ingested as soon as
# 'newdata.csv' is listed.
# NOTE(review): the exact layout of ingestedfiles.txt is not visible here, so
# the parsing is tolerant -- names may be separated by whitespace, commas or
# semicolons, may carry list brackets/quotes, and may include a directory
# prefix. File names containing spaces are not supported; confirm against
# ingestion.py.
ingested_names = {
    os.path.basename(token.strip("[]'\""))
    for token in read_data_files.replace(',', ' ').replace(';', ' ').split()
}

# Determine whether the source data folder has files that aren't listed in
# ingestedfiles.txt.
source_dir = os.path.join(os.getcwd(), input_data_path)
continue_pr = any(
    name not in ingested_names for name in os.listdir(source_dir)
)

##################Deciding whether to proceed, part 1
# If no new data was found, end the process here.
if not continue_pr:
    sys.exit()

# New data was found: run the ingestion step once for the whole input folder.
# The argument list avoids shell word-splitting of the path, sys.executable
# guarantees the same interpreter as this script, and check=True stops the
# pipeline if ingestion fails instead of continuing with stale data.
subprocess.run(
    [sys.executable, 'ingestion.py', input_data_path], check=True
)
##################Checking for model drift
# Check whether the score recorded for the deployed model differs from the
# score that model achieves on the newest ingested data.
prod_dir = os.path.join(os.getcwd(), prod_folder_path)

# The first whitespace-separated token of latestscore.txt is the recorded score.
with open(os.path.join(prod_dir, 'latestscore.txt')) as f:
    score = float(f.read().split()[0])

data = pd.read_csv(
    os.path.join(os.getcwd(), dataset_csv_path, 'finaldata.csv')
)
y = data.pop('exited').astype(int)
X = data[['lastmonth_activity', 'lastyear_activity', 'number_of_employees']]

# Locate the deployed model. If several .pkl files are present, the last one
# in directory-listing order is used; only that file is unpickled.
model_files = [name for name in os.listdir(prod_dir) if name.endswith('.pkl')]
if not model_files:
    raise FileNotFoundError(f'no .pkl model file found in {prod_dir}')

# NOTE: unpickling can execute arbitrary code, so the deployment folder must
# only ever contain model files produced by this project.
with open(os.path.join(prod_dir, model_files[-1]), 'rb') as model_file:
    model = pickle.load(model_file)

predictions = model.predict(X)

# The score is recalculated here because, per the original author's note,
# scoring.py only scores against the test data and was not designed to take
# predictions and actual labels as input.
new_scores = metrics.f1_score(y, predictions)

##################Deciding whether to proceed, part 2
# Model drift means the deployed model scores lower on the new data than its
# recorded score. If there is no drift, end the process here.
if new_scores >= score:
    sys.exit()
##################Re-deployment
# Model drift was found: retrain the model, then re-run the deployment script.
# Each step runs with the interpreter executing this script, and check=True
# raises CalledProcessError on a non-zero exit so that a failed training run
# is never followed by a deployment.
for script in ('training.py', 'deployment.py'):
    subprocess.run([sys.executable, script], check=True)

##################Diagnostics and reporting
# Run reporting.py and apicalls.py for the re-deployed model.
# NOTE(review): diagnostics.py is not invoked directly here -- presumably it is
# exercised through apicalls.py; confirm.
for script in ('reporting.py', 'apicalls.py'):
    subprocess.run([sys.executable, script], check=True)