-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path15_Extract_SP.py
84 lines (63 loc) · 2.66 KB
/
15_Extract_SP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Extracting cities from the state of Sao Paulo in order to do analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import glob
import datetime
from epiweeks import Week, Year
from sklearn.utils.extmath import cartesian
def load_processed_files(paths):
    """Read each processed CSV in *paths* and stack the rows into one DataFrame.

    Every file is read as cp1252-encoded, comma-delimited text with
    DT_SIN_PRI and DT_NOTIFIC parsed as dates. The first column, which
    pandas names 'Unnamed: 0' because its header is empty, is dropped.

    Parameters
    ----------
    paths : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in the order given, each keeping its per-file
        index. An empty DataFrame when *paths* is empty.

    Raises
    ------
    KeyError
        If a file has no 'Unnamed: 0' column.
    """
    frames = []
    for path in paths:
        frame = pd.read_csv(
            path,
            delimiter=',',
            index_col=False,
            parse_dates=['DT_SIN_PRI', 'DT_NOTIFIC'],
            encoding='cp1252',
        )
        # first column is read differently
        frames.append(frame.drop(columns=['Unnamed: 0']))
    if not frames:
        # pd.concat raises on an empty list; keep the "no input" result an
        # empty frame, as the original accumulator was.
        return pd.DataFrame()
    # One concat instead of DataFrame.append per file: append was removed in
    # pandas 2.0 and recopied the accumulated data on every iteration.
    return pd.concat(frames)


files = glob.glob('./Data/processed/*.csv')
# Value compared against SG_UF further down to select the state of SP.
id_uf = 35
data_total = load_processed_files(files)
# Give every row a unit count so that summing CASO within a group yields the
# number of rows (cases) in that group.
data_total['CASO'] = 1

# Split SEM_PRI into its week and year parts.
# NOTE(review): assumes SEM_PRI renders as YYYYWW when cast to str (first four
# characters = year, remainder = week) - confirm against the processed files.
sem_pri_text = data_total['SEM_PRI'].astype(str)
data_total['SEM_EPI'] = sem_pri_text.str[4:]
data_total['ANO_EPI'] = sem_pri_text.str[:4]
# "year-week" label built from the two parts.
data_total['DATA_EPI'] = data_total['ANO_EPI'] + '-' + data_total['SEM_EPI']

# Resident cases in the state of SP: keep rows whose SG_UF equals id_uf, then
# total CASO per (ANO_EPI, SEM_EPI, ID_MN_RESI) and flatten back to columns.
in_state = data_total['SG_UF'] == id_uf
data_res = (
    data_total[in_state]
    .groupby(['ANO_EPI', 'SEM_EPI', 'ID_MN_RESI'])['CASO']
    .sum()
    .reset_index(name='CASO')
)
# Build a dense year x week x municipality grid so that combinations with no
# reported case still get a row (filled with 0 below).
mun_list = np.unique(data_res['ID_MN_RESI'])
year_list = np.arange(2007, 2023, 1)  # 2007 .. 2022 inclusive
week_list = np.arange(1, 54, 1)       # weeks 1 .. 53

key_cols = ['ANO_EPI', 'SEM_EPI', 'ID_MN_RESI']
df = pd.DataFrame(cartesian((year_list, week_list, mun_list)), columns=key_cols)
df = df.astype({col: int for col in key_cols})

# Cast the aggregated counts to plain ints as well, so the merge keys have the
# same dtype on both sides (ANO_EPI / SEM_EPI were sliced out of strings).
data_res = data_res.astype({col: int for col in key_cols + ['CASO']})

# Left merge keeps every grid row; grid rows without a match get NaN in CASO,
# which is then replaced by 0.
df = df.merge(data_res, how='left', on=key_cols)
df['CASO'] = df['CASO'].fillna(0)

# "year-week" label for each grid row (week is not zero-padded here).
df['DATA_EPI'] = df['ANO_EPI'].astype(str) + '-' + df['SEM_EPI'].astype(str)
# The grid above always contains a week 53; remove it for every year that
# epiweeks reports as having only 52 weeks.
for year in year_list:
    n_weeks = Year(year).totalweeks()
    print(year, n_weeks)
    if n_weeks != 52:
        # Any other week count: nothing to remove for this year.
        continue
    week_rm = str(year) + '-53'
    keep_rows = df['DATA_EPI'] != week_rm
    df = df[keep_rows]
# Write the padded weekly counts to disk. The DataFrame index is written as
# the first, unnamed column (to_csv default).
out_path = 'Data/analyzed/epicurves_SP.csv'
# Create the output folder if needed so the write does not fail on a fresh
# checkout.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
# to_csv returns None when given a path, so do not rebind df to its result.
df.to_csv(out_path)