diff --git a/.gitignore b/.gitignore index 590add72..2b3b4bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -198,7 +198,6 @@ cython_debug/ # folders **/wandb/ -**/notebooks/ # mac specific .DS_Store diff --git a/common_querysets/queryset_bittersweet_symphony.py b/common_querysets/queryset_bittersweet_symphony.py new file mode 100644 index 00000000..49b7d1b8 --- /dev/null +++ b/common_querysets/queryset_bittersweet_symphony.py @@ -0,0 +1,1302 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset_base = (Queryset('fatalities003_all_features','country_month') + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_ns', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_os', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_count', from_loa='country_month', from_column='acled_sb_count') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_os', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_dt_oda_odat_pc_zs', from_loa='country_year', from_column='wdi_dt_oda_odat_pc_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_ms_mil_xpnd_gd_zs', from_loa='country_year', from_column='wdi_ms_mil_xpnd_gd_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_ms_mil_xpnd_zs', from_loa='country_year', from_column='wdi_ms_mil_xpnd_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + 
.transform.missing.fill() + ) + + .with_column(Column('wdi_nv_agr_totl_kd', from_loa='country_year', from_column='wdi_nv_agr_totl_kd') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_nv_agr_totl_kn', from_loa='country_year', from_column='wdi_nv_agr_totl_kn') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_ny_gdp_pcap_kd', from_loa='country_year', from_column='wdi_ny_gdp_pcap_kd') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_dyn_le00_in', from_loa='country_year', from_column='wdi_sp_dyn_le00_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_se_enr_prim_fm_zs', from_loa='country_year', from_column='wdi_se_enr_prim_fm_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_se_enr_prsc_fm_zs', from_loa='country_year', from_column='wdi_se_enr_prsc_fm_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_se_prm_nenr', from_loa='country_year', from_column='wdi_se_prm_nenr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sh_sta_maln_zs', from_loa='country_year', from_column='wdi_sh_sta_maln_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sh_sta_stnt_zs', from_loa='country_year', from_column='wdi_sh_sta_stnt_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + 
) + + .with_column(Column('wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sm_pop_totl_zs', from_loa='country_year', from_column='wdi_sm_pop_totl_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_dyn_imrt_in', from_loa='country_year', from_column='wdi_sp_dyn_imrt_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sh_dyn_mort_fe', from_loa='country_year', from_column='wdi_sh_dyn_mort_fe') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_14_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_0014_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_1564_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_1564_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_65up_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_65up_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_grow', from_loa='country_year', from_column='wdi_sp_pop_grow') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_urb_totl_in_zs', from_loa='country_year', from_column='wdi_sp_urb_totl_in_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + 
.with_column(Column('vdem_v2x_delibdem', from_loa='country_year', from_column='vdem_v2x_delibdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_egaldem', from_loa='country_year', from_column='vdem_v2x_egaldem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_libdem', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_libdem_48', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(60) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_partip', from_loa='country_year', from_column='vdem_v2x_partip') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_partipdem', from_loa='country_year', from_column='vdem_v2x_partipdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_accountability', from_loa='country_year', from_column='vdem_v2x_accountability') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_civlib', from_loa='country_year', from_column='vdem_v2x_civlib') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_clphy', from_loa='country_year', from_column='vdem_v2x_clphy') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_cspart', from_loa='country_year', from_column='vdem_v2x_cspart') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_divparctrl', from_loa='country_year', 
from_column='vdem_v2x_divparctrl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_edcomp_thick', from_loa='country_year', from_column='vdem_v2x_edcomp_thick') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_egal', from_loa='country_year', from_column='vdem_v2x_egal') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_execorr', from_loa='country_year', from_column='vdem_v2x_execorr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_frassoc_thick', from_loa='country_year', from_column='vdem_v2x_frassoc_thick') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_gencs', from_loa='country_year', from_column='vdem_v2x_gencs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_gender', from_loa='country_year', from_column='vdem_v2x_gender') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_genpp', from_loa='country_year', from_column='vdem_v2x_genpp') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_horacc', from_loa='country_year', from_column='vdem_v2x_horacc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_neopat', from_loa='country_year', from_column='vdem_v2x_neopat') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_pubcorr', from_loa='country_year', from_column='vdem_v2x_pubcorr') + .transform.missing.fill() + 
.transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_rule', from_loa='country_year', from_column='vdem_v2x_rule') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_veracc', from_loa='country_year', from_column='vdem_v2x_veracc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_ex_military', from_loa='country_year', from_column='vdem_v2x_ex_military') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_ex_party', from_loa='country_year', from_column='vdem_v2x_ex_party') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_freexp', from_loa='country_year', from_column='vdem_v2x_freexp') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_acjst', from_loa='country_year', from_column='vdem_v2xcl_acjst') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_prpty', from_loa='country_year', from_column='vdem_v2xcl_prpty') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_slave', from_loa='country_year', from_column='vdem_v2xcl_slave') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + 
.with_column(Column('vdem_v2xdd_dd', from_loa='country_year', from_column='vdem_v2xdd_dd') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xdl_delib', from_loa='country_year', from_column='vdem_v2xdl_delib') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqdr', from_loa='country_year', from_column='vdem_v2xeg_eqdr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqprotec', from_loa='country_year', from_column='vdem_v2xeg_eqprotec') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xel_frefair', from_loa='country_year', from_column='vdem_v2xel_frefair') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xel_regelec', from_loa='country_year', from_column='vdem_v2xel_regelec') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xme_altinf', from_loa='country_year', from_column='vdem_v2xme_altinf') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xnp_client', from_loa='country_year', from_column='vdem_v2xnp_client') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xnp_regcorr', from_loa='country_year', from_column='vdem_v2xnp_regcorr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlecon', from_loa='country_year', from_column='vdem_v2xpe_exlecon') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlpol', 
from_loa='country_year', from_column='vdem_v2xpe_exlpol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlgeo', from_loa='country_year', from_column='vdem_v2xpe_exlgeo') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlgender', from_loa='country_year', from_column='vdem_v2xpe_exlgender') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xps_party', from_loa='country_year', from_column='vdem_v2xps_party') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcs_ccsi', from_loa='country_year', from_column='vdem_v2xcs_ccsi') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xnp_pres', from_loa='country_year', from_column='vdem_v2xnp_pres') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqaccess', from_loa='country_year', from_column='vdem_v2xeg_eqaccess') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_diagacc', from_loa='country_year', from_column='vdem_v2x_diagacc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2clrgunev', from_loa='country_year', from_column='vdem_v2clrgunev') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + 
.transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_3', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(3) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_4', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(4) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(5) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_6', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(6) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tsum_24', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.temporal.moving_sum(24) + .transform.ops.ln() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_os_tlag_1', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t1', from_loa='country_month', from_column='topic_tokens') + 
.transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t2', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t13', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t1', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t2', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t13', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t1', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t2', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t13', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + 
.transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t1', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t2', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t13', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t1', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t2', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t13', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t1', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t2', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + 
.with_column(Column('topic_ste_theta4_stock_t13', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t1', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t2', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t13', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t1', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t2', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t13', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t1', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t2', 
from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t13', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t1', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t2', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t13', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t1', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t2', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t13', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t1', from_loa='country_month', from_column='topic_ste_theta10_stock') 
+ .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t2', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t13', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t1', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t2', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t13', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t1', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t2', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t13', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + 
.transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t1', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t2', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t13', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t1', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t2', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t13', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('splag_wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + 
.transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2x_libdem', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2x_accountability', from_loa='country_year', from_column='vdem_v2x_accountability') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('agr_withdrawal_pct_t48', from_loa='country_year', from_column='agr_withdrawal_pct') + .transform.missing.fill() + .transform.missing.replace_na() + 
.transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('dam_cap_pcap_t48', from_loa='country_year', from_column='dam_cap_pcap') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('groundwater_export_t48', from_loa='country_year', from_column='groundwater_export') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('fresh_withdrawal_pct_t48', from_loa='country_year', from_column='fresh_withdrawal_pct') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('ind_efficiency_t48', from_loa='country_year', from_column='ind_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('irr_agr_efficiency_t48', from_loa='country_year', from_column='irr_agr_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('services_efficiency_t48', from_loa='country_year', from_column='services_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('general_efficiency_t48', from_loa='country_year', from_column='general_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('water_stress_t48', from_loa='country_year', from_column='water_stress') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('renewable_internal_pcap_t48', from_loa='country_year', 
from_column='renewable_internal_pcap') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('renewable_pcap_t48', from_loa='country_year', from_column='renewable_pcap') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_100', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + 
.transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_100', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_sb_5', from_loa='country_month', from_column='acled_sb_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_ns_5', from_loa='country_month', from_column='acled_ns_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + 
.transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1_splag', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta0_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta1_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta3_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + 
.with_column(Column('topic_ste_theta5_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta6_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta7_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta8_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta9_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta10_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta11_stock_t1_splag', 
from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta12_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta13_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta14_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with all features + + """) + ) + + return queryset_base diff --git a/common_querysets/queryset_brown_cheese.py b/common_querysets/queryset_brown_cheese.py index 56e2ed05..853185d2 100644 --- a/common_querysets/queryset_brown_cheese.py +++ b/common_querysets/queryset_brown_cheese.py @@ -10,63 +10,56 @@ def generate(): - queryset_base (Queryset): A queryset containing the base data for the model training. 
""" - qs_baseline = (Queryset("fatalities003_baseline", "country_month") + queryset = (Queryset('fatalities003_baseline','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) - # target variable - .with_column(Column("ln_ged_sb_dep", from_loa="country_month", from_column="ged_sb_best_sum_nokgi") - .transform.ops.ln() - .transform.missing.fill() - ) + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) - # timelag 0 of target variable - .with_column(Column("ln_ged_sb", from_loa="country_month", from_column="ged_sb_best_sum_nokgi") - .transform.ops.ln() - .transform.missing.fill() - ) - # Decay functions - # sb - .with_column(Column("decay_ged_sb_5", from_loa="country_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(5) - .transform.temporal.time_since() - .transform.temporal.decay(24) - .transform.missing.replace_na() - ) - # os - .with_column(Column("decay_ged_os_5", from_loa="country_month", from_column="ged_os_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(5) - .transform.temporal.time_since() - .transform.temporal.decay(24) - .transform.missing.replace_na() - ) - # Spatial lag decay - .with_column(Column("splag_1_decay_ged_sb_5", from_loa="country_month", - from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(5) - .transform.temporal.time_since() - .transform.temporal.decay(24) - .transform.spatial.countrylag(1, 1, 0, 0) - .transform.missing.replace_na() - ) + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) - # From + .with_column(Column('decay_ged_sb_5', 
from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) - .with_column(Column("wdi_sp_pop_totl", from_loa="country_year", from_column="wdi_sp_pop_totl") - .transform.missing.fill() - .transform.temporal.tlag(12) - .transform.missing.fill() - .transform.missing.replace_na() - ) + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) - .with_theme("fatalities") - .describe("""Fatalities conflict history, cm level + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) - Predicting ln(fatalities) using conflict predictors, ultrashort + .with_theme('fatalities') + .describe("""Fatalities conflict history, cm level - """) - ) + Predicting ln(fatalities) using conflict predictors, ultrashort + """) + ) - return qs_baseline + + return queryset diff --git a/common_querysets/queryset_car_radio.py b/common_querysets/queryset_car_radio.py new file mode 100644 index 00000000..ae9ed831 --- /dev/null +++ b/common_querysets/queryset_car_radio.py @@ -0,0 +1,546 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. 
+ There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. + + queryset = (Queryset('fatalities003_topics','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t2', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t13', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t1', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t2', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + 
.transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t13', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t1', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t2', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t13', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t1', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t2', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t13', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t1', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + 
.transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t2', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t13', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t1', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t2', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t13', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t1', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t2', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t13', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + 
.with_column(Column('topic_ste_theta6_stock_t1', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t2', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t13', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t1', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t2', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t13', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t1', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t2', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t13', 
from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t1', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t2', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t13', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t1', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t2', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t13', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t1', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t2', from_loa='country_month', 
from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t13', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t1', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t2', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t13', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t1', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t2', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t13', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t1', from_loa='country_month', from_column='topic_ste_theta14_stock') + 
.transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t2', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t13', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1_splag', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta0_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + 
.transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta1_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta3_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta5_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta6_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + 
.transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta7_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta8_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta9_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta10_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta11_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta12_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + 
.transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta13_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta14_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and Mueller & Rauh topic model features + + """) + ) + + return queryset_base diff --git a/common_querysets/queryset_counting_stars.py b/common_querysets/queryset_counting_stars.py new file mode 100644 index 00000000..1162af6c --- /dev/null +++ b/common_querysets/queryset_counting_stars.py @@ -0,0 +1,450 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_conflict_history_long','country_month') + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + ) + + .with_column(Column('ln_ged_ns', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_os', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_count', from_loa='country_month', from_column='acled_sb_count') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_os', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('splag_1_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.spatial.countrylag(1,1,0,0) + ) + + .with_column(Column('splag_2_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.spatial.countrylag(1,2,0,0) + ) + + .with_column(Column('splag_1_ged_os', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.spatial.countrylag(1,1,0,0) + ) + + .with_column(Column('splag_1_ged_ns', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.spatial.countrylag(1,1,0,0) + ) + + .with_column(Column('ln_acled_prx_count', from_loa='country_month', from_column='acled_prx_count') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_pr_count', from_loa='country_month', from_column='acled_pr_count') + .transform.ops.ln() + .transform.missing.fill() + ) + + 
.with_column(Column('ln_acled_prx_fat', from_loa='country_month', from_column='acled_prx_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_gov', from_loa='country_month', from_column='acled_bat_gov_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_reb', from_loa='country_month', from_column='acled_bat_reb_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_ns', from_loa='country_month', from_column='acled_ns_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_3', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(3) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_4', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(4) + 
.transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(5) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_6', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(6) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tsum_24', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.temporal.moving_sum(24) + .transform.ops.ln() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_os_tlag_1', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tsum_12', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.temporal.moving_sum(12) + .transform.ops.ln() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tsum_48', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.temporal.moving_sum(48) + .transform.ops.ln() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_ns_tlag_1', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_ns_tlag_2', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_os_tlag_2', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() 
+ .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_tlag_1', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_tlag_2', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_os_tlag_1', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_os_tlag_2', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_ns_tlag_1', from_loa='country_month', from_column='acled_ns_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_ns_tlag_2', from_loa='country_month', from_column='acled_ns_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + 
.transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_100', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_100', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_sb_5', from_loa='country_month', from_column='acled_sb_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_ns_5', from_loa='country_month', from_column='acled_ns_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) 
+ .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(1) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_25', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(25) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_1', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(1) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_25', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(25) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_500', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_1', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(1) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_25', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(25) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_500', 
from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_os_100', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_ns_100', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + 
.transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline, first set and extended set of conflict history features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_demon_days.py b/common_querysets/queryset_demon_days.py new file mode 100644 index 00000000..27be9eec --- /dev/null +++ b/common_querysets/queryset_demon_days.py @@ -0,0 +1,274 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_faostat','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('consumer_prices_food_indices', from_loa='country_month', from_column='consumer_prices_food_indices') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('consumer_prices_general_indices', from_loa='country_month', from_column='consumer_prices_general_indices') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('food_price_inflation', from_loa='country_month', from_column='food_price_inflation') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('avg_adequate_diet', from_loa='country_year', from_column='avg_adequate_diet') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('avg_animalprotein_pcap_day', from_loa='country_year', from_column='avg_animalprotein_pcap_day') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('avg_fprod_value', from_loa='country_year', from_column='avg_fprod_value') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('avg_protein_pcap_day', from_loa='country_year', from_column='avg_protein_pcap_day') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + 
.with_column(Column('gdp_pc_ppp', from_loa='country_year', from_column='gdp_pc_ppp') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('kcal_pcap_day', from_loa='country_year', from_column='kcal_pcap_day') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('kcal_pcap_day_cerotu', from_loa='country_year', from_column='kcal_pcap_day_cerotu') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pcap_fprod_var', from_loa='country_year', from_column='pcap_fprod_var') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pcap_fsupply_var', from_loa='country_year', from_column='pcap_fsupply_var') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_arable_land', from_loa='country_year', from_column='pct_arable_land') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_cereal_import', from_loa='country_year', from_column='pct_cereal_import') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_fimport_merch', from_loa='country_year', from_column='pct_fimport_merch') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_modsevere_finsecurity', from_loa='country_year', from_column='pct_modsevere_finsecurity') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_pop_basicdrink', from_loa='country_year', from_column='pct_pop_basicdrink') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_pop_basicsani', from_loa='country_year', 
from_column='pct_pop_basicsani') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_pop_safedrink', from_loa='country_year', from_column='pct_pop_safedrink') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_pop_safesani', from_loa='country_year', from_column='pct_pop_safesani') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_severe_finsecurity', from_loa='country_year', from_column='pct_severe_finsecurity') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_und5_overweight', from_loa='country_year', from_column='pct_und5_overweight') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_und5_stunted', from_loa='country_year', from_column='pct_und5_stunted') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_und5_wasting', from_loa='country_year', from_column='pct_und5_wasting') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pct_undernourished', from_loa='country_year', from_column='pct_undernourished') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pol_stability', from_loa='country_year', from_column='pol_stability') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pop_modsevere_finsecurity', from_loa='country_year', from_column='pop_modsevere_finsecurity') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pop_severe_finsecurity', from_loa='country_year', 
from_column='pop_severe_finsecurity') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('pop_undernourished', from_loa='country_year', from_column='pop_undernourished') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('prev_adult_obesity', from_loa='country_year', from_column='prev_adult_obesity') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('prev_infant_bfeed', from_loa='country_year', from_column='prev_infant_bfeed') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('prev_lowbweight', from_loa='country_year', from_column='prev_lowbweight') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('prev_repr_anemia', from_loa='country_year', from_column='prev_repr_anemia') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('rail_density', from_loa='country_year', from_column='rail_density') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + 
.transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and faostat features + + """) + ) + return queryset diff --git a/common_querysets/queryset_fast_car.py b/common_querysets/queryset_fast_car.py new file mode 100644 index 00000000..fd651bb2 --- /dev/null +++ b/common_querysets/queryset_fast_car.py @@ -0,0 +1,419 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_vdem_short','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_delibdem', from_loa='country_year', from_column='vdem_v2x_delibdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_egaldem', from_loa='country_year', from_column='vdem_v2x_egaldem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_libdem', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_libdem_48', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(60) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_partip', from_loa='country_year', from_column='vdem_v2x_partip') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_partipdem', from_loa='country_year', from_column='vdem_v2x_partipdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_accountability', from_loa='country_year', from_column='vdem_v2x_accountability') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_civlib', from_loa='country_year', from_column='vdem_v2x_civlib') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_clphy', from_loa='country_year', 
from_column='vdem_v2x_clphy') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_cspart', from_loa='country_year', from_column='vdem_v2x_cspart') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_divparctrl', from_loa='country_year', from_column='vdem_v2x_divparctrl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_edcomp_thick', from_loa='country_year', from_column='vdem_v2x_edcomp_thick') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_egal', from_loa='country_year', from_column='vdem_v2x_egal') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_execorr', from_loa='country_year', from_column='vdem_v2x_execorr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_frassoc_thick', from_loa='country_year', from_column='vdem_v2x_frassoc_thick') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_gencs', from_loa='country_year', from_column='vdem_v2x_gencs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_gender', from_loa='country_year', from_column='vdem_v2x_gender') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_genpp', from_loa='country_year', from_column='vdem_v2x_genpp') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_horacc', from_loa='country_year', from_column='vdem_v2x_horacc') + .transform.missing.fill() + 
.transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_neopat', from_loa='country_year', from_column='vdem_v2x_neopat') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_pubcorr', from_loa='country_year', from_column='vdem_v2x_pubcorr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_rule', from_loa='country_year', from_column='vdem_v2x_rule') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_veracc', from_loa='country_year', from_column='vdem_v2x_veracc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_ex_military', from_loa='country_year', from_column='vdem_v2x_ex_military') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_ex_party', from_loa='country_year', from_column='vdem_v2x_ex_party') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_freexp', from_loa='country_year', from_column='vdem_v2x_freexp') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_acjst', from_loa='country_year', from_column='vdem_v2xcl_acjst') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_prpty', from_loa='country_year', from_column='vdem_v2xcl_prpty') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + 
.with_column(Column('vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_slave', from_loa='country_year', from_column='vdem_v2xcl_slave') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xdd_dd', from_loa='country_year', from_column='vdem_v2xdd_dd') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xdl_delib', from_loa='country_year', from_column='vdem_v2xdl_delib') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqdr', from_loa='country_year', from_column='vdem_v2xeg_eqdr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqprotec', from_loa='country_year', from_column='vdem_v2xeg_eqprotec') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xel_frefair', from_loa='country_year', from_column='vdem_v2xel_frefair') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xel_regelec', from_loa='country_year', from_column='vdem_v2xel_regelec') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xme_altinf', from_loa='country_year', from_column='vdem_v2xme_altinf') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xnp_client', from_loa='country_year', from_column='vdem_v2xnp_client') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xnp_regcorr', from_loa='country_year', 
from_column='vdem_v2xnp_regcorr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlecon', from_loa='country_year', from_column='vdem_v2xpe_exlecon') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlpol', from_loa='country_year', from_column='vdem_v2xpe_exlpol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlgeo', from_loa='country_year', from_column='vdem_v2xpe_exlgeo') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlgender', from_loa='country_year', from_column='vdem_v2xpe_exlgender') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xps_party', from_loa='country_year', from_column='vdem_v2xps_party') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcs_ccsi', from_loa='country_year', from_column='vdem_v2xcs_ccsi') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xnp_pres', from_loa='country_year', from_column='vdem_v2xnp_pres') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqaccess', from_loa='country_year', from_column='vdem_v2xeg_eqaccess') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2x_diagacc', from_loa='country_year', from_column='vdem_v2x_diagacc') + 
.transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2clrgunev', from_loa='country_year', from_column='vdem_v2clrgunev') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_dyn_imrt_in', from_loa='country_year', from_column='wdi_sp_dyn_imrt_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2x_libdem', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2x_accountability', from_loa='country_year', from_column='vdem_v2x_accountability') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + 
.transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and short list of vdem features + + """) + ) + + return queryset_base diff --git a/common_querysets/queryset_fluorescent_adolescent.py b/common_querysets/queryset_fluorescent_adolescent.py new file mode 100644 index 00000000..46edf09c --- /dev/null +++ b/common_querysets/queryset_fluorescent_adolescent.py @@ -0,0 +1,226 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. 
+ + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. + + queryset = (Queryset('fatalities003_joint_narrow','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('reign_tenure_months', from_loa='country_month', from_column='tenure_months') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_nv_agr_totl_kn', from_loa='country_year', from_column='wdi_nv_agr_totl_kn') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sh_sta_maln_zs', from_loa='country_year', from_column='wdi_sh_sta_maln_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + 
.transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_dyn_imrt_in', from_loa='country_year', from_column='wdi_sp_dyn_imrt_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_14_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_0014_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_grow', from_loa='country_year', from_column='wdi_sp_pop_grow') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqdr', from_loa='country_year', from_column='vdem_v2xeg_eqdr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlpol', from_loa='country_year', from_column='vdem_v2xpe_exlpol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + 
.transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('splag_wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + 
.transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities') + .describe("""Predicting ged_dummy_sb, cm level + + Queryset with features from various sources, 'joint narrow' + + """) + ) + + return queryset diff --git a/common_querysets/queryset_good_riddance.py b/common_querysets/queryset_good_riddance.py new file mode 100644 index 00000000..46edf09c --- /dev/null +++ b/common_querysets/queryset_good_riddance.py @@ -0,0 +1,226 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. 
+ """ + + # VIEWSER 6, Example configuration. Modify as needed. + + queryset = (Queryset('fatalities003_joint_narrow','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('reign_tenure_months', from_loa='country_month', from_column='tenure_months') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_nv_agr_totl_kn', from_loa='country_year', from_column='wdi_nv_agr_totl_kn') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sh_sta_maln_zs', from_loa='country_year', from_column='wdi_sh_sta_maln_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_dyn_imrt_in', 
from_loa='country_year', from_column='wdi_sp_dyn_imrt_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_14_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_0014_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_grow', from_loa='country_year', from_column='wdi_sp_pop_grow') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqdr', from_loa='country_year', from_column='vdem_v2xeg_eqdr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlpol', from_loa='country_year', from_column='vdem_v2xpe_exlpol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('splag_wdi_ag_lnd_frst_k2', 
from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + 
.with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities') + .describe("""Predicting ged_dummy_sb, cm level + + Queryset with features from various sources, 'joint narrow' + + """) + ) + + return queryset diff --git a/common_querysets/queryset_green_squirrel.py b/common_querysets/queryset_green_squirrel.py new file mode 100644 index 00000000..bb2497a3 --- /dev/null +++ b/common_querysets/queryset_green_squirrel.py @@ -0,0 +1,608 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_joint_broad','country_month') + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_ns', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_os', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_acled_sb', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_acled_sb_count', from_loa='country_month', from_column='acled_sb_count') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_acled_os', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_dt_oda_odat_pc_zs', from_loa='country_year', 
from_column='wdi_dt_oda_odat_pc_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_ms_mil_xpnd_gd_zs', from_loa='country_year', from_column='wdi_ms_mil_xpnd_gd_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_nv_agr_totl_kn', from_loa='country_year', from_column='wdi_nv_agr_totl_kn') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_grow', from_loa='country_year', from_column='wdi_sp_pop_grow') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_se_enr_prim_fm_zs', from_loa='country_year', from_column='wdi_se_enr_prim_fm_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_urb_totl_in_zs', from_loa='country_year', from_column='wdi_sp_urb_totl_in_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sh_sta_maln_zs', from_loa='country_year', from_column='wdi_sh_sta_maln_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_dyn_imrt_fe_in', from_loa='country_year', from_column='wdi_sp_dyn_imrt_fe_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + 
.transform.missing.replace_na() + ) + + .with_column(Column('wdi_ny_gdp_mktp_kd', from_loa='country_year', from_column='wdi_ny_gdp_mktp_kd') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sh_sta_stnt_zs', from_loa='country_year', from_column='wdi_sh_sta_stnt_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_horacc', from_loa='country_year', from_column='vdem_v2x_horacc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xnp_client', from_loa='country_year', from_column='vdem_v2xnp_client') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_veracc', from_loa='country_year', from_column='vdem_v2x_veracc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_divparctrl', from_loa='country_year', from_column='vdem_v2x_divparctrl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xpe_exlpol', from_loa='country_year', from_column='vdem_v2xpe_exlpol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_diagacc', from_loa='country_year', from_column='vdem_v2x_diagacc') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xpe_exlgeo', from_loa='country_year', from_column='vdem_v2xpe_exlgeo') + .transform.missing.fill() + 
.transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xpe_exlgender', from_loa='country_year', from_column='vdem_v2xpe_exlgender') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_ex_party', from_loa='country_year', from_column='vdem_v2x_ex_party') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_genpp', from_loa='country_year', from_column='vdem_v2x_genpp') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xeg_eqdr', from_loa='country_year', from_column='vdem_v2xeg_eqdr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xcl_prpty', from_loa='country_year', from_column='vdem_v2xcl_prpty') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xeg_eqprotec', from_loa='country_year', from_column='vdem_v2xeg_eqprotec') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_ex_military', from_loa='country_year', from_column='vdem_v2x_ex_military') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xcl_dmove', from_loa='country_year', 
from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_clphy', from_loa='country_year', from_column='vdem_v2x_clphy') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2x_hosabort', from_loa='country_year', from_column='vdem_v2x_hosabort') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('vdem_v2xnp_regcorr', from_loa='country_year', from_column='vdem_v2xnp_regcorr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_3', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(3) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_4', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + 
.transform.temporal.tlag(4) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(5) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_6', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(6) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_os_tlag_1', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t2', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t1', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t2', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t13', from_loa='country_month', 
from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t1', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t2', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t13', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + 
.transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2x_libdem', from_loa='country_year', from_column='vdem_v2x_libdem') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2x_accountability', from_loa='country_year', from_column='vdem_v2x_accountability') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + 
.transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_100', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_100', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_sb_5', from_loa='country_month', from_column='acled_sb_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_ns_5', from_loa='country_month', from_column='acled_ns_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + 
.with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and broad list of features from all sources + + """) + ) + + return queryset diff --git a/common_querysets/queryset_heavy_rotation.py b/common_querysets/queryset_heavy_rotation.py new file mode 100644 index 00000000..bb2497a3 --- /dev/null +++ b/common_querysets/queryset_heavy_rotation.py @@ 
from viewser import Queryset, Column

def generate():
    """
    Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model.
    This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system.
    There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly.

    Returns:
        - queryset (Queryset): A queryset containing the base data for the model training.
    """

    # VIEWSER 6 configuration. The transform pipelines below are shared by
    # whole families of columns, so each family is written once as a small
    # builder and the column list is assembled from those builders. The
    # resulting queryset (names, sources, transform chains, column order)
    # is identical to spelling every chain out longhand.

    def _ln_month(name, source):
        # Log-transformed country-month count, forward-filled, remaining NaNs replaced.
        return (Column(name, from_loa='country_month', from_column=source)
                .transform.ops.ln()
                .transform.missing.fill()
                .transform.missing.replace_na())

    def _ln_month_tlag(name, source, lag):
        # Log-transformed country-month count, time-lagged by `lag` months.
        return (Column(name, from_loa='country_month', from_column=source)
                .transform.ops.ln()
                .transform.missing.fill()
                .transform.temporal.tlag(lag)
                .transform.missing.fill()
                .transform.missing.replace_na())

    def _yearly(source):
        # Country-year structural feature mapped onto the monthly grid with a
        # 12-month time lag; column name equals the source column name.
        return (Column(source, from_loa='country_year', from_column=source)
                .transform.missing.fill()
                .transform.temporal.tlag(12)
                .transform.missing.fill()
                .transform.missing.replace_na())

    def _yearly_splag(source):
        # First-order spatial (neighbour) lag of a 12-month-lagged yearly feature.
        return (Column('splag_' + source, from_loa='country_year', from_column=source)
                .transform.missing.fill()
                .transform.temporal.tlag(12)
                .transform.spatial.countrylag(1, 1, 0, 0)
                .transform.missing.replace_na())

    def _topic_tlag(name, source, lag):
        # Topic-model feature, time-lagged by `lag` months.
        return (Column(name, from_loa='country_month', from_column=source)
                .transform.missing.fill()
                .transform.missing.replace_na()
                .transform.temporal.tlag(lag)
                .transform.missing.fill()
                .transform.missing.replace_na())

    def _topic_splag(name, source):
        # Spatial lag of a topic-model feature time-lagged by 13 months.
        return (Column(name, from_loa='country_month', from_column=source)
                .transform.missing.fill()
                .transform.missing.replace_na()
                .transform.temporal.tlag(13)
                .transform.missing.fill()
                .transform.spatial.countrylag(1, 1, 0, 0)
                .transform.missing.replace_na())

    def _decay(name, source, threshold):
        # Decay transform (parameter 24) of time since a month with at least
        # `threshold` fatalities/events in the source column.
        return (Column(name, from_loa='country_month', from_column=source)
                .transform.missing.replace_na()
                .transform.bool.gte(threshold)
                .transform.temporal.time_since()
                .transform.temporal.decay(24)
                .transform.missing.replace_na())

    def _decay_splag(name, source):
        # Spatial lag of the >=5 decay signal for the source column.
        return (Column(name, from_loa='country_month', from_column=source)
                .transform.missing.replace_na()
                .transform.bool.gte(5)
                .transform.temporal.time_since()
                .transform.temporal.decay(24)
                .transform.spatial.countrylag(1, 1, 0, 0)
                .transform.missing.replace_na())

    # Country identifier (no transforms).
    columns = [Column('gleditsch_ward', from_loa='country', from_column='gwcode')]

    # Log fatality / event counts at country-month level.
    columns += [
        _ln_month('ln_ged_sb_dep', 'ged_sb_best_sum_nokgi'),
        _ln_month('ln_ged_sb', 'ged_sb_best_sum_nokgi'),
        _ln_month('ln_ged_ns', 'ged_ns_best_sum_nokgi'),
        _ln_month('ln_ged_os', 'ged_os_best_sum_nokgi'),
        _ln_month('ln_acled_sb', 'acled_sb_fat'),
        _ln_month('ln_acled_sb_count', 'acled_sb_count'),
        _ln_month('ln_acled_os', 'acled_os_fat'),
    ]

    # WDI / V-Dem structural features (country-year, 12-month time lag).
    columns += [_yearly(src) for src in (
        'wdi_sm_pop_netm', 'wdi_sm_pop_refg_or', 'wdi_dt_oda_odat_pc_zs',
        'wdi_ms_mil_xpnd_gd_zs', 'wdi_sl_tlf_totl_fe_zs', 'wdi_nv_agr_totl_kn',
        'wdi_sp_pop_grow', 'wdi_se_enr_prim_fm_zs', 'wdi_sp_urb_totl_in_zs',
        'wdi_sh_sta_maln_zs', 'wdi_sp_dyn_imrt_fe_in', 'wdi_ny_gdp_mktp_kd',
        'wdi_sh_sta_stnt_zs',
        'vdem_v2x_horacc', 'vdem_v2xnp_client', 'vdem_v2x_veracc',
        'vdem_v2x_divparctrl', 'vdem_v2xpe_exlpol', 'vdem_v2x_diagacc',
        'vdem_v2xpe_exlgeo', 'vdem_v2xpe_exlgender', 'vdem_v2xpe_exlsocgr',
        'vdem_v2x_ex_party', 'vdem_v2x_genpp', 'vdem_v2xeg_eqdr',
        'vdem_v2xcl_prpty', 'vdem_v2xeg_eqprotec', 'vdem_v2x_ex_military',
        'vdem_v2xcl_dmove', 'vdem_v2x_clphy', 'vdem_v2x_hosabort',
        'vdem_v2xnp_regcorr', 'wdi_sp_pop_totl',
    )]

    # Temporal lags (1..6 months) of the state-based log count, plus a
    # one-month lag of the one-sided log count.
    columns += [_ln_month_tlag('ln_ged_sb_tlag_%d' % k, 'ged_sb_best_sum_nokgi', k)
                for k in range(1, 7)]
    columns.append(_ln_month_tlag('ln_ged_os_tlag_1', 'ged_os_best_sum_nokgi', 1))

    # Topic-model features.
    # NOTE(review): the 'topic_ste_theta2_stock_*' columns are sourced from
    # 'topic_ste_theta5_stock'. This mirrors the original definition exactly
    # and is kept to preserve behavior — confirm upstream whether theta2 or
    # theta5 is really intended.
    columns += [
        _topic_tlag('topic_tokens_t1', 'topic_tokens', 1),
        _topic_tlag('topic_tokens_t2', 'topic_tokens', 2),
        _topic_tlag('topic_ste_theta4_stock_t1', 'topic_ste_theta4_stock', 1),
        _topic_tlag('topic_ste_theta4_stock_t2', 'topic_ste_theta4_stock', 2),
        _topic_tlag('topic_ste_theta4_stock_t13', 'topic_ste_theta4_stock', 13),
        _topic_tlag('topic_ste_theta2_stock_t1', 'topic_ste_theta5_stock', 1),
        _topic_tlag('topic_ste_theta2_stock_t2', 'topic_ste_theta5_stock', 2),
        _topic_tlag('topic_ste_theta2_stock_t13', 'topic_ste_theta5_stock', 13),
    ]

    # Spatial lags of structural features.
    columns += [_yearly_splag(src) for src in (
        'wdi_sl_tlf_totl_fe_zs', 'wdi_sm_pop_refg_or', 'wdi_sm_pop_netm',
        'wdi_ag_lnd_frst_k2', 'vdem_v2x_libdem', 'vdem_v2xcl_dmove',
        'vdem_v2x_accountability', 'vdem_v2xpe_exlsocgr', 'vdem_v2xcl_rol',
    )]

    # Conflict-history decay signals.
    columns += [_decay(name, src, thr) for name, src, thr in (
        ('decay_ged_sb_5', 'ged_sb_best_sum_nokgi', 5),
        ('decay_ged_os_5', 'ged_os_best_sum_nokgi', 5),
        ('decay_ged_sb_100', 'ged_sb_best_sum_nokgi', 100),
        ('decay_ged_sb_500', 'ged_sb_best_sum_nokgi', 500),
        ('decay_ged_os_100', 'ged_os_best_sum_nokgi', 100),
        ('decay_ged_ns_5', 'ged_ns_best_sum_nokgi', 5),
        ('decay_ged_ns_100', 'ged_ns_best_sum_nokgi', 100),
        ('decay_acled_sb_5', 'acled_sb_fat', 5),
        ('decay_acled_os_5', 'acled_os_fat', 5),
        ('decay_acled_ns_5', 'acled_ns_fat', 5),
    )]

    # Spatial lags of the decay signals and of the 13-month topic lags.
    columns += [
        _decay_splag('splag_1_decay_ged_sb_5', 'ged_sb_best_sum_nokgi'),
        _decay_splag('splag_1_decay_ged_os_5', 'ged_os_best_sum_nokgi'),
        _decay_splag('splag_1_decay_ged_ns_5', 'ged_ns_best_sum_nokgi'),
        _topic_splag('topic_ste_theta4_stock_t1_splag', 'topic_ste_theta4_stock'),
        _topic_splag('topic_ste_theta2_stock_t1_splag', 'topic_ste_theta5_stock'),
    ]

    # Assemble the queryset in the exact column order defined above.
    # with_column returns the queryset (the original code chains it), so the
    # reassignment below is equivalent to one long fluent chain.
    queryset = Queryset('fatalities003_joint_broad', 'country_month')
    for column in columns:
        queryset = queryset.with_column(column)

    # NOTE(review): the theme tag 'fatalities002' does not match the
    # 'fatalities003_joint_broad' queryset name above; kept as-is to preserve
    # behavior — confirm the tag is intentional.
    queryset = (queryset
                .with_theme('fatalities002')
                .describe("""Predicting ln(fatalities), cm level

                            Queryset with baseline and broad list of features from all sources

                            """))

    return queryset
+ + queryset = (Queryset('fatalities003_conflict_history','country_month') + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_ns', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_os', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_count', from_loa='country_month', from_column='acled_sb_count') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_os', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_3', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') 
+ .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(3) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_4', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(4) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(5) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_6', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(6) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tsum_24', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.temporal.moving_sum(24) + .transform.ops.ln() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_os_tlag_1', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) 
+ .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_100', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_100', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_sb_5', from_loa='country_month', from_column='acled_sb_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_ns_5', from_loa='country_month', from_column='acled_ns_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', 
from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and first set of conflict history features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_little_lies.py b/common_querysets/queryset_little_lies.py new file mode 100644 index 00000000..46edf09c --- /dev/null +++ b/common_querysets/queryset_little_lies.py @@ -0,0 +1,226 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_joint_narrow','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('reign_tenure_months', from_loa='country_month', from_column='tenure_months') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_nv_agr_totl_kn', from_loa='country_year', from_column='wdi_nv_agr_totl_kn') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sh_sta_maln_zs', from_loa='country_year', from_column='wdi_sh_sta_maln_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_dyn_imrt_in', from_loa='country_year', from_column='wdi_sp_dyn_imrt_in') + 
.transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_14_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_0014_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_grow', from_loa='country_year', from_column='wdi_sp_pop_grow') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_dmove', from_loa='country_year', from_column='vdem_v2xcl_dmove') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xeg_eqdr', from_loa='country_year', from_column='vdem_v2xeg_eqdr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlpol', from_loa='country_year', from_column='vdem_v2xpe_exlpol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('splag_wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + 
.transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xpe_exlsocgr', from_loa='country_year', from_column='vdem_v2xpe_exlsocgr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_vdem_v2xcl_rol', from_loa='country_year', from_column='vdem_v2xcl_rol') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', 
from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities') + .describe("""Predicting ged_dummy_sb, cm level + + Queryset with features from various sources, 'joint narrow' + + """) + ) + + return queryset diff --git a/common_querysets/queryset_national_anthem.py b/common_querysets/queryset_national_anthem.py new file mode 100644 index 00000000..012b6cc6 --- /dev/null +++ b/common_querysets/queryset_national_anthem.py @@ -0,0 +1,264 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_wdi_short','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_dt_oda_odat_pc_zs', from_loa='country_year', from_column='wdi_dt_oda_odat_pc_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_ms_mil_xpnd_gd_zs', from_loa='country_year', from_column='wdi_ms_mil_xpnd_gd_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_ms_mil_xpnd_zs', from_loa='country_year', from_column='wdi_ms_mil_xpnd_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_nv_agr_totl_kd', from_loa='country_year', from_column='wdi_nv_agr_totl_kd') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_nv_agr_totl_kn', from_loa='country_year', from_column='wdi_nv_agr_totl_kn') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_ny_gdp_pcap_kd', from_loa='country_year', from_column='wdi_ny_gdp_pcap_kd') + .transform.missing.fill() + 
.transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_dyn_le00_in', from_loa='country_year', from_column='wdi_sp_dyn_le00_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_se_enr_prim_fm_zs', from_loa='country_year', from_column='wdi_se_enr_prim_fm_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_se_enr_prsc_fm_zs', from_loa='country_year', from_column='wdi_se_enr_prsc_fm_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_se_prm_nenr', from_loa='country_year', from_column='wdi_se_prm_nenr') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sh_sta_maln_zs', from_loa='country_year', from_column='wdi_sh_sta_maln_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sh_sta_stnt_zs', from_loa='country_year', from_column='wdi_sh_sta_stnt_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sm_pop_netm', 
from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sm_pop_totl_zs', from_loa='country_year', from_column='wdi_sm_pop_totl_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_dyn_imrt_in', from_loa='country_year', from_column='wdi_sp_dyn_imrt_in') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sh_dyn_mort_fe', from_loa='country_year', from_column='wdi_sh_dyn_mort_fe') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_14_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_0014_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_1564_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_1564_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_65up_fe_zs', from_loa='country_year', from_column='wdi_sp_pop_65up_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_grow', from_loa='country_year', from_column='wdi_sp_pop_grow') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_urb_totl_in_zs', from_loa='country_year', from_column='wdi_sp_urb_totl_in_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() 
+ .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sl_tlf_totl_fe_zs', from_loa='country_year', from_column='wdi_sl_tlf_totl_fe_zs') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_refg_or', from_loa='country_year', from_column='wdi_sm_pop_refg_or') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_sm_pop_netm', from_loa='country_year', from_column='wdi_sm_pop_netm') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_wdi_ag_lnd_frst_k2', from_loa='country_year', from_column='wdi_ag_lnd_frst_k2') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + 
.transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and short list of wdi features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_ominous_ox.py b/common_querysets/queryset_ominous_ox.py new file mode 100644 index 00000000..6fd97181 --- /dev/null +++ b/common_querysets/queryset_ominous_ox.py @@ -0,0 +1,232 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_conflict_history','country_month') + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_ns', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_os', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb', from_loa='country_month', from_column='acled_sb_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_sb_count', from_loa='country_month', from_column='acled_sb_count') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_acled_os', from_loa='country_month', from_column='acled_os_fat') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_sb_tlag_1', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_2', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_3', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') 
+ .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(3) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_4', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(4) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(5) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tlag_6', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(6) + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb_tsum_24', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.temporal.moving_sum(24) + .transform.ops.ln() + .transform.missing.replace_na() + ) + + .with_column(Column('ln_ged_os_tlag_1', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_100', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) 
+ .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_500', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(500) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_100', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_ns_100', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(100) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_sb_5', from_loa='country_month', from_column='acled_sb_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_os_5', from_loa='country_month', from_column='acled_os_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_acled_ns_5', from_loa='country_month', from_column='acled_ns_fat') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', 
from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_ns_5', from_loa='country_month', from_column='ged_ns_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and first set of conflict history features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_plastic_beach.py b/common_querysets/queryset_plastic_beach.py new file mode 100644 index 00000000..cb85613a --- /dev/null +++ b/common_querysets/queryset_plastic_beach.py @@ -0,0 +1,143 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_aquastat','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('agr_withdrawal_pct_t48', from_loa='country_year', from_column='agr_withdrawal_pct') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('dam_cap_pcap_t48', from_loa='country_year', from_column='dam_cap_pcap') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('groundwater_export_t48', from_loa='country_year', from_column='groundwater_export') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('fresh_withdrawal_pct_t48', from_loa='country_year', from_column='fresh_withdrawal_pct') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('ind_efficiency_t48', from_loa='country_year', from_column='ind_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('irr_agr_efficiency_t48', from_loa='country_year', from_column='irr_agr_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + 
.with_column(Column('services_efficiency_t48', from_loa='country_year', from_column='services_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('general_efficiency_t48', from_loa='country_year', from_column='general_efficiency') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('water_stress_t48', from_loa='country_year', from_column='water_stress') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('renewable_internal_pcap_t48', from_loa='country_year', from_column='renewable_internal_pcap') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('renewable_pcap_t48', from_loa='country_year', from_column='renewable_pcap') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(48) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + 
.transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and aquastat features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_popular_monster.py b/common_querysets/queryset_popular_monster.py new file mode 100644 index 00000000..ae9ed831 --- /dev/null +++ b/common_querysets/queryset_popular_monster.py @@ -0,0 +1,546 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_topics','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t2', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t13', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t1', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t2', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t13', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + 
.with_column(Column('topic_ste_theta1_stock_t1', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t2', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t13', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t1', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t2', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t13', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t1', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t2', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t13', 
from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t1', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t2', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t13', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t1', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t2', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t13', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t1', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t2', from_loa='country_month', from_column='topic_ste_theta6_stock') + 
.transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t13', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t1', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t2', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t13', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t1', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t2', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t13', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t1', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + 
.transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t2', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t13', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t1', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t2', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t13', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t1', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t2', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t13', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + 
.transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t1', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t2', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t13', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t1', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t2', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t13', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t1', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t2', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + 
.with_column(Column('topic_ste_theta14_stock_t13', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1_splag', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta0_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta1_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + 
.transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta3_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta5_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta6_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta7_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + 
.transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta8_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta9_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta10_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta11_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta12_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta13_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + 
.with_column(Column('topic_ste_theta14_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and Mueller & Rauh topic model features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_teen_spirit.py b/common_querysets/queryset_teen_spirit.py new file mode 100644 index 00000000..3910ff7f --- /dev/null +++ b/common_querysets/queryset_teen_spirit.py @@ -0,0 +1,116 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_faoprices','country_month') + .with_column(Column('fao_wheat_price', from_loa='country_month', from_column='wheat_price') + .transform.missing.replace_na(0) + ) + + .with_column(Column('fao_mp_price', from_loa='country_month', from_column='mp_price') + .transform.missing.replace_na(0) + ) + + .with_column(Column('fao_sugar_price', from_loa='country_month', from_column='sugar_price') + .transform.missing.replace_na(0) + ) + + .with_column(Column('fao_meat_price', from_loa='country_month', from_column='meat_price') + .transform.missing.replace_na(0) + ) + + .with_column(Column('fao_milk_price', from_loa='country_month', from_column='milk_price') + .transform.missing.replace_na(0) + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('delta_fao_wheat_price', from_loa='country_month', from_column='wheat_price') + .transform.temporal.delta(12) + .transform.missing.replace_na(0) + ) + + .with_column(Column('delta_fao_mp_price', from_loa='country_month', from_column='mp_price') + .transform.temporal.delta(12) + .transform.missing.replace_na(0) + ) + + .with_column(Column('delta_fao_sugar_price', from_loa='country_month', from_column='sugar_price') + .transform.temporal.delta(12) + .transform.missing.replace_na(0) + ) + + .with_column(Column('delta_fao_meat_price', from_loa='country_month', from_column='meat_price') + .transform.temporal.delta(12) + .transform.missing.replace_na(0) + ) + + .with_column(Column('delta_fao_milk_price', from_loa='country_month', from_column='milk_price') + 
.transform.temporal.delta(12) + .transform.missing.replace_na(0) + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and faoprices features + + """) + ) + + return queryset diff --git a/common_querysets/queryset_twin_flame.py b/common_querysets/queryset_twin_flame.py new file mode 100644 index 00000000..ae9ed831 --- /dev/null +++ b/common_querysets/queryset_twin_flame.py @@ -0,0 +1,546 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. 
+ There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. + + queryset = (Queryset('fatalities003_topics','country_month') + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t2', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_tokens_t13', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t1', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t2', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + 
.transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta0_stock_t13', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t1', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t2', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta1_stock_t13', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t1', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t2', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta2_stock_t13', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t1', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + 
.transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t2', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta3_stock_t13', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t1', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t2', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta4_stock_t13', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t1', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t2', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta5_stock_t13', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + 
.with_column(Column('topic_ste_theta6_stock_t1', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t2', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta6_stock_t13', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t1', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t2', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta7_stock_t13', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t1', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t2', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta8_stock_t13', 
from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t1', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t2', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta9_stock_t13', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t1', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t2', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta10_stock_t13', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t1', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t2', from_loa='country_month', 
from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta11_stock_t13', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t1', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t2', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta12_stock_t13', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t1', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t2', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta13_stock_t13', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t1', from_loa='country_month', from_column='topic_ste_theta14_stock') + 
.transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(1) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t2', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(2) + .transform.missing.fill() + ) + + .with_column(Column('topic_ste_theta14_stock_t13', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_tokens_t1_splag', from_loa='country_month', from_column='topic_tokens') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta0_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta0_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + 
.transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta1_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta1_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta2_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta2_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta3_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta3_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta4_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta4_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta5_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta5_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta6_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta6_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + 
.transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta7_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta7_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta8_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta8_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta9_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta9_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta10_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta10_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta11_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta11_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta12_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta12_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + 
.transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta13_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta13_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_column(Column('topic_ste_theta14_stock_t1_splag', from_loa='country_month', from_column='topic_ste_theta14_stock') + .transform.missing.fill() + .transform.missing.replace_na() + .transform.temporal.tlag(13) + .transform.missing.fill() + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and Mueller & Rauh topic model features + + """) + ) + + return queryset_base diff --git a/common_querysets/queryset_yellow_submarine.py b/common_querysets/queryset_yellow_submarine.py new file mode 100644 index 00000000..9d2a4a83 --- /dev/null +++ b/common_querysets/queryset_yellow_submarine.py @@ -0,0 +1,87 @@ +from viewser import Queryset, Column + +def generate(): + """ + Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. + + Returns: + - queryset_base (Queryset): A queryset containing the base data for the model training. + """ + + # VIEWSER 6, Example configuration. Modify as needed. 
+ + queryset = (Queryset('fatalities003_imfweo','country_month') + .with_column(Column('imfweo_ngdp_rpch_tcurrent', from_loa='country_month', from_column='ngdp_rpch_tcurrent') + .transform.missing.replace_na(0) + ) + + .with_column(Column('imfweo_ngdp_rpch_tmin1', from_loa='country_month', from_column='ngdp_rpch_tmin1') + .transform.missing.replace_na(0) + ) + + .with_column(Column('imfweo_ngdp_rpch_tplus1', from_loa='country_month', from_column='ngdp_rpch_tplus1') + .transform.missing.replace_na(0) + ) + + .with_column(Column('imfweo_ngdp_rpch_tplus2', from_loa='country_month', from_column='ngdp_rpch_tplus2') + .transform.missing.replace_na(0) + ) + + .with_column(Column('ln_ged_sb_dep', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('ln_ged_sb', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.ops.ln() + .transform.missing.fill() + ) + + .with_column(Column('gleditsch_ward', from_loa='country', from_column='gwcode') + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('wdi_sp_pop_totl', from_loa='country_year', from_column='wdi_sp_pop_totl') + .transform.missing.fill() + .transform.temporal.tlag(12) + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('decay_ged_os_5', from_loa='country_month', from_column='ged_os_best_sum_nokgi') + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.missing.replace_na() + ) + + .with_column(Column('splag_1_decay_ged_sb_5', from_loa='country_month', from_column='ged_sb_best_sum_nokgi') + 
.transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.countrylag(1,1,0,0) + .transform.missing.replace_na() + ) + + .with_theme('fatalities002') + .describe("""Predicting ln(fatalities), cm level + + Queryset with baseline and imfweo features + + """) + ) + + return queryset diff --git a/meta_tools/templates/model/template_config_hyperparameters.py b/meta_tools/templates/model/template_config_hyperparameters.py index 54dedb4f..0c60c7fa 100644 --- a/meta_tools/templates/model/template_config_hyperparameters.py +++ b/meta_tools/templates/model/template_config_hyperparameters.py @@ -27,7 +27,7 @@ def get_hp_config(): \""" hyperparameters = {{ - 'steps': [*range(1, 36 + 1, 1)], + "steps": [*range(1, 36 + 1, 1)], # Add more hyperparameters as needed }} return hyperparameters diff --git a/models/bad_blood/configs/config_meta.py b/models/bad_blood/configs/config_meta.py index 640123ec..b436299d 100644 --- a/models/bad_blood/configs/config_meta.py +++ b/models/bad_blood/configs/config_meta.py @@ -11,7 +11,7 @@ def get_meta_config(): "name": "bad_blood", "algorithm": "LightGBMModel", "depvar": "ln_ged_sb_dep", - "queryset": "fatalities002_pgm_natsoc", + "queryset": "fatalities003_pgm_natsoc", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/bittersweet_symphony/README.md b/models/bittersweet_symphony/README.md new file mode 100644 index 00000000..87a5b573 --- /dev/null +++ b/models/bittersweet_symphony/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: bittersweet_symphony +## Created on: 2024-11-05 10:55:38.515494 \ No newline at end of file diff --git a/models/brown_cheese/data/raw/test.ipynb b/models/bittersweet_symphony/artifacts/.gitkeep similarity index 100% rename from models/brown_cheese/data/raw/test.ipynb rename to models/bittersweet_symphony/artifacts/.gitkeep diff --git a/models/bittersweet_symphony/configs/config_deployment.py 
b/models/bittersweet_symphony/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/bittersweet_symphony/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/bittersweet_symphony/configs/config_hyperparameters.py b/models/bittersweet_symphony/configs/config_hyperparameters.py new file mode 100644 index 00000000..5bd4d6e9 --- /dev/null +++ b/models/bittersweet_symphony/configs/config_hyperparameters.py @@ -0,0 +1,19 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 100, + "n_jobs": 12, + "learning_rate": 0.05 + } + } + return hyperparameters diff --git a/models/bittersweet_symphony/configs/config_meta.py b/models/bittersweet_symphony/configs/config_meta.py new file mode 100644 index 00000000..f61143eb --- /dev/null +++ b/models/bittersweet_symphony/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "bittersweet_symphony", + "algorithm": "XGBModel", + "depvar": "ln_ged_sb_dep", + "queryset": " fatalities003_all_features", + "level": "cm", + "creator": "Marina" + } + return meta_config diff --git a/models/bittersweet_symphony/configs/config_sweep.py b/models/bittersweet_symphony/configs/config_sweep.py new file mode 100644 index 00000000..bf034b74 --- /dev/null +++ b/models/bittersweet_symphony/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'bittersweet_symphony' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/bittersweet_symphony/data/generated/.gitkeep b/models/bittersweet_symphony/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/data/processed/.gitkeep b/models/bittersweet_symphony/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/data/raw/.gitkeep b/models/bittersweet_symphony/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/main.py b/models/bittersweet_symphony/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/bittersweet_symphony/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model separated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/bittersweet_symphony/notebooks/.gitkeep b/models/bittersweet_symphony/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/reports/.gitkeep b/models/bittersweet_symphony/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/src/architectures/.gitkeep b/models/bittersweet_symphony/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/src/dataloaders/get_data.py b/models/bittersweet_symphony/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/bittersweet_symphony/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/bittersweet_symphony/src/forecasting/generate_forecast.py b/models/bittersweet_symphony/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/bittersweet_symphony/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from 
model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/bittersweet_symphony/src/management/execute_model_runs.py b/models/bittersweet_symphony/src/management/execute_model_runs.py new file mode 
100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/bittersweet_symphony/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/bittersweet_symphony/src/management/execute_model_tasks.py b/models/bittersweet_symphony/src/management/execute_model_tasks.py new file 
mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/bittersweet_symphony/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
def evaluate_model_artifact(config, artifact_name):
    """
    Evaluate a trained model artifact and persist metrics and predictions.

    Args:
        config (dict): Run configuration; must contain "name" and "run_type".
        artifact_name (str | None): Specific artifact file to evaluate. If
            falsy, the latest artifact for the run type is used.

    Raises:
        FileNotFoundError: If the resolved model artifact does not exist.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # The artifact stem ends with a "%Y%m%d_%H%M%S" timestamp (15 chars).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        # Previously the exception was only logged, after which the undefined
        # `stepshift_model` raised a NameError that masked the real cause.
        # Re-raise so the caller sees the actual failure.
        logger.exception(f"Model artifact not found at {path_artifact}")
        raise

    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, path_generated, config)
    save_predictions(df, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
+ + +def evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/bittersweet_symphony/src/online_evaluation/.gitkeep b/models/bittersweet_symphony/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/bittersweet_symphony/src/training/train_model.py b/models/bittersweet_symphony/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/bittersweet_symphony/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + 
stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/bittersweet_symphony/src/utils/utils_run.py b/models/bittersweet_symphony/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/bittersweet_symphony/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for 
the regression model. + """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/bittersweet_symphony/src/visualization/.gitkeep b/models/bittersweet_symphony/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/blank_space/configs/config_meta.py b/models/blank_space/configs/config_meta.py index cc6d6cc3..d473b42c 100644 --- a/models/blank_space/configs/config_meta.py +++ b/models/blank_space/configs/config_meta.py @@ -12,7 +12,7 @@ def get_meta_config(): "model_clf": "LGBMClassifier", "model_reg": "LGBMRegressor", "depvar": "ln_ged_sb_dep", # IMPORTANT! 
The current stepshift only takes one target variable! Not compatiable with Simon's code! - "queryset": "fatalities002_pgm_natsoc", + "queryset": "fatalities003_pgm_natsoc", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/brown_cheese/configs/config_meta.py b/models/brown_cheese/configs/config_meta.py index 080941e7..16b9cf89 100644 --- a/models/brown_cheese/configs/config_meta.py +++ b/models/brown_cheese/configs/config_meta.py @@ -9,9 +9,9 @@ def get_meta_config(): meta_config = { "name": "brown_cheese", - "algorithm": "XGBModel", + "algorithm": "RandomForestModel", "depvar": "ln_ged_sb_dep", - "queryset": "fatalities002_baseline", + "queryset": "fatalities003_baseline", "level": "cm", "creator": "Borbála" } diff --git a/models/brown_cheese/notebooks/.gitkeep b/models/brown_cheese/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/README.md b/models/car_radio/README.md new file mode 100644 index 00000000..d4990e2a --- /dev/null +++ b/models/car_radio/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: car_radio +## Created on: 2024-11-05 10:52:29.142194 \ No newline at end of file diff --git a/models/car_radio/artifacts/.gitkeep b/models/car_radio/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/configs/config_deployment.py b/models/car_radio/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/car_radio/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. 
+- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/car_radio/configs/config_hyperparameters.py b/models/car_radio/configs/config_hyperparameters.py new file mode 100644 index 00000000..e8d15390 --- /dev/null +++ b/models/car_radio/configs/config_hyperparameters.py @@ -0,0 +1,19 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 80, + "n_jobs": 12, + "learning_rate": 0.05 + } + } + return hyperparameters diff --git a/models/car_radio/configs/config_meta.py b/models/car_radio/configs/config_meta.py new file mode 100644 index 00000000..1cbbfcdc --- /dev/null +++ b/models/car_radio/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. 
+ """ + + meta_config = { + "name": "car_radio", + "algorithm": "XGBModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_topics", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/car_radio/configs/config_sweep.py b/models/car_radio/configs/config_sweep.py new file mode 100644 index 00000000..d608043e --- /dev/null +++ b/models/car_radio/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. + """ + + sweep_config = { + 'method': 'grid', + 'name': 'car_radio' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/car_radio/data/generated/.gitkeep b/models/car_radio/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/data/processed/.gitkeep b/models/car_radio/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/data/raw/.gitkeep b/models/car_radio/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/main.py b/models/car_radio/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/car_radio/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i 
for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/car_radio/notebooks/.gitkeep b/models/car_radio/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/reports/.gitkeep b/models/car_radio/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/src/architectures/.gitkeep b/models/car_radio/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/src/dataloaders/get_data.py b/models/car_radio/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/car_radio/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = 
def forecast_model_artifact(config, artifact_name):
    """
    Generate forecasts with a trained model artifact and save the predictions.

    Args:
        config (dict): Run configuration; must contain "name" and "run_type".
        artifact_name (str | None): Specific artifact file to use. If falsy,
            the latest artifact for the run type is used.

    Raises:
        FileNotFoundError: If the resolved model artifact does not exist.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # The artifact stem ends with a "%Y%m%d_%H%M%S" timestamp (15 chars).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        # Previously only logged, after which the undefined `stepshift_model`
        # raised a NameError; re-raise so the real cause surfaces.
        logger.exception(f"Model artifact not found at {path_artifact}")
        raise

    df_predictions = stepshift_model.predict(run_type, df_viewser)
    df_predictions = get_standardized_df(df_predictions, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    save_predictions(df_predictions, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/car_radio/src/management/execute_model_tasks.py b/models/car_radio/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/car_radio/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes 
def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None):
    """
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting.
    """
    start_t = time.time()

    # Initialize WandB (project and config are ignored when running a sweep).
    with wandb.init(project=project, entity="views_pipeline", config=config):

        # Add the monthly metrics to WandB.
        add_wandb_monthly_metrics()

        # Update config from the WandB initialization above.
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters,
        # so hurdle sweep parameters arrive flattened with cls_/reg_ prefixes.
        # NOTE: the original checked only "HurdleRegression", a name nothing in
        # this pipeline sets (configs and get_model use "HurdleModel"), leaving
        # the branch dead; both names are accepted for backward compatibility.
        if config["sweep"] and config["algorithm"] in ("HurdleRegression", "HurdleModel"):
            config["parameters"] = {}
            config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config)

        if config["sweep"]:
            logger.info(f"Sweeping model {config['name']}...")
            stepshift_model = train_model_artifact(config)
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact.
        if train:
            logger.info(f"Training model {config['name']}...")
            train_model_artifact(config)

        # Handle the single model runs: evaluate a trained model (artifact).
        if eval:
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {config['name']}...")
            forecast_model_artifact(config, artifact_name)

        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/car_radio/src/offline_evaluation/evaluate_sweep.py b/models/car_radio/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/car_radio/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def evaluate_sweep(config, 
stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/car_radio/src/online_evaluation/.gitkeep b/models/car_radio/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/car_radio/src/training/train_model.py b/models/car_radio/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/car_radio/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = read_log_file(path_raw / 
f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/car_radio/src/utils/utils_run.py b/models/car_radio/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/car_radio/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/car_radio/src/visualization/.gitkeep b/models/car_radio/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/caring_fish/configs/config_meta.py b/models/caring_fish/configs/config_meta.py index c33ce15f..8ee2da57 100644 --- a/models/caring_fish/configs/config_meta.py +++ b/models/caring_fish/configs/config_meta.py @@ -11,7 +11,7 @@ def get_meta_config(): "name": "caring_fish", "algorithm": "XGBModel", "depvar": "ln_ged_sb_dep", - "queryset": "fatalities002_pgm_conflict_history", + "queryset": 
"fatalities003_pgm_conflict_history", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/chunky_cat/configs/config_meta.py b/models/chunky_cat/configs/config_meta.py index 76e6bf37..5c25f285 100644 --- a/models/chunky_cat/configs/config_meta.py +++ b/models/chunky_cat/configs/config_meta.py @@ -11,7 +11,7 @@ def get_meta_config(): "name": "chunky_cat", "algorithm": "LightGBMModel", "depvar": "ln_ged_sb_dep", - "queryset": "fatalities002_pgm_conflictlong", + "queryset": "fatalities003_pgm_conflictlong", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/counting_stars/README.md b/models/counting_stars/README.md new file mode 100644 index 00000000..3eebd6f2 --- /dev/null +++ b/models/counting_stars/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: counting_stars +## Created on: 2024-11-04 14:50:33.251948 \ No newline at end of file diff --git a/models/counting_stars/artifacts/.gitkeep b/models/counting_stars/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/configs/config_deployment.py b/models/counting_stars/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/counting_stars/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. 
+ +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/counting_stars/configs/config_hyperparameters.py b/models/counting_stars/configs/config_hyperparameters.py new file mode 100644 index 00000000..5bd4d6e9 --- /dev/null +++ b/models/counting_stars/configs/config_hyperparameters.py @@ -0,0 +1,19 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 100, + "n_jobs": 12, + "learning_rate": 0.05 + } + } + return hyperparameters diff --git a/models/counting_stars/configs/config_meta.py b/models/counting_stars/configs/config_meta.py new file mode 100644 index 00000000..f1c2da08 --- /dev/null +++ b/models/counting_stars/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. 
+ """ + + meta_config = { + "name": "counting_stars", + "algorithm": "XGBModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_conflict_history_long", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/counting_stars/configs/config_sweep.py b/models/counting_stars/configs/config_sweep.py new file mode 100644 index 00000000..c1135d95 --- /dev/null +++ b/models/counting_stars/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. + """ + + sweep_config = { + 'method': 'grid', + 'name': 'counting_stars' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/counting_stars/data/generated/.gitkeep b/models/counting_stars/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/data/processed/.gitkeep b/models/counting_stars/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/data/raw/.gitkeep b/models/counting_stars/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/main.py b/models/counting_stars/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/counting_stars/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + 
+from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/counting_stars/notebooks/.gitkeep b/models/counting_stars/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/reports/.gitkeep b/models/counting_stars/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/src/architectures/.gitkeep b/models/counting_stars/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/src/dataloaders/get_data.py b/models/counting_stars/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/counting_stars/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, 
self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/counting_stars/src/forecasting/generate_forecast.py b/models/counting_stars/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/counting_stars/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/counting_stars/src/management/execute_model_runs.py b/models/counting_stars/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/counting_stars/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + 
update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/counting_stars/src/management/execute_model_tasks.py b/models/counting_stars/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/counting_stars/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, 
forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. + """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) 
+ + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n") diff --git a/models/counting_stars/src/offline_evaluation/evaluate_model.py b/models/counting_stars/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/counting_stars/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/counting_stars/src/offline_evaluation/evaluate_sweep.py b/models/counting_stars/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/counting_stars/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/counting_stars/src/online_evaluation/.gitkeep b/models/counting_stars/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/counting_stars/src/training/train_model.py b/models/counting_stars/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/counting_stars/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + 
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/counting_stars/src/utils/utils_run.py b/models/counting_stars/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/counting_stars/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/counting_stars/src/visualization/.gitkeep b/models/counting_stars/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/dark_paradise/configs/config_meta.py b/models/dark_paradise/configs/config_meta.py index 5ffbd1f9..d4f5b713 100644 --- a/models/dark_paradise/configs/config_meta.py +++ b/models/dark_paradise/configs/config_meta.py @@ -13,7 +13,7 @@ def get_meta_config(): "model_clf": "LightGBMModel", "model_reg": "LightGBMModel", "depvar": "ln_ged_sb_dep", - "queryset": 
"fatalities002_pgm_conflictlong", + "queryset": "fatalities003_pgm_conflictlong", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/demon_days/README.md b/models/demon_days/README.md new file mode 100644 index 00000000..51ac06ba --- /dev/null +++ b/models/demon_days/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: demon_days +## Created on: 2024-11-04 17:01:53.416452 \ No newline at end of file diff --git a/models/demon_days/artifacts/.gitkeep b/models/demon_days/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/configs/config_deployment.py b/models/demon_days/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/demon_days/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/demon_days/configs/config_hyperparameters.py b/models/demon_days/configs/config_hyperparameters.py new file mode 100644 index 00000000..73900504 --- /dev/null +++ b/models/demon_days/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. 
+ + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 300, + "n_jobs": 12, + } + } + return hyperparameters diff --git a/models/demon_days/configs/config_meta.py b/models/demon_days/configs/config_meta.py new file mode 100644 index 00000000..6633470d --- /dev/null +++ b/models/demon_days/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "demon_days", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_faostat", + "level": "cm", + "creator": "Marina" + } + return meta_config diff --git a/models/demon_days/configs/config_sweep.py b/models/demon_days/configs/config_sweep.py new file mode 100644 index 00000000..44230a3d --- /dev/null +++ b/models/demon_days/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'demon_days' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/demon_days/data/generated/.gitkeep b/models/demon_days/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/data/processed/.gitkeep b/models/demon_days/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/data/raw/.gitkeep b/models/demon_days/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/main.py b/models/demon_days/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/demon_days/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/demon_days/notebooks/.gitkeep b/models/demon_days/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/reports/.gitkeep b/models/demon_days/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/src/architectures/.gitkeep b/models/demon_days/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/demon_days/src/dataloaders/get_data.py b/models/demon_days/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/demon_days/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/demon_days/src/forecasting/generate_forecast.py b/models/demon_days/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/demon_days/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import 
get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/demon_days/src/management/execute_model_runs.py b/models/demon_days/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/demon_days/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import 
wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/demon_days/src/management/execute_model_tasks.py b/models/demon_days/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/demon_days/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging 
def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None):
    """
    Execute model-related tasks: training, evaluation, and/or forecasting.

    Initializes a W&B run (project and config are ignored when running inside
    a sweep, where wandb supplies them), then dispatches to the requested tasks.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
            (Name shadows the builtin but is kept for caller compatibility.)
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load
            for evaluation or forecasting.
    """
    start_t = time.time()

    # Initialize WandB; project and config are ignored when running a sweep.
    with wandb.init(project=project, entity="views_pipeline", config=config):

        # Register the custom monthly metrics with WandB before any logging.
        add_wandb_monthly_metrics()

        # Use the (possibly sweep-supplied) config from the wandb init above.
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters,
        # so sweep configs carry flat "cls_"/"reg_" keys that are re-nested here.
        # BUG FIX: the original compared against "HurdleRegression", a value not
        # used anywhere else in this pipeline (configs and get_model use
        # "HurdleModel"), so hurdle sweep parameters were never split.
        if config["sweep"] and config["algorithm"] == "HurdleModel":
            config["parameters"] = {}
            config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config)

        if config["sweep"]:
            logger.info(f"Sweeping model {config['name']}...")
            stepshift_model = train_model_artifact(config)
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact.
        if train:
            logger.info(f"Training model {config['name']}...")
            train_model_artifact(config)

        # Handle the single model runs: evaluate a trained model (artifact).
        if eval:
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {config['name']}...")
            forecast_model_artifact(config, artifact_name)

        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")
def evaluate_model_artifact(config, artifact_name):
    """
    Evaluate a saved model artifact and persist predictions, outputs and metrics.

    Loads the raw viewser data and a pickled stepshift model (either the
    artifact named on the CLI or the latest artifact for the run type),
    predicts, standardizes the predictions, computes evaluation metrics,
    logs them to W&B and writes outputs plus a log file to disk.

    Args:
        config: Run configuration dict; must contain "name" and "run_type".
            config["timestamp"] is set from the artifact filename as a side effect.
        artifact_name: Optional artifact filename (".pkl" appended if missing);
            falls back to the latest run-type-specific artifact when falsy.

    Raises:
        FileNotFoundError: If the model artifact does not exist.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # Artifact filenames end with a YYYYmmdd_HHMMSS timestamp (15 chars).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        # BUG FIX: the original logged and fell through, which raised a
        # confusing NameError on the next line; propagate the real error.
        logger.exception(f"Model artifact not found at {path_artifact}")
        raise

    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, path_generated, config)
    save_predictions(df, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
def evaluate_sweep(config, stepshift_model):
    """
    Evaluate a sweep-trained model and log MSE plus the standard metric dict to W&B.

    Args:
        config: Run configuration; must contain "name", "run_type", "steps"
            and "depvar".
        stepshift_model: Trained stepshift model exposing predict(run_type, df).
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    run_type = config["run_type"]
    steps = config["steps"]

    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)

    # Temporarily keep this because the metric to minimize is MSE.
    pred_cols = [f"step_pred_{str(i)}" for i in steps]
    # BUG FIX: the target vector was hard-coded to length 36; use len(pred_cols)
    # so the row-wise MSE stays valid when config["steps"] is not 1..36
    # (sklearn raises on mismatched lengths).
    df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * len(pred_cols),
                                                        [row[col] for col in pred_cols]), axis=1)

    wandb.log({"MSE": df["mse"].mean()})

    evaluation, _ = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)
def get_standardized_df(df, config):
    """
    Standardize a predictions DataFrame based on the run type.

    Keeps the run-type-relevant columns, replaces +/-inf with 0, and clips
    negative values to 0.

    Args:
        df: Predictions DataFrame (including the depvar column for
            calibration/testing runs).
        config: Run configuration with "run_type", "steps" and "depvar".

    Returns:
        The standardized DataFrame restricted to the relevant columns.

    Raises:
        ValueError: If config["run_type"] is not a known run type.
    """
    run_type = config["run_type"]
    steps = config["steps"]
    depvar = config["depvar"]

    # Choose the columns to keep based on the run type.
    if run_type in ["calibration", "testing"]:
        cols = [depvar] + df.forecasts.prediction_columns
    elif run_type == "forecasting":
        cols = [f"step_pred_{i}" for i in steps]
    else:
        # BUG FIX: an unknown run type previously fell through and raised a
        # confusing NameError on `cols`; fail with an explicit error instead.
        raise ValueError(f"Unknown run_type: {run_type!r}")

    # Replace infinities and clamp negatives to 0.
    df = df.replace([np.inf, -np.inf], 0)[cols]
    df = df.mask(df < 0, 0)
    return df
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/demon_days/src/visualization/.gitkeep b/models/demon_days/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fast_car/README.md b/models/fast_car/README.md new file mode 100644 index 00000000..0c945e1a --- /dev/null +++ b/models/fast_car/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: fast_car +## Created on: 2024-11-05 09:13:52.459828 \ No newline at end of file diff --git a/models/fast_car/artifacts/.gitkeep b/models/fast_car/artifacts/.gitkeep new file mode 100644 index 
def get_deployment_config():
    """
    Return the deployment configuration for the application.

    The deployment status is one of: shadow (not yet active), deployed
    (active), baseline (reference/comparison), or deprecated (unsupported).

    Returns:
        dict: Deployment settings; currently only "deployment_status".
    """
    return {'deployment_status': 'shadow'}
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "clf": { + "n_estimators": 100, + "learning_rate": 0.05, + "n_jobs": -2 + }, + "reg": { + "n_estimators": 100, + "learning_rate": 0.05, + "n_jobs": -2 + } + } + } + return hyperparameters diff --git a/models/fast_car/configs/config_meta.py b/models/fast_car/configs/config_meta.py new file mode 100644 index 00000000..15d0e1c7 --- /dev/null +++ b/models/fast_car/configs/config_meta.py @@ -0,0 +1,20 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "fast_car", + "algorithm": "HurdleModel", + "model_clf": "XGBModel", + "model_reg": "XGBModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_vdem_short", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/fast_car/configs/config_sweep.py b/models/fast_car/configs/config_sweep.py new file mode 100644 index 00000000..5d8265c9 --- /dev/null +++ b/models/fast_car/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'fast_car' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/fast_car/data/generated/.gitkeep b/models/fast_car/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fast_car/data/processed/.gitkeep b/models/fast_car/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fast_car/data/raw/.gitkeep b/models/fast_car/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fast_car/main.py b/models/fast_car/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/fast_car/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
def get_data(args, model_name, self_test):
    """
    Fetch (or load from disk) the viewser DataFrame for a model.

    Args:
        args: CLI arguments; args.run_type and args.saved are used.
        model_name: Name of the model whose raw-data directory is used.
        self_test: Whether to run the drift self-test while fetching.

    Returns:
        The fetched/loaded DataFrame (may be None if fetching failed).
    """
    raw_dir = ModelPath(model_name, validate=False).data_raw
    data, alerts = fetch_or_load_views_df(model_name, args.run_type, raw_dir, self_test, use_saved=args.saved)
    logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}")
    return data
def forecast_model_artifact(config, artifact_name):
    """
    Generate and save forecasts from a saved model artifact.

    Loads the raw viewser data and a pickled stepshift model (either the
    artifact named on the CLI or the latest artifact for the run type),
    predicts, standardizes the predictions, and writes them plus a log file
    to the model's generated-data directory.

    Args:
        config: Run configuration dict; must contain "name" and "run_type".
            config["timestamp"] is set from the artifact filename as a side effect.
        artifact_name: Optional artifact filename (".pkl" appended if missing);
            falls back to the latest run-type-specific artifact when falsy.

    Raises:
        FileNotFoundError: If the model artifact does not exist.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # Artifact filenames end with a YYYYmmdd_HHMMSS timestamp (15 chars).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        # BUG FIX: the original logged and fell through, which raised a
        # confusing NameError on the next line; propagate the real error.
        logger.exception(f"Model artifact not found at {path_artifact}")
        raise

    df_predictions = stepshift_model.predict(run_type, df_viewser)
    df_predictions = get_standardized_df(df_predictions, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    save_predictions(df_predictions, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
def execute_sweep_run(args):
    """Run a W&B hyperparameter sweep: fetch data once, then launch the agent."""
    sweep_config = get_sweep_config()
    meta_config = get_meta_config()
    update_sweep_config(sweep_config, args, meta_config)

    # The sweep is named after the model in the config file.
    project = f"{sweep_config['name']}_sweep"

    with wandb.init(project=f'{project}_fetch', entity="views_pipeline"):
        get_data(args, sweep_config["name"], args.drift_self_test)
    wandb.finish()

    sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline")
    wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline")


def execute_single_run(args):
    """Run a single (non-sweep) model run: fetch data, then dispatch tasks by run type."""
    hp_config = get_hp_config()
    meta_config = get_meta_config()
    dp_config = get_deployment_config()
    config = update_config(hp_config, meta_config, dp_config, args)

    project = f"{config['name']}_{args.run_type}"

    with wandb.init(project=f'{project}_fetch', entity="views_pipeline"):
        get_data(args, config["name"], args.drift_self_test)
    wandb.finish()

    if args.run_type in ('calibration', 'testing'):
        execute_model_tasks(
            config=config,
            project=project,
            train=args.train,
            eval=args.evaluate,
            forecast=False,
            artifact_name=args.artifact_name,
        )
    elif args.run_type == "forecasting":
        execute_model_tasks(
            config=config,
            project=project,
            train=args.train,
            eval=False,
            forecast=args.forecast,
            artifact_name=args.artifact_name,
        )
def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None):
    """
    Execute model-related tasks: training, evaluation, and/or forecasting.

    Initializes a W&B run (project and config are ignored when running inside
    a sweep, where wandb supplies them), then dispatches to the requested tasks.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
            (Name shadows the builtin but is kept for caller compatibility.)
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load
            for evaluation or forecasting.
    """
    start_t = time.time()

    # Initialize WandB; project and config are ignored when running a sweep.
    with wandb.init(project=project, entity="views_pipeline", config=config):

        # Register the custom monthly metrics with WandB before any logging.
        add_wandb_monthly_metrics()

        # Use the (possibly sweep-supplied) config from the wandb init above.
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters,
        # so sweep configs carry flat "cls_"/"reg_" keys that are re-nested here.
        # BUG FIX: the original compared against "HurdleRegression", a value not
        # used anywhere else in this pipeline (configs and get_model use
        # "HurdleModel"), so hurdle sweep parameters were never split.
        if config["sweep"] and config["algorithm"] == "HurdleModel":
            config["parameters"] = {}
            config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config)

        if config["sweep"]:
            logger.info(f"Sweeping model {config['name']}...")
            stepshift_model = train_model_artifact(config)
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact.
        if train:
            logger.info(f"Training model {config['name']}...")
            train_model_artifact(config)

        # Handle the single model runs: evaluate a trained model (artifact).
        if eval:
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {config['name']}...")
            forecast_model_artifact(config, artifact_name)

        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")
def evaluate_model_artifact(config, artifact_name):
    """
    Evaluate a saved model artifact and persist predictions, outputs and metrics.

    Loads the raw viewser data and a pickled stepshift model (either the
    artifact named on the CLI or the latest artifact for the run type),
    predicts, standardizes the predictions, computes evaluation metrics,
    logs them to W&B and writes outputs plus a log file to disk.

    Args:
        config: Run configuration dict; must contain "name" and "run_type".
            config["timestamp"] is set from the artifact filename as a side effect.
        artifact_name: Optional artifact filename (".pkl" appended if missing);
            falls back to the latest run-type-specific artifact when falsy.

    Raises:
        FileNotFoundError: If the model artifact does not exist.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # Artifact filenames end with a YYYYmmdd_HHMMSS timestamp (15 chars).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        # BUG FIX: the original logged and fell through, which raised a
        # confusing NameError on the next line; propagate the real error.
        logger.exception(f"Model artifact not found at {path_artifact}")
        raise

    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, path_generated, config)
    save_predictions(df, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
def evaluate_sweep(config, stepshift_model):
    """
    Evaluate a sweep-trained model and log MSE plus the standard metric dict to W&B.

    Args:
        config: Run configuration; must contain "name", "run_type", "steps"
            and "depvar".
        stepshift_model: Trained stepshift model exposing predict(run_type, df).
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    run_type = config["run_type"]
    steps = config["steps"]

    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)

    # Temporarily keep this because the metric to minimize is MSE.
    pred_cols = [f"step_pred_{str(i)}" for i in steps]
    # BUG FIX: the target vector was hard-coded to length 36; use len(pred_cols)
    # so the row-wise MSE stays valid when config["steps"] is not 1..36
    # (sklearn raises on mismatched lengths).
    df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * len(pred_cols),
                                                        [row[col] for col in pred_cols]), axis=1)

    wandb.log({"MSE": df["mse"].mean()})

    evaluation, _ = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)
f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/fast_car/src/utils/utils_run.py b/models/fast_car/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/fast_car/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/fast_car/src/visualization/.gitkeep b/models/fast_car/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fluorescent_adolescent/README.md b/models/fluorescent_adolescent/README.md new file mode 100644 index 00000000..51ce6360 --- /dev/null +++ b/models/fluorescent_adolescent/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: fluorescent_adolescent +## Created on: 2024-11-05 15:12:57.752756 \ No newline at end of file diff --git a/models/fluorescent_adolescent/artifacts/.gitkeep 
def get_deployment_config():
    """
    Return the deployment configuration for this model.

    "deployment_status" is one of: shadow, deployed, baseline, deprecated.
    """
    return {'deployment_status': 'shadow'}


def get_hp_config():
    """
    Return the hyperparameter configuration used for model training.

    Operational: changing these values changes training behavior.
    """
    return {
        "steps": list(range(1, 37)),
        "parameters": {
            "clf": {"n_estimators": 100, "n_jobs": -2, "learning_rate": 0.05},
            "reg": {"n_estimators": 100, "n_jobs": -2, "learning_rate": 0.05},
        },
    }


def get_meta_config():
    """
    Return model metadata (algorithm, name, target variable, level of
    analysis). Documentation only; no effect on training or evaluation.
    """
    return {
        "name": "fluorescent_adolescent",
        "algorithm": "HurdleModel",
        "model_clf": "XGBModel",
        "model_reg": "XGBModel",
        "depvar": "ln_ged_sb_dep",
        "queryset": "fatalities003_joint_narrow",
        "level": "cm",
        "creator": "Marina",
    }


def get_sweep_config():
    """
    Return the WandB sweep configuration: grid search minimizing MSE over
    the configured step range.
    """
    return {
        'method': 'grid',
        'name': 'fluorescent_adolescent',
        'metric': {'name': 'MSE', 'goal': 'minimize'},
        'parameters': {'steps': {'values': [list(range(1, 37))]}},
    }
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/fluorescent_adolescent/notebooks/.gitkeep b/models/fluorescent_adolescent/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fluorescent_adolescent/reports/.gitkeep b/models/fluorescent_adolescent/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fluorescent_adolescent/src/architectures/.gitkeep b/models/fluorescent_adolescent/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fluorescent_adolescent/src/dataloaders/get_data.py b/models/fluorescent_adolescent/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/fluorescent_adolescent/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/fluorescent_adolescent/src/forecasting/generate_forecast.py b/models/fluorescent_adolescent/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/fluorescent_adolescent/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime 
+import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/fluorescent_adolescent/src/management/execute_model_runs.py 
b/models/fluorescent_adolescent/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/fluorescent_adolescent/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git 
def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None):
    """
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load for
            evaluation or forecasting.
    """
    start_t = time.time()

    # Initialize WandB (project and config are ignored when running a sweep).
    with wandb.init(project=project, entity="views_pipeline", config=config):

        # add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from WandB initialization above
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters,
        # so sweep configs carry flat "cls_"/"reg_" keys that are re-nested here.
        # BUG FIX: the algorithm is named "HurdleModel" everywhere else in this
        # pipeline (config_meta, get_model); the original compared against
        # "HurdleRegression", so the split never ran during sweeps.
        if config["sweep"] and config["algorithm"] == "HurdleModel":
            config["parameters"] = {}
            config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config)

        if config["sweep"]:
            logger.info(f"Sweeping model {config['name']}...")
            stepshift_model = train_model_artifact(config)
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact
        if train:
            logger.info(f"Training model {config['name']}...")
            train_model_artifact(config)

        # Handle the single model runs: evaluate a trained model (artifact)
        if eval:
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {config['name']}...")
            forecast_model_artifact(config, artifact_name)

        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")
def evaluate_model_artifact(config, artifact_name):
    """
    Load a trained model artifact, evaluate it and save outputs/predictions.

    Args:
        config: Run configuration; must provide "name" and "run_type".
        artifact_name: Optional explicit artifact file name; when falsy the
            latest artifact for the run type is used.

    Raises:
        FileNotFoundError: If the model artifact cannot be read.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        # Renamed local from UPPER_CASE to snake_case (PEP 8; it is not a
        # constant) and to match generate_forecast.py.
        path_artifact = path_artifacts / artifact_name
    else:
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # Artifact file names end in a "%Y%m%d_%H%M%S" timestamp (15 chars).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        # BUG FIX: the original swallowed the error and then hit a NameError
        # on `stepshift_model`; re-raise after logging instead.
        logger.exception(f"Model artifact not found at {path_artifact}")
        raise

    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, path_generated, config)
    save_predictions(df, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
generate_metric_dict + + +def evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/fluorescent_adolescent/src/online_evaluation/.gitkeep b/models/fluorescent_adolescent/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/fluorescent_adolescent/src/training/train_model.py b/models/fluorescent_adolescent/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/fluorescent_adolescent/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" 
def get_model(config, partitioner_dict):
    """
    Build the stepshift model selected by config["algorithm"].

    "HurdleModel" maps to HurdleModel; any other algorithm name is used as
    the regression model inside a plain StepshifterModel.
    """
    if config["algorithm"] == "HurdleModel":
        model = HurdleModel(config, partitioner_dict)
    else:
        config["model_reg"] = config["algorithm"]
        model = StepshifterModel(config, partitioner_dict)

    return model


def get_standardized_df(df, config):
    """
    Standardize the prediction DataFrame for the given run type.

    Keeps the relevant columns, replaces +/-inf with 0 and clips negative
    values to 0.

    Raises:
        ValueError: If config["run_type"] is not "calibration", "testing"
            or "forecasting".
    """
    run_type = config["run_type"]
    steps = config["steps"]
    depvar = config["depvar"]

    # choose the columns to keep based on the run type and replace negative values with 0
    if run_type in ["calibration", "testing"]:
        cols = [depvar] + df.forecasts.prediction_columns
    elif run_type == "forecasting":
        cols = [f"step_pred_{i}" for i in steps]
    else:
        # BUG FIX: an unknown run type previously fell through and raised a
        # confusing NameError on `cols`; fail loudly and explicitly instead.
        raise ValueError(f"Unknown run type: {run_type!r}")
    df = df.replace([np.inf, -np.inf], 0)[cols]
    df = df.mask(df < 0, 0)
    return df


def split_hurdle_parameters(parameters_dict):
    """
    Split the parameters dictionary into two separate dictionaries, one for
    the classification model and one for the regression model.

    Keys prefixed "cls_"/"reg_" are routed to the respective dict with the
    prefix stripped; other keys are ignored.
    """
    cls_dict = {}
    reg_dict = {}

    for key, value in parameters_dict.items():
        if key.startswith("cls_"):
            cls_dict[key.replace("cls_", "")] = value
        elif key.startswith("reg_"):
            reg_dict[key.replace("reg_", "")] = value

    return cls_dict, reg_dict


def update_config(hp_config, meta_config, dp_config, args):
    """Merge hyperparameter, meta and deployment configs into one run config."""
    config = hp_config.copy()
    config["run_type"] = args.run_type
    config["sweep"] = False
    config["name"] = meta_config["name"]
    config["depvar"] = meta_config["depvar"]
    config["algorithm"] = meta_config["algorithm"]
    if meta_config["algorithm"] == "HurdleModel":
        config["model_clf"] = meta_config["model_clf"]
        config["model_reg"] = meta_config["model_reg"]
    config["deployment_status"] = dp_config["deployment_status"]

    return config


def update_sweep_config(sweep_config, args, meta_config):
    """Inject run metadata into the sweep's parameter space in place."""
    sweep_config["parameters"]["run_type"] = {"value": args.run_type}
    sweep_config["parameters"]["sweep"] = {"value": True}
    sweep_config["parameters"]["name"] = {"value": meta_config["name"]}
    sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]}
    sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]}
    if meta_config["algorithm"] == "HurdleModel":
        sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]}
        sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]}
def get_deployment_config():
    """
    Return the deployment configuration for this model.

    "deployment_status" is one of: shadow, deployed, baseline, deprecated.
    """
    return {'deployment_status': 'shadow'}


def get_hp_config():
    """
    Return the hyperparameter configuration used for model training.

    Operational: changing these values changes training behavior.
    """
    return {
        "steps": list(range(1, 37)),
        "parameters": {
            "n_estimators": 250,
            "n_jobs": 12,
        },
    }


def get_meta_config():
    """
    Return model metadata (algorithm, name, target variable, level of
    analysis). Documentation only; no effect on training or evaluation.
    """
    return {
        "name": "good_riddance",
        "algorithm": "RandomForestModel",
        "depvar": "ln_ged_sb_dep",
        "queryset": "fatalities003_joint_narrow",
        "level": "cm",
        "creator": "Marina",
    }


def get_sweep_config():
    """
    Return the WandB sweep configuration: grid search minimizing MSE over
    the configured step range.
    """
    return {
        'method': 'grid',
        'name': 'good_riddance',
        'metric': {'name': 'MSE', 'goal': 'minimize'},
        'parameters': {'steps': {'values': [list(range(1, 37))]}},
    }
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/good_riddance/notebooks/.gitkeep b/models/good_riddance/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/good_riddance/reports/.gitkeep b/models/good_riddance/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/good_riddance/src/architectures/.gitkeep b/models/good_riddance/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/good_riddance/src/dataloaders/get_data.py b/models/good_riddance/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/good_riddance/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/good_riddance/src/forecasting/generate_forecast.py b/models/good_riddance/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/good_riddance/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file 
+from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/good_riddance/src/management/execute_model_runs.py b/models/good_riddance/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ 
b/models/good_riddance/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/good_riddance/src/management/execute_model_tasks.py b/models/good_riddance/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ 
b/models/good_riddance/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/good_riddance/src/offline_evaluation/evaluate_model.py b/models/good_riddance/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/good_riddance/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/good_riddance/src/offline_evaluation/evaluate_sweep.py b/models/good_riddance/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/good_riddance/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/good_riddance/src/online_evaluation/.gitkeep b/models/good_riddance/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/good_riddance/src/training/train_model.py b/models/good_riddance/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/good_riddance/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + 
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/good_riddance/src/utils/utils_run.py b/models/good_riddance/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/good_riddance/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/good_riddance/src/visualization/.gitkeep b/models/good_riddance/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/README.md b/models/green_squirrel/README.md new file mode 100644 index 00000000..11a01e49 --- /dev/null +++ b/models/green_squirrel/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: green_squirrel +## Created on: 2024-11-05 13:46:57.758958 \ No newline at end of file diff --git a/models/green_squirrel/artifacts/.gitkeep 
b/models/green_squirrel/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/configs/config_deployment.py b/models/green_squirrel/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/green_squirrel/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/green_squirrel/configs/config_hyperparameters.py b/models/green_squirrel/configs/config_hyperparameters.py new file mode 100644 index 00000000..175a66f1 --- /dev/null +++ b/models/green_squirrel/configs/config_hyperparameters.py @@ -0,0 +1,24 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "clf": { + "n_estimators": 250, + "n_jobs": -2 + }, + "reg": { + "n_estimators": 250, + "n_jobs": -2 + } + } + } + return hyperparameters diff --git a/models/green_squirrel/configs/config_meta.py b/models/green_squirrel/configs/config_meta.py new file mode 100644 index 00000000..fb89f006 --- /dev/null +++ b/models/green_squirrel/configs/config_meta.py @@ -0,0 +1,20 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "green_squirrel", + "algorithm": "HurdleModel", + "model_clf": "RandomForestModel", + "model_reg": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_joint_broad", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/green_squirrel/configs/config_sweep.py b/models/green_squirrel/configs/config_sweep.py new file mode 100644 index 00000000..fd87bf13 --- /dev/null +++ b/models/green_squirrel/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'green_squirrel' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/green_squirrel/data/generated/.gitkeep b/models/green_squirrel/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/data/processed/.gitkeep b/models/green_squirrel/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/data/raw/.gitkeep b/models/green_squirrel/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/main.py b/models/green_squirrel/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/green_squirrel/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/green_squirrel/notebooks/.gitkeep b/models/green_squirrel/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/reports/.gitkeep b/models/green_squirrel/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/src/architectures/.gitkeep b/models/green_squirrel/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/src/dataloaders/get_data.py b/models/green_squirrel/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/green_squirrel/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/green_squirrel/src/forecasting/generate_forecast.py b/models/green_squirrel/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/green_squirrel/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, 
read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/green_squirrel/src/management/execute_model_runs.py b/models/green_squirrel/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ 
b/models/green_squirrel/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/green_squirrel/src/management/execute_model_tasks.py b/models/green_squirrel/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ 
b/models/green_squirrel/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/green_squirrel/src/offline_evaluation/evaluate_model.py b/models/green_squirrel/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/green_squirrel/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/green_squirrel/src/offline_evaluation/evaluate_sweep.py b/models/green_squirrel/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/green_squirrel/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/green_squirrel/src/online_evaluation/.gitkeep b/models/green_squirrel/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/green_squirrel/src/training/train_model.py b/models/green_squirrel/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/green_squirrel/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + 
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/green_squirrel/src/utils/utils_run.py b/models/green_squirrel/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/green_squirrel/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/green_squirrel/src/visualization/.gitkeep b/models/green_squirrel/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/README.md b/models/heavy_rotation/README.md new file mode 100644 index 00000000..b838697d --- /dev/null +++ b/models/heavy_rotation/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: heavy_rotation +## Created on: 2024-11-05 11:47:36.415527 \ No newline at end of file diff --git a/models/heavy_rotation/artifacts/.gitkeep 
b/models/heavy_rotation/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/configs/config_deployment.py b/models/heavy_rotation/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/heavy_rotation/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/heavy_rotation/configs/config_hyperparameters.py b/models/heavy_rotation/configs/config_hyperparameters.py new file mode 100644 index 00000000..47c85547 --- /dev/null +++ b/models/heavy_rotation/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 250, + "n_jobs": 12 + } + } + return hyperparameters diff --git a/models/heavy_rotation/configs/config_meta.py b/models/heavy_rotation/configs/config_meta.py new file mode 100644 index 00000000..6dd96e53 --- /dev/null +++ b/models/heavy_rotation/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "heavy_rotation", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_joint_broad", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/heavy_rotation/configs/config_sweep.py b/models/heavy_rotation/configs/config_sweep.py new file mode 100644 index 00000000..36eafd97 --- /dev/null +++ b/models/heavy_rotation/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'heavy_rotation' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/heavy_rotation/data/generated/.gitkeep b/models/heavy_rotation/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/data/processed/.gitkeep b/models/heavy_rotation/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/data/raw/.gitkeep b/models/heavy_rotation/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/main.py b/models/heavy_rotation/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/heavy_rotation/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/heavy_rotation/notebooks/.gitkeep b/models/heavy_rotation/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/reports/.gitkeep b/models/heavy_rotation/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/src/architectures/.gitkeep b/models/heavy_rotation/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/src/dataloaders/get_data.py b/models/heavy_rotation/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/heavy_rotation/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/heavy_rotation/src/forecasting/generate_forecast.py b/models/heavy_rotation/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/heavy_rotation/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, 
read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/heavy_rotation/src/management/execute_model_runs.py b/models/heavy_rotation/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ 
b/models/heavy_rotation/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/heavy_rotation/src/management/execute_model_tasks.py b/models/heavy_rotation/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ 
b/models/heavy_rotation/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/heavy_rotation/src/offline_evaluation/evaluate_model.py b/models/heavy_rotation/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/heavy_rotation/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/heavy_rotation/src/offline_evaluation/evaluate_sweep.py b/models/heavy_rotation/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/heavy_rotation/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/heavy_rotation/src/online_evaluation/.gitkeep b/models/heavy_rotation/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/heavy_rotation/src/training/train_model.py b/models/heavy_rotation/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/heavy_rotation/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + 
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/heavy_rotation/src/utils/utils_run.py b/models/heavy_rotation/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/heavy_rotation/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/heavy_rotation/src/visualization/.gitkeep b/models/heavy_rotation/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/README.md b/models/high_hopes/README.md new file mode 100644 index 00000000..626381a3 --- /dev/null +++ b/models/high_hopes/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: high_hopes +## Created on: 2024-11-01 17:18:35.973196 \ No newline at end of file diff --git a/models/high_hopes/artifacts/.gitkeep b/models/high_hopes/artifacts/.gitkeep new file mode 
100644 index 00000000..e69de29b diff --git a/models/high_hopes/configs/config_deployment.py b/models/high_hopes/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/high_hopes/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/high_hopes/configs/config_hyperparameters.py b/models/high_hopes/configs/config_hyperparameters.py new file mode 100644 index 00000000..5cbe5a6d --- /dev/null +++ b/models/high_hopes/configs/config_hyperparameters.py @@ -0,0 +1,22 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + 'steps': [*range(1, 36 + 1, 1)], + "parameters": { + "clf": { + "n_estimators": 250, + }, + "reg": { + "n_estimators": 250, + } + } + } + return hyperparameters diff --git a/models/high_hopes/configs/config_meta.py b/models/high_hopes/configs/config_meta.py new file mode 100644 index 00000000..0536ae84 --- /dev/null +++ b/models/high_hopes/configs/config_meta.py @@ -0,0 +1,20 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "high_hopes", + "algorithm": "HurdleModel", + "model_clf": "LightGBMModel", + "model_reg": "LightGBMModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_conflict_history", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/high_hopes/configs/config_sweep.py b/models/high_hopes/configs/config_sweep.py new file mode 100644 index 00000000..bcdc9f19 --- /dev/null +++ b/models/high_hopes/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'high_hopes' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/high_hopes/data/generated/.gitkeep b/models/high_hopes/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/data/processed/.gitkeep b/models/high_hopes/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/data/raw/.gitkeep b/models/high_hopes/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/main.py b/models/high_hopes/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/high_hopes/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/high_hopes/notebooks/.gitkeep b/models/high_hopes/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/reports/.gitkeep b/models/high_hopes/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/src/architectures/.gitkeep b/models/high_hopes/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/src/dataloaders/get_data.py b/models/high_hopes/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/high_hopes/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/high_hopes/src/forecasting/generate_forecast.py b/models/high_hopes/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/high_hopes/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import 
get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/high_hopes/src/management/execute_model_runs.py b/models/high_hopes/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/high_hopes/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import 
wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/high_hopes/src/management/execute_model_tasks.py b/models/high_hopes/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/high_hopes/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging 
+import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/high_hopes/src/offline_evaluation/evaluate_model.py b/models/high_hopes/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/high_hopes/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/high_hopes/src/offline_evaluation/evaluate_sweep.py b/models/high_hopes/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/high_hopes/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/high_hopes/src/online_evaluation/.gitkeep b/models/high_hopes/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/high_hopes/src/training/train_model.py b/models/high_hopes/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/high_hopes/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = 
read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/high_hopes/src/utils/utils_run.py b/models/high_hopes/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/high_hopes/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/high_hopes/src/visualization/.gitkeep b/models/high_hopes/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/invisible_string/configs/config_meta.py b/models/invisible_string/configs/config_meta.py index 6ffa4754..da7d0673 100644 --- a/models/invisible_string/configs/config_meta.py +++ b/models/invisible_string/configs/config_meta.py @@ -11,7 +11,7 @@ def get_meta_config(): "name": "invisible_string", "algorithm": "LightGBMModel", "depvar": "ln_ged_sb_dep", - "queryset": 
"fatalities002_pgm_broad", + "queryset": "fatalities003_pgm_broad", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/lavender_haze/configs/config_meta.py b/models/lavender_haze/configs/config_meta.py index e4293995..e7ca1fc5 100644 --- a/models/lavender_haze/configs/config_meta.py +++ b/models/lavender_haze/configs/config_meta.py @@ -12,7 +12,7 @@ def get_meta_config(): "model_clf": "LGBMClassifier", "model_reg": "LGBMRegressor", "depvar": "ln_ged_sb_dep", # IMPORTANT! The current stepshift only takes one target variable! Not compatiable with Simon's code! - "queryset": "fatalities002_pgm_broad", + "queryset": "fatalities003_pgm_broad", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/little_lies/README.md b/models/little_lies/README.md new file mode 100644 index 00000000..a347365a --- /dev/null +++ b/models/little_lies/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: little_lies +## Created on: 2024-11-05 11:44:56.489805 \ No newline at end of file diff --git a/models/little_lies/artifacts/.gitkeep b/models/little_lies/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/configs/config_deployment.py b/models/little_lies/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/little_lies/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. 
+ +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/little_lies/configs/config_hyperparameters.py b/models/little_lies/configs/config_hyperparameters.py new file mode 100644 index 00000000..1fd2b86c --- /dev/null +++ b/models/little_lies/configs/config_hyperparameters.py @@ -0,0 +1,23 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "clf": { + "n_estimators": 250, + }, + "reg": { + "n_estimators": 250, + } + } + } + return hyperparameters + diff --git a/models/little_lies/configs/config_meta.py b/models/little_lies/configs/config_meta.py new file mode 100644 index 00000000..133c5ecb --- /dev/null +++ b/models/little_lies/configs/config_meta.py @@ -0,0 +1,20 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. 
+ """ + + meta_config = { + "name": "little_lies", + "algorithm": "HurdleModel", + "model_clf": "LightGBMModel", + "model_reg": "LightGBMModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_joint_narrow", + "level": "cm", + "creator": "Marina" + } + return meta_config diff --git a/models/little_lies/configs/config_sweep.py b/models/little_lies/configs/config_sweep.py new file mode 100644 index 00000000..30ee5b05 --- /dev/null +++ b/models/little_lies/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. + """ + + sweep_config = { + 'method': 'grid', + 'name': 'little_lies' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/little_lies/data/generated/.gitkeep b/models/little_lies/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/data/processed/.gitkeep b/models/little_lies/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/data/raw/.gitkeep b/models/little_lies/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/main.py b/models/little_lies/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/little_lies/main.py @@ -0,0 +1,38 @@ +import wandb +import sys 
+import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/little_lies/notebooks/.gitkeep b/models/little_lies/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/reports/.gitkeep b/models/little_lies/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/src/architectures/.gitkeep b/models/little_lies/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/src/dataloaders/get_data.py b/models/little_lies/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/little_lies/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, 
self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/little_lies/src/forecasting/generate_forecast.py b/models/little_lies/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/little_lies/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/little_lies/src/management/execute_model_runs.py b/models/little_lies/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/little_lies/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + 
update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/little_lies/src/management/execute_model_tasks.py b/models/little_lies/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/little_lies/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, 
artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. + """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if 
forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n") diff --git a/models/little_lies/src/offline_evaluation/evaluate_model.py b/models/little_lies/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/little_lies/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/little_lies/src/offline_evaluation/evaluate_sweep.py b/models/little_lies/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/little_lies/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/little_lies/src/online_evaluation/.gitkeep b/models/little_lies/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/little_lies/src/training/train_model.py b/models/little_lies/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/little_lies/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = 
read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/little_lies/src/utils/utils_run.py b/models/little_lies/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/little_lies/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/little_lies/src/visualization/.gitkeep b/models/little_lies/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/midnight_rain/configs/config_meta.py b/models/midnight_rain/configs/config_meta.py index eb8ddf8d..cba41347 100644 --- a/models/midnight_rain/configs/config_meta.py +++ b/models/midnight_rain/configs/config_meta.py @@ -11,7 +11,7 @@ def get_meta_config(): "name": "midnight_rain", "algorithm": "LightGBMModel", "depvar": "ln_ged_sb_dep", - "queryset": "fatalities002_pgm_escwa_drought", + 
"queryset": "fatalities003_pgm_escwa_drought", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/national_anthem/README.md b/models/national_anthem/README.md new file mode 100644 index 00000000..ce892fb3 --- /dev/null +++ b/models/national_anthem/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: national_anthem +## Created on: 2024-11-05 09:50:36.060516 \ No newline at end of file diff --git a/models/national_anthem/artifacts/.gitkeep b/models/national_anthem/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/configs/config_deployment.py b/models/national_anthem/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/national_anthem/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/national_anthem/configs/config_hyperparameters.py b/models/national_anthem/configs/config_hyperparameters.py new file mode 100644 index 00000000..b1eb0a5e --- /dev/null +++ b/models/national_anthem/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. 
+ + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 300, + "n_jobs": 12 + } + } + return hyperparameters diff --git a/models/national_anthem/configs/config_meta.py b/models/national_anthem/configs/config_meta.py new file mode 100644 index 00000000..f8a56c3e --- /dev/null +++ b/models/national_anthem/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "national_anthem", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_wdi_short", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/national_anthem/configs/config_sweep.py b/models/national_anthem/configs/config_sweep.py new file mode 100644 index 00000000..e8d63738 --- /dev/null +++ b/models/national_anthem/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'national_anthem' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/national_anthem/data/generated/.gitkeep b/models/national_anthem/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/data/processed/.gitkeep b/models/national_anthem/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/data/raw/.gitkeep b/models/national_anthem/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/main.py b/models/national_anthem/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/national_anthem/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/national_anthem/notebooks/.gitkeep b/models/national_anthem/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/reports/.gitkeep b/models/national_anthem/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/src/architectures/.gitkeep b/models/national_anthem/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/src/dataloaders/get_data.py b/models/national_anthem/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/national_anthem/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/national_anthem/src/forecasting/generate_forecast.py b/models/national_anthem/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/national_anthem/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import 
create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/national_anthem/src/management/execute_model_runs.py b/models/national_anthem/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ 
b/models/national_anthem/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/national_anthem/src/management/execute_model_tasks.py b/models/national_anthem/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ 
b/models/national_anthem/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/national_anthem/src/offline_evaluation/evaluate_model.py b/models/national_anthem/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/national_anthem/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/national_anthem/src/offline_evaluation/evaluate_sweep.py b/models/national_anthem/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/national_anthem/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/national_anthem/src/online_evaluation/.gitkeep b/models/national_anthem/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/national_anthem/src/training/train_model.py b/models/national_anthem/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/national_anthem/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + 
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/national_anthem/src/utils/utils_run.py b/models/national_anthem/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/national_anthem/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/national_anthem/src/visualization/.gitkeep b/models/national_anthem/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/old_money/configs/config_meta.py b/models/old_money/configs/config_meta.py index 35e815da..8fc23896 100644 --- a/models/old_money/configs/config_meta.py +++ b/models/old_money/configs/config_meta.py @@ -12,7 +12,7 @@ def get_meta_config(): "model_clf": "LightGBMModel", "model_reg": "LightGBMModel", "depvar": "ln_ged_sb_dep", # IMPORTANT! 
The current stepshift only takes one target variable! Not compatiable with Simon's code! - "queryset": "fatalities002_pgm_escwa_drought", + "queryset": "fatalities003_pgm_escwa_drought", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/ominous_ox/README.md b/models/ominous_ox/README.md new file mode 100644 index 00000000..bed5204e --- /dev/null +++ b/models/ominous_ox/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: ominous_ox +## Created on: 2024-11-01 14:35:14.982193 \ No newline at end of file diff --git a/models/ominous_ox/artifacts/.gitkeep b/models/ominous_ox/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/configs/config_deployment.py b/models/ominous_ox/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/ominous_ox/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/ominous_ox/configs/config_hyperparameters.py b/models/ominous_ox/configs/config_hyperparameters.py new file mode 100644 index 00000000..73eab15d --- /dev/null +++ b/models/ominous_ox/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. 
+ This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + 'steps': [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 250, + "n_jobs": 12 + } + } + return hyperparameters diff --git a/models/ominous_ox/configs/config_meta.py b/models/ominous_ox/configs/config_meta.py new file mode 100644 index 00000000..d3a699b3 --- /dev/null +++ b/models/ominous_ox/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "ominous_ox", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_conflict_history", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/ominous_ox/configs/config_sweep.py b/models/ominous_ox/configs/config_sweep.py new file mode 100644 index 00000000..0d9ba354 --- /dev/null +++ b/models/ominous_ox/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'ominous_ox' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/ominous_ox/data/generated/.gitkeep b/models/ominous_ox/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/data/processed/.gitkeep b/models/ominous_ox/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/data/raw/.gitkeep b/models/ominous_ox/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/main.py b/models/ominous_ox/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/ominous_ox/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/ominous_ox/notebooks/.gitkeep b/models/ominous_ox/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/reports/.gitkeep b/models/ominous_ox/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/src/architectures/.gitkeep b/models/ominous_ox/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/src/dataloaders/get_data.py b/models/ominous_ox/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/ominous_ox/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/ominous_ox/src/forecasting/generate_forecast.py b/models/ominous_ox/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/ominous_ox/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import 
get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/ominous_ox/src/management/execute_model_runs.py b/models/ominous_ox/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/ominous_ox/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import 
wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/ominous_ox/src/management/execute_model_tasks.py b/models/ominous_ox/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/ominous_ox/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging 
+import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/ominous_ox/src/offline_evaluation/evaluate_model.py b/models/ominous_ox/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/ominous_ox/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/ominous_ox/src/offline_evaluation/evaluate_sweep.py b/models/ominous_ox/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/ominous_ox/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/ominous_ox/src/online_evaluation/.gitkeep b/models/ominous_ox/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/ominous_ox/src/training/train_model.py b/models/ominous_ox/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/ominous_ox/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = 
read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/ominous_ox/src/utils/utils_run.py b/models/ominous_ox/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/ominous_ox/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/ominous_ox/src/visualization/.gitkeep b/models/ominous_ox/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/orange_pasta/configs/config_meta.py b/models/orange_pasta/configs/config_meta.py index f33cf6b9..bfaaed3e 100644 --- a/models/orange_pasta/configs/config_meta.py +++ b/models/orange_pasta/configs/config_meta.py @@ -10,7 +10,7 @@ def get_meta_config(): "name": "orange_pasta", "algorithm": "LightGBMModel", "depvar": "ln_ged_sb_dep", # IMPORTANT! 
The current stepshift only takes one target variable! Not compatiable with Simon's code! - "queryset": "fatalities002_pgm_baseline", + "queryset": "fatalities003_pgm_baseline", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/plastic_beach/README.md b/models/plastic_beach/README.md new file mode 100644 index 00000000..c6b4ed6f --- /dev/null +++ b/models/plastic_beach/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: plastic_beach +## Created on: 2024-11-04 17:21:58.769822 \ No newline at end of file diff --git a/models/plastic_beach/artifacts/.gitkeep b/models/plastic_beach/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/configs/config_deployment.py b/models/plastic_beach/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/plastic_beach/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/plastic_beach/configs/config_hyperparameters.py b/models/plastic_beach/configs/config_hyperparameters.py new file mode 100644 index 00000000..73900504 --- /dev/null +++ b/models/plastic_beach/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. 
+ This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 300, + "n_jobs": 12, + } + } + return hyperparameters diff --git a/models/plastic_beach/configs/config_meta.py b/models/plastic_beach/configs/config_meta.py new file mode 100644 index 00000000..e64b3973 --- /dev/null +++ b/models/plastic_beach/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "plastic_beach", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_aquastat", + "level": "cm", + "creator": "Marina" + } + return meta_config diff --git a/models/plastic_beach/configs/config_sweep.py b/models/plastic_beach/configs/config_sweep.py new file mode 100644 index 00000000..a1463d55 --- /dev/null +++ b/models/plastic_beach/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'plastic_beach' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/plastic_beach/data/generated/.gitkeep b/models/plastic_beach/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/data/processed/.gitkeep b/models/plastic_beach/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/data/raw/.gitkeep b/models/plastic_beach/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/main.py b/models/plastic_beach/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/plastic_beach/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/plastic_beach/notebooks/.gitkeep b/models/plastic_beach/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/reports/.gitkeep b/models/plastic_beach/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/src/architectures/.gitkeep b/models/plastic_beach/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/plastic_beach/src/dataloaders/get_data.py b/models/plastic_beach/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/plastic_beach/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/plastic_beach/src/forecasting/generate_forecast.py b/models/plastic_beach/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/plastic_beach/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file 
+from utils_run import get_standardized_df
+from utils_save_outputs import save_predictions
+from utils_artifacts import get_latest_model_artifact
+
+logger = logging.getLogger(__name__)
+
+
+def forecast_model_artifact(config, artifact_name):
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    path_generated = model_path.data_generated
+    path_artifacts = model_path.artifacts
+    run_type = config["run_type"]
+
+    # if an artifact name is provided through the CLI, use it.
+    # Otherwise, get the latest model artifact based on the run type
+    if artifact_name:
+        logger.info(f"Using (non-default) artifact: {artifact_name}")
+
+        if not artifact_name.endswith(".pkl"):
+            artifact_name += ".pkl"
+        path_artifact = path_artifacts / artifact_name
+    else:
+        # use the latest model artifact based on the run type
+        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
+        path_artifact = get_latest_model_artifact(path_artifacts, run_type)
+
+    config["timestamp"] = path_artifact.stem[-15:]
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+
+    try:
+        stepshift_model = pd.read_pickle(path_artifact)
+    except FileNotFoundError:
+        logger.exception(f"Model artifact not found at {path_artifact}"); raise  # re-raise: stepshift_model is required below
+
+    df_predictions = stepshift_model.predict(run_type, df_viewser)
+    df_predictions = get_standardized_df(df_predictions, config)
+    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+
+    save_predictions(df_predictions, path_generated, config)
+    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
diff --git a/models/plastic_beach/src/management/execute_model_runs.py b/models/plastic_beach/src/management/execute_model_runs.py
new file mode 100644
index 00000000..d5b32a46
--- /dev/null
+++
b/models/plastic_beach/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/plastic_beach/src/management/execute_model_tasks.py b/models/plastic_beach/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ 
b/models/plastic_beach/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/plastic_beach/src/offline_evaluation/evaluate_model.py b/models/plastic_beach/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/plastic_beach/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+    # Otherwise, get the latest model artifact based on the run type
+    if artifact_name:
+        logger.info(f"Using (non-default) artifact: {artifact_name}")
+
+        if not artifact_name.endswith(".pkl"):
+            artifact_name += ".pkl"
+        PATH_ARTIFACT = path_artifacts / artifact_name
+    else:
+        # use the latest model artifact based on the run type
+        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
+        PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type)
+
+    config["timestamp"] = PATH_ARTIFACT.stem[-15:]
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+
+    try:
+        stepshift_model = pd.read_pickle(PATH_ARTIFACT)
+    except FileNotFoundError:
+        logger.exception(f"Model artifact not found at {PATH_ARTIFACT}"); raise  # re-raise: stepshift_model is required below
+
+    df = stepshift_model.predict(run_type, df_viewser)
+    df = get_standardized_df(df, config)
+    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+
+    _, df_output = generate_output_dict(df, config)
+    evaluation, df_evaluation = generate_metric_dict(df, config)
+    log_wandb_log_dict(config, evaluation)
+
+    save_model_outputs(df_evaluation, df_output, path_generated, config)
+    save_predictions(df, path_generated, config)
+    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
diff --git a/models/plastic_beach/src/offline_evaluation/evaluate_sweep.py b/models/plastic_beach/src/offline_evaluation/evaluate_sweep.py
new file mode 100644
index 00000000..d6726cf0
--- /dev/null
+++ b/models/plastic_beach/src/offline_evaluation/evaluate_sweep.py
@@ -0,0 +1,28 @@
+import pandas as pd
+import wandb
+from sklearn.metrics import mean_squared_error
+from model_path import ModelPath
+from utils_run import get_standardized_df
+from utils_wandb import log_wandb_log_dict
+from utils_evaluation_metrics import generate_metric_dict
+
+
+def evaluate_sweep(config, stepshift_model):
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    run_type = config["run_type"]
+    steps = config["steps"]
+
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+    df = stepshift_model.predict(run_type, df_viewser)
+    df = get_standardized_df(df, config)
+
+    # Temporarily keep this because the metric to minimize is MSE
+    pred_cols = [f"step_pred_{str(i)}" for i in steps]
+    df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * len(steps),  # was hard-coded 36; must match len(pred_cols)
+                                                        [row[col] for col in pred_cols]), axis=1)
+
+    wandb.log({"MSE": df["mse"].mean()})
+
+    evaluation, _ = generate_metric_dict(df, config)
+    log_wandb_log_dict(config, evaluation)
diff --git a/models/plastic_beach/src/online_evaluation/.gitkeep b/models/plastic_beach/src/online_evaluation/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/models/plastic_beach/src/training/train_model.py b/models/plastic_beach/src/training/train_model.py
new file mode 100644
index 00000000..f2342912
--- /dev/null
+++ b/models/plastic_beach/src/training/train_model.py
@@ -0,0 +1,33 @@
+from datetime import datetime
+import pandas as pd
+from model_path import ModelPath
+from utils_log_files import create_log_file, read_log_file
+from utils_run import get_model
+from set_partition import get_partitioner_dict
+from views_forecasts.extensions import *
+
+
+def train_model_artifact(config):
+    # print(config)
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    path_generated = model_path.data_generated
+    path_artifacts = model_path.artifacts
+    run_type = config["run_type"]
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+
+    stepshift_model = stepshift_training(config, run_type, df_viewser)
+    if not config["sweep"]:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        model_filename = f"{run_type}_model_{timestamp}.pkl"
+        stepshift_model.save(path_artifacts / model_filename)
+
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/plastic_beach/src/utils/utils_run.py b/models/plastic_beach/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/plastic_beach/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/plastic_beach/src/visualization/.gitkeep b/models/plastic_beach/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/README.md b/models/popular_monster/README.md new file mode 100644 index 00000000..8617c5ba --- /dev/null +++ b/models/popular_monster/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: popular_monster +## Created on: 2024-11-05 10:01:38.278277 \ No newline at end of file diff --git a/models/popular_monster/artifacts/.gitkeep 
b/models/popular_monster/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/configs/config_deployment.py b/models/popular_monster/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/popular_monster/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/popular_monster/configs/config_hyperparameters.py b/models/popular_monster/configs/config_hyperparameters.py new file mode 100644 index 00000000..47c85547 --- /dev/null +++ b/models/popular_monster/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 250, + "n_jobs": 12 + } + } + return hyperparameters diff --git a/models/popular_monster/configs/config_meta.py b/models/popular_monster/configs/config_meta.py new file mode 100644 index 00000000..56014941 --- /dev/null +++ b/models/popular_monster/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "popular_monster", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_topics", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/popular_monster/configs/config_sweep.py b/models/popular_monster/configs/config_sweep.py new file mode 100644 index 00000000..68e84385 --- /dev/null +++ b/models/popular_monster/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'popular_monster' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/popular_monster/data/generated/.gitkeep b/models/popular_monster/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/data/processed/.gitkeep b/models/popular_monster/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/data/raw/.gitkeep b/models/popular_monster/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/main.py b/models/popular_monster/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/popular_monster/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/popular_monster/notebooks/.gitkeep b/models/popular_monster/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/reports/.gitkeep b/models/popular_monster/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/src/architectures/.gitkeep b/models/popular_monster/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/popular_monster/src/dataloaders/get_data.py b/models/popular_monster/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/popular_monster/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/popular_monster/src/forecasting/generate_forecast.py b/models/popular_monster/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/popular_monster/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import 
create_log_file, read_log_file
+from utils_run import get_standardized_df
+from utils_save_outputs import save_predictions
+from utils_artifacts import get_latest_model_artifact
+
+logger = logging.getLogger(__name__)
+
+
+def forecast_model_artifact(config, artifact_name):
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    path_generated = model_path.data_generated
+    path_artifacts = model_path.artifacts
+    run_type = config["run_type"]
+
+    # if an artifact name is provided through the CLI, use it.
+    # Otherwise, get the latest model artifact based on the run type
+    if artifact_name:
+        logger.info(f"Using (non-default) artifact: {artifact_name}")
+
+        if not artifact_name.endswith(".pkl"):
+            artifact_name += ".pkl"
+        path_artifact = path_artifacts / artifact_name
+    else:
+        # use the latest model artifact based on the run type
+        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
+        path_artifact = get_latest_model_artifact(path_artifacts, run_type)
+
+    config["timestamp"] = path_artifact.stem[-15:]
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+
+    try:
+        stepshift_model = pd.read_pickle(path_artifact)
+    except FileNotFoundError:
+        logger.exception(f"Model artifact not found at {path_artifact}"); raise  # re-raise: stepshift_model is required below
+
+    df_predictions = stepshift_model.predict(run_type, df_viewser)
+    df_predictions = get_standardized_df(df_predictions, config)
+    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+
+    save_predictions(df_predictions, path_generated, config)
+    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
diff --git a/models/popular_monster/src/management/execute_model_runs.py b/models/popular_monster/src/management/execute_model_runs.py
new file mode 100644
index 00000000..d5b32a46
--- /dev/null
+++
b/models/popular_monster/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/popular_monster/src/management/execute_model_tasks.py b/models/popular_monster/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ 
b/models/popular_monster/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/popular_monster/src/offline_evaluation/evaluate_model.py b/models/popular_monster/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/popular_monster/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+    # Otherwise, get the latest model artifact based on the run type
+    if artifact_name:
+        logger.info(f"Using (non-default) artifact: {artifact_name}")
+
+        if not artifact_name.endswith(".pkl"):
+            artifact_name += ".pkl"
+        PATH_ARTIFACT = path_artifacts / artifact_name
+    else:
+        # use the latest model artifact based on the run type
+        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
+        PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type)
+
+    config["timestamp"] = PATH_ARTIFACT.stem[-15:]
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+
+    try:
+        stepshift_model = pd.read_pickle(PATH_ARTIFACT)
+    except FileNotFoundError:
+        logger.exception(f"Model artifact not found at {PATH_ARTIFACT}"); raise  # re-raise: stepshift_model is required below
+
+    df = stepshift_model.predict(run_type, df_viewser)
+    df = get_standardized_df(df, config)
+    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+
+    _, df_output = generate_output_dict(df, config)
+    evaluation, df_evaluation = generate_metric_dict(df, config)
+    log_wandb_log_dict(config, evaluation)
+
+    save_model_outputs(df_evaluation, df_output, path_generated, config)
+    save_predictions(df, path_generated, config)
+    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
diff --git a/models/popular_monster/src/offline_evaluation/evaluate_sweep.py b/models/popular_monster/src/offline_evaluation/evaluate_sweep.py
new file mode 100644
index 00000000..d6726cf0
--- /dev/null
+++ b/models/popular_monster/src/offline_evaluation/evaluate_sweep.py
@@ -0,0 +1,28 @@
+import pandas as pd
+import wandb
+from sklearn.metrics import mean_squared_error
+from model_path import ModelPath
+from utils_run import get_standardized_df
+from utils_wandb import log_wandb_log_dict
+from utils_evaluation_metrics import generate_metric_dict
+
+
+def evaluate_sweep(config, stepshift_model):
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    run_type = config["run_type"]
+    steps = config["steps"]
+
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+    df = stepshift_model.predict(run_type, df_viewser)
+    df = get_standardized_df(df, config)
+
+    # Temporarily keep this because the metric to minimize is MSE
+    pred_cols = [f"step_pred_{str(i)}" for i in steps]
+    df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * len(steps),  # was hard-coded 36; must match len(pred_cols)
+                                                        [row[col] for col in pred_cols]), axis=1)
+
+    wandb.log({"MSE": df["mse"].mean()})
+
+    evaluation, _ = generate_metric_dict(df, config)
+    log_wandb_log_dict(config, evaluation)
diff --git a/models/popular_monster/src/online_evaluation/.gitkeep b/models/popular_monster/src/online_evaluation/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/models/popular_monster/src/training/train_model.py b/models/popular_monster/src/training/train_model.py
new file mode 100644
index 00000000..f2342912
--- /dev/null
+++ b/models/popular_monster/src/training/train_model.py
@@ -0,0 +1,33 @@
+from datetime import datetime
+import pandas as pd
+from model_path import ModelPath
+from utils_log_files import create_log_file, read_log_file
+from utils_run import get_model
+from set_partition import get_partitioner_dict
+from views_forecasts.extensions import *
+
+
+def train_model_artifact(config):
+    # print(config)
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    path_generated = model_path.data_generated
+    path_artifacts = model_path.artifacts
+    run_type = config["run_type"]
+    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
+
+    stepshift_model = stepshift_training(config, run_type, df_viewser)
+    if not config["sweep"]:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        model_filename = f"{run_type}_model_{timestamp}.pkl"
+        stepshift_model.save(path_artifacts / model_filename)
+
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/popular_monster/src/utils/utils_run.py b/models/popular_monster/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/popular_monster/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/popular_monster/src/visualization/.gitkeep b/models/popular_monster/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/README.md b/models/teen_spirit/README.md new file mode 100644 index 00000000..61ba219c --- /dev/null +++ b/models/teen_spirit/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: teen_spirit +## Created on: 2024-11-04 16:25:28.260650 \ No newline at end of file diff --git a/models/teen_spirit/artifacts/.gitkeep b/models/teen_spirit/artifacts/.gitkeep new file 
mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/configs/config_deployment.py b/models/teen_spirit/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/teen_spirit/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/teen_spirit/configs/config_hyperparameters.py b/models/teen_spirit/configs/config_hyperparameters.py new file mode 100644 index 00000000..73900504 --- /dev/null +++ b/models/teen_spirit/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 300, + "n_jobs": 12, + } + } + return hyperparameters diff --git a/models/teen_spirit/configs/config_meta.py b/models/teen_spirit/configs/config_meta.py new file mode 100644 index 00000000..8e283608 --- /dev/null +++ b/models/teen_spirit/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "teen_spirit", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_faoprices", + "level": "cm", + "creator": "Marina" + } + return meta_config diff --git a/models/teen_spirit/configs/config_sweep.py b/models/teen_spirit/configs/config_sweep.py new file mode 100644 index 00000000..e70b4997 --- /dev/null +++ b/models/teen_spirit/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'teen_spirit' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/teen_spirit/data/generated/.gitkeep b/models/teen_spirit/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/data/processed/.gitkeep b/models/teen_spirit/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/data/raw/.gitkeep b/models/teen_spirit/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/main.py b/models/teen_spirit/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/teen_spirit/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/teen_spirit/notebooks/.gitkeep b/models/teen_spirit/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/reports/.gitkeep b/models/teen_spirit/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/src/architectures/.gitkeep b/models/teen_spirit/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/src/dataloaders/get_data.py b/models/teen_spirit/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/teen_spirit/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/teen_spirit/src/forecasting/generate_forecast.py b/models/teen_spirit/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/teen_spirit/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import 
get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/teen_spirit/src/management/execute_model_runs.py b/models/teen_spirit/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/teen_spirit/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ 
+import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/teen_spirit/src/management/execute_model_tasks.py b/models/teen_spirit/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/teen_spirit/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import 
logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/teen_spirit/src/offline_evaluation/evaluate_model.py b/models/teen_spirit/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/teen_spirit/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/teen_spirit/src/offline_evaluation/evaluate_sweep.py b/models/teen_spirit/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/teen_spirit/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/teen_spirit/src/online_evaluation/.gitkeep b/models/teen_spirit/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/teen_spirit/src/training/train_model.py b/models/teen_spirit/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/teen_spirit/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = 
read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/teen_spirit/src/utils/utils_run.py b/models/teen_spirit/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/teen_spirit/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/teen_spirit/src/visualization/.gitkeep b/models/teen_spirit/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/README.md b/models/twin_flame/README.md new file mode 100644 index 00000000..4394faa6 --- /dev/null +++ b/models/twin_flame/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: twin_flame +## Created on: 2024-11-05 11:17:17.580234 \ No newline at end of file diff --git a/models/twin_flame/artifacts/.gitkeep b/models/twin_flame/artifacts/.gitkeep new file mode 100644 
index 00000000..e69de29b diff --git a/models/twin_flame/configs/config_deployment.py b/models/twin_flame/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/twin_flame/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. +- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/twin_flame/configs/config_hyperparameters.py b/models/twin_flame/configs/config_hyperparameters.py new file mode 100644 index 00000000..ccc84e2a --- /dev/null +++ b/models/twin_flame/configs/config_hyperparameters.py @@ -0,0 +1,22 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. 
+ """ + + hyperparameters = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "clf": { + "n_estimators": 250, + }, + "reg": { + "n_estimators": 250, + } + } + } + return hyperparameters diff --git a/models/twin_flame/configs/config_meta.py b/models/twin_flame/configs/config_meta.py new file mode 100644 index 00000000..756f3027 --- /dev/null +++ b/models/twin_flame/configs/config_meta.py @@ -0,0 +1,20 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + + meta_config = { + "name": "twin_flame", + "algorithm": "HurdleModel", + "model_clf": "LightGBMModel", + "model_reg": "LightGBMModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_topics", + "level": "cm", + "creator": "Borbála" + } + return meta_config diff --git a/models/twin_flame/configs/config_sweep.py b/models/twin_flame/configs/config_sweep.py new file mode 100644 index 00000000..be6db096 --- /dev/null +++ b/models/twin_flame/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. 
+ """ + + sweep_config = { + 'method': 'grid', + 'name': 'twin_flame' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/twin_flame/data/generated/.gitkeep b/models/twin_flame/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/data/processed/.gitkeep b/models/twin_flame/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/data/raw/.gitkeep b/models/twin_flame/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/main.py b/models/twin_flame/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/twin_flame/main.py @@ -0,0 +1,38 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. 
Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/twin_flame/notebooks/.gitkeep b/models/twin_flame/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/reports/.gitkeep b/models/twin_flame/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/src/architectures/.gitkeep b/models/twin_flame/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/src/dataloaders/get_data.py b/models/twin_flame/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/twin_flame/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/twin_flame/src/forecasting/generate_forecast.py b/models/twin_flame/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/twin_flame/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import 
get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/twin_flame/src/management/execute_model_runs.py b/models/twin_flame/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/twin_flame/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import 
wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/twin_flame/src/management/execute_model_tasks.py b/models/twin_flame/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/twin_flame/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging 
+import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
+ """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. 
Runtime: {minutes:.3f} minutes.\n") diff --git a/models/twin_flame/src/offline_evaluation/evaluate_model.py b/models/twin_flame/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..0d86a87c --- /dev/null +++ b/models/twin_flame/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,55 @@ +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_save_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/twin_flame/src/offline_evaluation/evaluate_sweep.py b/models/twin_flame/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..d6726cf0 --- /dev/null +++ b/models/twin_flame/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,28 @@ +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def 
evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({"MSE": df["mse"].mean()}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/twin_flame/src/online_evaluation/.gitkeep b/models/twin_flame/src/online_evaluation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/twin_flame/src/training/train_model.py b/models/twin_flame/src/training/train_model.py new file mode 100644 index 00000000..f2342912 --- /dev/null +++ b/models/twin_flame/src/training/train_model.py @@ -0,0 +1,33 @@ +from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = 
read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model diff --git a/models/twin_flame/src/utils/utils_run.py b/models/twin_flame/src/utils/utils_run.py new file mode 100644 index 00000000..51901a01 --- /dev/null +++ b/models/twin_flame/src/utils/utils_run.py @@ -0,0 +1,83 @@ +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleModel": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{i}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. 
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/twin_flame/src/visualization/.gitkeep b/models/twin_flame/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/wildest_dream/configs/config_meta.py b/models/wildest_dream/configs/config_meta.py index 4ad053e0..79809464 100644 --- a/models/wildest_dream/configs/config_meta.py +++ b/models/wildest_dream/configs/config_meta.py @@ -12,7 +12,7 @@ def get_meta_config(): "model_clf": "XGBClassifier", "model_reg": "XGBRegressor", "depvar": "ln_ged_sb_dep", # IMPORTANT! 
The current stepshift only takes one target variable! Not compatiable with Simon's code! - "queryset": "fatalities002_pgm_conflict_sptime_dist", + "queryset": "fatalities003_pgm_conflict_sptime_dist", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/yellow_pikachu/configs/config_meta.py b/models/yellow_pikachu/configs/config_meta.py index 4cc54959..5877f3ea 100644 --- a/models/yellow_pikachu/configs/config_meta.py +++ b/models/yellow_pikachu/configs/config_meta.py @@ -12,7 +12,7 @@ def get_meta_config(): "model_clf": "XGBClassifier", "model_reg": "XGBRegressor", "depvar": "ln_ged_sb_dep", # IMPORTANT! The current stepshift only takes one target variable! Not compatiable with Simon's code! - "queryset": "fatalities002_pgm_conflict_treelag", + "queryset": "fatalities003_pgm_conflict_treelag", "level": "pgm", "creator": "Xiaolong" } diff --git a/models/yellow_submarine/README.md b/models/yellow_submarine/README.md new file mode 100644 index 00000000..248bf38b --- /dev/null +++ b/models/yellow_submarine/README.md @@ -0,0 +1,3 @@ +# Model README +## Model name: yellow_submarine +## Created on: 2024-11-04 15:01:22.637557 \ No newline at end of file diff --git a/models/yellow_submarine/artifacts/.gitkeep b/models/yellow_submarine/artifacts/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/configs/config_deployment.py b/models/yellow_submarine/configs/config_deployment.py new file mode 100644 index 00000000..9e45b735 --- /dev/null +++ b/models/yellow_submarine/configs/config_deployment.py @@ -0,0 +1,20 @@ +""" +Deployment Configuration Script + +This script defines the deployment configuration settings for the application. +It includes the deployment status and any additional settings specified. + +Deployment Status: +- shadow: The deployment is shadowed and not yet active. +- deployed: The deployment is active and in use. +- baseline: The deployment is in a baseline state, for reference or comparison. 
+- deprecated: The deployment is deprecated and no longer supported. + +Additional settings can be included in the configuration dictionary as needed. + +""" + +def get_deployment_config(): + # Deployment settings + deployment_config = {'deployment_status': 'shadow'} + return deployment_config diff --git a/models/yellow_submarine/configs/config_hyperparameters.py b/models/yellow_submarine/configs/config_hyperparameters.py new file mode 100644 index 00000000..b9336ae6 --- /dev/null +++ b/models/yellow_submarine/configs/config_hyperparameters.py @@ -0,0 +1,18 @@ + +def get_hp_config(): + """ + Contains the hyperparameter configurations for model training. + This configuration is "operational" so modifying these settings will impact the model's behavior during the training. + + Returns: + - hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase. + """ + + hyperparameters = { + 'steps': [*range(1, 36 + 1, 1)], + "parameters": { + "n_estimators": 300, + "n_jobs": 12, + } + } + return hyperparameters diff --git a/models/yellow_submarine/configs/config_meta.py b/models/yellow_submarine/configs/config_meta.py new file mode 100644 index 00000000..fa3effa4 --- /dev/null +++ b/models/yellow_submarine/configs/config_meta.py @@ -0,0 +1,18 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model algorithm, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. 
+ """ + + meta_config = { + "name": "yellow_submarine", + "algorithm": "RandomForestModel", + "depvar": "ln_ged_sb_dep", + "queryset": "fatalities003_imfweo", + "level": "cm", + "creator": "Marina" + } + return meta_config diff --git a/models/yellow_submarine/configs/config_sweep.py b/models/yellow_submarine/configs/config_sweep.py new file mode 100644 index 00000000..7670c7d4 --- /dev/null +++ b/models/yellow_submarine/configs/config_sweep.py @@ -0,0 +1,29 @@ + +def get_sweep_config(): + """ + Contains the configuration for hyperparameter sweeps using WandB. + This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance. + + Returns: + - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters. + """ + + sweep_config = { + 'method': 'grid', + 'name': 'yellow_submarine' + } + + # Example metric setup: + metric = { + 'name': 'MSE', + 'goal': 'minimize' + } + sweep_config['metric'] = metric + + # Example parameters setup: + parameters_dict = { + 'steps': {'values': [[*range(1, 36 + 1, 1)]]}, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/yellow_submarine/data/generated/.gitkeep b/models/yellow_submarine/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/data/processed/.gitkeep b/models/yellow_submarine/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/data/raw/.gitkeep b/models/yellow_submarine/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/main.py b/models/yellow_submarine/main.py new file mode 100644 index 00000000..36429ecb --- /dev/null +++ b/models/yellow_submarine/main.py @@ -0,0 +1,38 @@ +import wandb +import 
sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + model_name = ModelPath.get_model_name_from_path(PATH) + GlobalCache["current_model"] = model_name +except ImportError as e: + warnings.warn(f"ImportError: {e}. Some functionalities (model seperated log files) may not work properly.", ImportWarning) +except Exception as e: + warnings.warn(f"An unexpected error occurred: {e}.", RuntimeWarning) +logger = setup_logging("run.log") + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/yellow_submarine/notebooks/.gitkeep b/models/yellow_submarine/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/reports/.gitkeep b/models/yellow_submarine/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/src/architectures/.gitkeep b/models/yellow_submarine/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/yellow_submarine/src/dataloaders/get_data.py b/models/yellow_submarine/src/dataloaders/get_data.py new file mode 100644 index 00000000..4dcd20c8 --- /dev/null +++ b/models/yellow_submarine/src/dataloaders/get_data.py @@ -0,0 +1,14 @@ +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = 
logging.getLogger(__name__) + +def get_data(args, model_name, self_test): + model_path = ModelPath(model_name, validate=False) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, self_test, use_saved=args.saved) + logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + return data diff --git a/models/yellow_submarine/src/forecasting/generate_forecast.py b/models/yellow_submarine/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..c011a203 --- /dev/null +++ b/models/yellow_submarine/src/forecasting/generate_forecast.py @@ -0,0 +1,47 @@ +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_save_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {path_artifact}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/yellow_submarine/src/management/execute_model_runs.py b/models/yellow_submarine/src/management/execute_model_runs.py new file mode 100644 index 00000000..d5b32a46 --- /dev/null +++ b/models/yellow_submarine/src/management/execute_model_runs.py @@ -0,0 +1,51 @@ +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + 
update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, sweep_config["name"], args.drift_self_test) + + wandb.finish() + + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + with wandb.init(project=f'{project}_fetch', entity="views_pipeline"): + + get_data(args, config["name"], args.drift_self_test) + + wandb.finish() + + if args.run_type == 'calibration' or args.run_type == 'testing': + + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) + diff --git a/models/yellow_submarine/src/management/execute_model_tasks.py b/models/yellow_submarine/src/management/execute_model_tasks.py new file mode 100644 index 00000000..a913fa53 --- /dev/null +++ b/models/yellow_submarine/src/management/execute_model_tasks.py @@ -0,0 +1,70 @@ +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, 
def execute_model_tasks(config=None, project=None, train=None, eval=None,
                        forecast=None, artifact_name=None):
    """
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
            NOTE(review): shadows the `eval` builtin; renaming (e.g. to
            `evaluate`) would break keyword callers, so it is kept as-is.
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting.
    """

    start_t = time.time()

    # Initialize WandB
    with wandb.init(project=project, entity="views_pipeline",
                    config=config):  # project and config ignored when running a sweep

        # add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from WandB initialization above
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters
        # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs
        # NOTE(review): the config helpers elsewhere compare the algorithm
        # against "HurdleModel"; confirm that sweep configs really use the
        # string "HurdleRegression" here, otherwise this branch never fires.
        if config["sweep"] and config["algorithm"] == "HurdleRegression":
            config["parameters"] = {}
            config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config)

        # Sweep runs train and evaluate in one pass; the train/eval/forecast
        # flags below only gate the single (non-sweep) run tasks.
        if config["sweep"]:
            logger.info(f"Sweeping model {config['name']}...")
            stepshift_model = train_model_artifact(config)
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact
        if train:
            logger.info(f"Training model {config['name']}...")
            train_model_artifact(config)

        # Handle the single model runs: evaluate a trained model (artifact)
        if eval:
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {config['name']}...")
            forecast_model_artifact(config, artifact_name)

        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")
def evaluate_model_artifact(config, artifact_name):
    """
    Evaluate a trained model artifact and save outputs, predictions and logs.

    Loads the raw viewser data for the configured run type, resolves the model
    artifact (explicit name, or the latest one for the run type), generates
    predictions, computes evaluation metrics, logs them to W&B and writes all
    outputs to disk.

    Args:
        config: Configuration mapping with at least "name" and "run_type";
            config["timestamp"] is set here from the artifact filename.
        artifact_name: Specific artifact filename to evaluate, or a falsy value
            to fall back to the latest artifact for the run type.

    Raises:
        FileNotFoundError: If the resolved model artifact does not exist.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        # snake_case local (was PATH_ARTIFACT), matching the sibling modules.
        path_artifact = path_artifacts / artifact_name
    else:
        # use the latest model artifact based on the run type
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = get_latest_model_artifact(path_artifacts, run_type)

    # Artifact filenames end with a YYYYMMDD_HHMMSS timestamp (15 characters).
    config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        logger.exception(f"Model artifact not found at {path_artifact}")
        # Re-raise: without this, execution would continue and fail with a
        # confusing NameError on stepshift_model instead of the real cause.
        raise

    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, path_generated, config)
    save_predictions(df, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
def evaluate_sweep(config, stepshift_model):
    """
    Evaluate a sweep-trained model and log its metrics to W&B.

    Predicts on the raw viewser data for the configured run type, standardizes
    the predictions, logs the mean per-row MSE (the sweep's minimization
    target) and the full evaluation metric dict to W&B.

    Args:
        config: Configuration mapping with "name", "run_type", "steps" and
            "depvar" entries.
        stepshift_model: Trained model exposing predict(run_type, df).
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    run_type = config["run_type"]
    steps = config["steps"]

    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)

    # Temporarily keep this because the metric to minimize is MSE
    pred_cols = [f"step_pred_{str(i)}" for i in steps]
    # Repeat the observed value once per step column (len(pred_cols), not a
    # hard-coded 36) so the MSE works for any configured number of steps;
    # mean_squared_error raises on mismatched lengths.
    df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * len(pred_cols),
                                                        [row[col] for col in pred_cols]), axis=1)

    wandb.log({"MSE": df["mse"].mean()})

    evaluation, _ = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)
def train_model_artifact(config):
    """
    Train a stepshift model on the raw viewser data for the configured run type.

    For non-sweep runs the fitted model is saved as a timestamped .pkl artifact
    and a training log file is written; sweep runs only return the model.

    Args:
        config: Configuration mapping with at least "name", "run_type" and
            "sweep" entries.

    Returns:
        The fitted stepshift model.
    """
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    stepshift_model = stepshift_training(config, run_type, df_viewser)
    if not config["sweep"]:
        # Persist the artifact with a timestamped filename so the latest one
        # can be resolved later from the name alone.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_filename = f"{run_type}_model_{timestamp}.pkl"
        stepshift_model.save(path_artifacts / model_filename)
        date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
        create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp)
    return stepshift_model


def stepshift_training(config, partition_name, dataset):
    """Fit a stepshift model on *dataset* using the partition named *partition_name*."""
    partitioner_dict = get_partitioner_dict(partition_name)
    stepshift_model = get_model(config, partitioner_dict)
    stepshift_model.fit(dataset)
    return stepshift_model


def get_model(config, partitioner_dict):
    """
    Build the model instance selected by config["algorithm"].

    "HurdleModel" gets a HurdleModel; any other algorithm name is passed
    through as config["model_reg"] to a plain StepshifterModel.
    """
    if config["algorithm"] == "HurdleModel":
        model = HurdleModel(config, partitioner_dict)
    else:
        config["model_reg"] = config["algorithm"]
        model = StepshifterModel(config, partitioner_dict)

    return model


def get_standardized_df(df, config):
    """
    Standardize a predictions DataFrame based on the run type.

    Keeps the dependent variable plus prediction columns for calibration and
    testing runs, or the step_pred_* columns for forecasting runs, and clamps
    infinities and negative values to 0.

    Args:
        df: Predictions DataFrame (with views_forecasts extensions for
            calibration/testing runs).
        config: Configuration mapping with "run_type", "steps" and "depvar".

    Returns:
        The filtered, non-negative DataFrame.

    Raises:
        ValueError: If config["run_type"] is not one of the known run types.
    """
    run_type = config["run_type"]
    steps = config["steps"]
    depvar = config["depvar"]

    # choose the columns to keep based on the run type and replace negative values with 0
    if run_type in ["calibration", "testing"]:
        cols = [depvar] + df.forecasts.prediction_columns
    elif run_type == "forecasting":
        cols = [f"step_pred_{i}" for i in steps]
    else:
        # Previously an unknown run type fell through and raised an opaque
        # NameError on `cols`; fail fast with a clear message instead.
        raise ValueError(f"Unknown run type: {run_type}")
    df = df.replace([np.inf, -np.inf], 0)[cols]
    df = df.mask(df < 0, 0)
    return df
+ """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleModel": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {"value": args.run_type} + sweep_config["parameters"]["sweep"] = {"value": True} + sweep_config["parameters"]["name"] = {"value": meta_config["name"]} + sweep_config["parameters"]["depvar"] = {"value": meta_config["depvar"]} + sweep_config["parameters"]["algorithm"] = {"value": meta_config["algorithm"]} + if meta_config["algorithm"] == "HurdleModel": + sweep_config["parameters"]["model_clf"] = {"value": meta_config["model_clf"]} + sweep_config["parameters"]["model_reg"] = {"value": meta_config["model_reg"]} diff --git a/models/yellow_submarine/src/visualization/.gitkeep b/models/yellow_submarine/src/visualization/.gitkeep new file mode 100644 index 00000000..e69de29b