doc_retention_utils.py
import numpy as np
from lmfit import Model
from scipy.stats import chi2
from sklearn.metrics import mean_absolute_error, r2_score


def transmission_k_constant(tau, k=0.5):
    """Model assuming exponential decay with constant k."""
    return np.exp(-k * tau)


def transmission_k_as_func_of_tau(tau, p=0.5, m=-0.5):
    """Model assuming exponential decay with k = f(tau)."""
    return np.exp(-p * (tau ** (m + 1)))


def transmission_sigma_constant(tau, sigma=0.5):
    """Vollenweider model with constant sigma."""
    return 1 / (1 + (sigma * tau))


def transmission_sigma_as_func_of_tau(tau, k=0.5, m=-0.5):
    """Vollenweider model with sigma = f(tau)."""
    return 1 / (1 + (k * (tau ** (1 + m))))
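

# Illustrative sketch, not part of the original module: evaluate the four
# transmission models above on a common grid of residence times so their
# shapes can be compared. The tau range and parameter values are assumptions
# chosen for demonstration only.
def _example_compare_transmission_models():
    tau = np.linspace(0.1, 10, 50)  # hypothetical residence times
    return {
        "k constant": transmission_k_constant(tau, k=0.5),
        "k = f(tau)": transmission_k_as_func_of_tau(tau, p=0.5, m=-0.5),
        "sigma constant": transmission_sigma_constant(tau, sigma=0.5),
        "sigma = f(tau)": transmission_sigma_as_func_of_tau(tau, k=0.5, m=-0.5),
    }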


def fit_model(func, df, tau_col, trans_col):
    """Convenience function for fitting models and printing summary statistics.

    Args:
        func: Function to be fit.
        df: Dataframe. Data to fit.
        tau_col: Str. Name of residence time column in 'df'.
        trans_col: Str. Name of transmission column in 'df'.

    Returns:
        Model result object from lmfit. Summary stats are printed.
    """
    model = Model(func, independent_vars=["tau"])
    fit = model.fit(df[trans_col], tau=df[tau_col])
    print(fit.fit_report())

    # R2 from the fit residuals relative to the variance of the observations
    r2 = 1 - fit.residual.var() / np.var(df[trans_col])
    print(f"R2: {r2:.2f}")

    return fit
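

# Illustrative sketch, not part of the original module: fit the constant-k
# model with fit_model() on a small synthetic dataset. The DataFrame and its
# column names ("tau", "transmission") are hypothetical; pandas is assumed to
# be available since the utilities operate on DataFrames.
def _example_fit_constant_k():
    import pandas as pd

    rng = np.random.default_rng(42)
    tau = np.linspace(0.5, 10, 30)
    trans = np.exp(-0.4 * tau) + rng.normal(0, 0.02, tau.size)
    df = pd.DataFrame({"tau": tau, "transmission": trans})

    # Initial parameter values are taken from the model function's defaults
    return fit_model(transmission_k_constant, df, "tau", "transmission")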


def likelihood_ratio_test(model1, model2):
    """Perform a likelihood ratio test between two nested models.

    Calculates the likelihood ratio test statistic and p-value to compare
    the goodness of fit of two nested models, then prints the likelihood
    ratio, degrees of freedom and p-value.

    Parameters:
        model1 (lmfit.model.ModelResult): The first (simpler) model.
        model2 (lmfit.model.ModelResult): The second (more complex) model.

    Returns:
        None
    """
    chi_square1 = model1.chisqr
    chi_square2 = model2.chisqr

    # Log-likelihoods (up to an additive constant) from the chi-squared values
    log_likelihood1 = -0.5 * chi_square1
    log_likelihood2 = -0.5 * chi_square2

    # Test statistic
    LR = -2 * (log_likelihood1 - log_likelihood2)

    # Degrees of freedom: difference in the number of fitted parameters
    deg_freedom = model2.nvarys - model1.nvarys

    # p-value from the chi-squared distribution
    p_value = chi2.sf(LR, deg_freedom)

    print(f"Likelihood Ratio: {LR}")
    print(f"Degrees of Freedom: {deg_freedom}")
    print(f"P-value: {round(p_value, 3)}")


def calculate_mae_and_mbd(model, data, x_col, y_col):
    """Calculate mean absolute error (MAE) and mean bias deviation (MBD)
    for a fitted lmfit model evaluated against observed data.

    Returns:
        tuple: (mae, mbd).
    """
    # Extract the observed and predicted values
    observed = data[y_col].values
    predicted = model.eval(params=model.params, **{x_col: data[x_col].values})

    # Ensure the lengths of observed and predicted are the same
    if len(observed) != len(predicted):
        raise ValueError(f"Length mismatch: observed has {len(observed)} elements, predicted has {len(predicted)} elements")

    # Errors, mean absolute error (MAE) and mean bias deviation (MBD)
    errors = observed - predicted
    mae = np.mean(np.abs(errors))
    mbd = np.mean(errors)

    return mae, mbd


def rs_mae(s1_obs, s2_pred, model_name):
    """Print R2 and MAE for observed vs. predicted values."""
    r2 = r2_score(s1_obs, s2_pred)
    mae = mean_absolute_error(s1_obs, s2_pred)
    print(f"{model_name}:")
    print("R2:", np.round(r2, 2), "MAE:", np.round(mae, 2))


def calculate_bic(obs_s, pred_s, num_params):
    """Calculate the Bayesian Information Criterion (BIC).

    Parameters:
        obs_s (array-like): Observed values.
        pred_s (array-like): Predicted values.
        num_params (int): Number of parameters in the model.

    Returns:
        float: BIC value.
    """
    n = len(obs_s)
    residual_sum_of_squares = np.sum((obs_s - pred_s) ** 2)

    # BIC based on the residual sum of squares (Gaussian errors assumed)
    bic = n * np.log(residual_sum_of_squares / n) + num_params * np.log(n)

    return bic
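

# Illustrative end-to-end sketch, not part of the original module: fit a model,
# then score it with calculate_mae_and_mbd(), rs_mae() and calculate_bic().
# Column names, parameter values and the synthetic data are assumptions for
# demonstration only.
if __name__ == "__main__":
    import pandas as pd

    rng = np.random.default_rng(0)
    tau = np.linspace(0.5, 10, 30)
    trans = np.exp(-0.4 * tau) + rng.normal(0, 0.02, tau.size)
    df = pd.DataFrame({"tau": tau, "transmission": trans})

    fit = fit_model(transmission_k_constant, df, "tau", "transmission")

    mae, mbd = calculate_mae_and_mbd(fit, df, "tau", "transmission")
    print(f"MAE: {mae:.3f}, MBD: {mbd:.3f}")

    pred = fit.eval(tau=df["tau"].values)
    rs_mae(df["transmission"], pred, "Constant k")

    # transmission_k_constant has a single fitted parameter (k)
    print(f"BIC: {calculate_bic(df['transmission'].values, pred, num_params=1):.2f}")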