Skip to content

Commit 0469964

Browse files
authored
Merge pull request #22 from mitre/fix-bugs
Fix various bugs throughout
2 parents 4860eb2 + c2c69a5 commit 0469964

File tree

3 files changed

+50
-40
lines changed

3 files changed

+50
-40
lines changed

R/adult_clean.R

+32-24
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,8 @@ cleanadult <- function(df, weight_cap = Inf){
377377
# if it's a repeated value, we want to get rid of it as well
378378
rv_impl_ids <- as.character(
379379
w_subj_df$id[w_subj_df$meas_m %in% inc_df$meas_m[criteria &
380-
inc_df$is_first_rv]]
380+
inc_df$is_first_rv] &
381+
w_subj_df$is_rv]
381382
)
382383

383384
# update and remove
@@ -414,7 +415,8 @@ cleanadult <- function(df, weight_cap = Inf){
414415
# if it's a repeated value, we want to get rid of it as well
415416
rv_impl_ids <- as.character(
416417
w_subj_df$id[w_subj_df$meas_m %in% inc_df$meas_m[criteria &
417-
inc_df$is_first_rv]]
418+
inc_df$is_first_rv] &
419+
w_subj_df$is_rv]
418420
)
419421

420422
# update and remove
@@ -451,7 +453,8 @@ cleanadult <- function(df, weight_cap = Inf){
451453
# if it's a repeated value, we want to get rid of it as well
452454
rv_impl_ids <- as.character(
453455
w_subj_df$id[w_subj_df$meas_m %in% inc_df$meas_m[criteria &
454-
inc_df$is_first_rv]]
456+
inc_df$is_first_rv] &
457+
w_subj_df$is_rv]
455458
)
456459

457460
# update and remove
@@ -576,25 +579,22 @@ cleanadult <- function(df, weight_cap = Inf){
576579

577580
# implausible ids from the step
578581
impl_ids <- as.character(comb_df$id.w)[criteria]
579-
# if it's a repeated value, we want to get rid of it as well
580-
rv_impl_ids <- as.character(
581-
w_subj_df$id[w_subj_df$meas_m %in% comb_df$meas_m.w[criteria &
582-
comb_df$is_first_rv]]
583-
)
582+
# do not remove repeated values
584583

585584
# update and remove -- weight
586585
w_subj_keep[impl_ids] <- step
587-
w_subj_keep[rv_impl_ids] <- paste0(step, "-RV")
588586

589587
# don't get rid of extraneous just yet -- shouldn't be in
590-
w_subj_df <- w_subj_df[!w_subj_df$id %in% c(impl_ids, rv_impl_ids),]
588+
w_subj_df <- w_subj_df[!w_subj_df$id %in% c(impl_ids) |
589+
!w_subj_df$id %in% comb_df$id.w,]
591590

592591
# update and remove -- height
593592
h_subj_keep[as.character(comb_df$id.h)][criteria] <- step
594593

595594
# don't get rid of extraneous just yet
596595
h_subj_df <- h_subj_df[h_subj_df$id %in% comb_df$id.h[!criteria] |
597-
h_subj_df$extraneous,]
596+
h_subj_df$extraneous |
597+
!h_subj_df$id %in% comb_df$id.h,]
598598

599599
# reevaluate temp same day -- don't need to reevaluate if nothing has
600600
# changed
@@ -707,15 +707,16 @@ cleanadult <- function(df, weight_cap = Inf){
707707
}
708708

709709
# if dup ratio is too high, or any adjacent same days, we exclude all
710+
# same day extraneous
710711
criteria <-
711712
if ((dup_ratio > .25) | adjacent){
712-
rep(T, nrow(h_subj_df))
713+
!is.na(h_subj_df$diff)
713714
} else {
714715
rep(F, nrow(h_subj_df))
715716
}
716717

717718
# if criteria didn't catch it, we now compare with medians
718-
if (!all(criteria) & any(h_subj_df$extraneous)){
719+
if (!any(criteria) & any(h_subj_df$extraneous)){
719720
med <- median(h_subj_df$meas_m[
720721
!h_subj_df$age_days %in% dup_days
721722
])
@@ -800,7 +801,7 @@ cleanadult <- function(df, weight_cap = Inf){
800801

801802
# check if pairs outside two inch range
802803
# imperial will also be unique
803-
exc_2d <- abs(ht_1_imp - ht_2_imp) > 2
804+
exc_2d <- round(abs(ht_1_imp - ht_2_imp), 2) > 2
804805

805806
# only if outside the range
806807
if (exc_2d){
@@ -947,14 +948,13 @@ cleanadult <- function(df, weight_cap = Inf){
947948
})
948949

949950
# check g2 v g1 -- true indicates use the original exclusions
950-
# TODO: CHECK
951951
g2_g1_check <-
952952
if (!is.na(mean_ht[2])){
953953
(mean_ht[2] - mean_ht[1]) < 0 &
954954
((min_age[2] < 50 &
955-
(mean_ht[2] - mean_ht[1]) > ((-5 * 2.54) +.001)) |
955+
(mean_ht[2] - mean_ht[1]) < ((-5 * 2.54) +.001)) |
956956
(min_age[2] >= 50 &
957-
(mean_ht[2] - mean_ht[1]) > ((-7 * 2.54) +.001)))
957+
(mean_ht[2] - mean_ht[1]) < ((-7 * 2.54) +.001)))
958958
} else {
959959
F
960960
}
@@ -1140,10 +1140,12 @@ cleanadult <- function(df, weight_cap = Inf){
11401140
rv_impl_ids <- as.character(
11411141
w_subj_df$id[w_subj_df$meas_m %in%
11421142
inc_df_first$meas_m[criteria_first &
1143-
inc_df_first$is_first_rv]],
1143+
inc_df_first$is_first_rv] &
1144+
w_subj_df$is_rv],
11441145
w_subj_df$id[w_subj_df$meas_m %in%
11451146
inc_df_rv$meas_m[criteria_rv &
1146-
inc_df_rv$is_first_rv]]
1147+
inc_df_rv$is_first_rv] &
1148+
w_subj_df$is_rv]
11471149
)
11481150

11491151
# update and remove
@@ -1262,15 +1264,16 @@ cleanadult <- function(df, weight_cap = Inf){
12621264
}
12631265

12641266
# if dup ratio is too high, or any adjacent same days, we exclude all
1267+
# same day extraneous
12651268
criteria <-
12661269
if ((dup_ratio > .25) | adjacent){
1267-
rep(T, nrow(w_subj_df))
1270+
!is.na(w_subj_df$diff)
12681271
} else {
12691272
rep(F, nrow(w_subj_df))
12701273
}
12711274

12721275
# if criteria didn't catch it, we now compare with medians
1273-
if (!all(criteria) & any(w_subj_df$extraneous)){
1276+
if (!any(criteria) & any(w_subj_df$extraneous)){
12741277
# calculate ewma
12751278
# calculate ewma (using metric)
12761279
ewma_res <- ewma_dn(w_subj_df$age_days, w_subj_df$meas_m,
@@ -1644,14 +1647,18 @@ cleanadult <- function(df, weight_cap = Inf){
16441647
# within the limits AND it's 1D, we exclude
16451648
h_bmi_out <-
16461649
all(!check_between(comb_df$bmi, 16, 60)) &
1650+
!check_between(comb_df$meas_m.h, 139, 206) &
16471651
length(unique(h_subj_df$meas_m)) == 1
16481652
w_bmi_out <-
16491653
all(!check_between(comb_df$bmi, 16, 60)) &
1654+
!check_between(comb_df$meas_m.w, 40, 225) &
16501655
length(unique(w_subj_df$meas_m)) == 1
16511656

1657+
# if any are true for the above or below, remove all for that parameter
1658+
16521659
# remove based on above criteria
1653-
rem_ids_ht <- comb_df$id.h[h_exc_btw | rep(h_bmi_out, nrow(comb_df))]
1654-
rem_ids_wt <- comb_df$id.w[w_exc_btw | rep(w_bmi_out, nrow(comb_df))]
1660+
rem_ids_ht <- h_subj_df$id[any(h_exc_btw | h_bmi_out)]
1661+
rem_ids_wt <- w_subj_df$id[any(w_exc_btw | w_bmi_out)]
16551662

16561663
# update and remove
16571664
h_subj_keep[rem_ids_ht] <- step
@@ -1673,7 +1680,8 @@ cleanadult <- function(df, weight_cap = Inf){
16731680
comb_df <- data.table()
16741681
}
16751682
# no bmis available -- no matches
1676-
if (nrow(comb_df) == 0){
1683+
if (nrow(comb_df) == 0) {
1684+
# no bmis available
16771685
if (nrow(h_subj_df) > 0){
16781686
exc_ht <-
16791687
!check_between(h_subj_df$meas_m, 139, 206) &

R/adult_support.R

+16-15
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ as.matrix.delta_dn <- function(agedays) {
5050
#'
5151
#' @param agedays Vector of age in days for each z score (potentially transformed to adjust weighting).
5252
#'
53-
#' @param z Input vector of numeric MEASUREMENT data.
53+
#' @param meas Input vector of numeric MEASUREMENT data.
5454
#'
5555
#' @param ewma.exp Exponent to use for weighting.
5656
#'
@@ -65,21 +65,21 @@ as.matrix.delta_dn <- function(agedays) {
6565
#' and the subsequent observation.
6666
#' @keywords internal
6767
#' @noRd
68-
ewma_dn <- function(agedays, z, ewma.exp = 5, ewma.adjacent = T) {
68+
ewma_dn <- function(agedays, meas, ewma.exp = -5, ewma.adjacent = T) {
6969
# 6. EWMA calculation description: Most of the next steps will involve calculating the exponentially weighted moving average for each subject and parameter. I will
7070
# describe how to calculate EWMASDs, and will describe how it needs to be varied in subsequent steps.
7171
# a. The overall goal of the EWMASD calculation is to identify the difference between the SD-score and what we might predict that DS-score should be, in order to
7272
# determine whether it should be excluded.
7373
# b. Only nonmissing SD-scores for a parameter that are not designated for exclusion are included in the following calculations.
74-
# c. For each SD-score SDi and associated agedaysi calculate the following for every other z-score (SDj...SDn) and associated agedays (agedaysj...agedaysn) for the
74+
# c. For each SD-score SDi and associated agedaysi calculate the following for every other measurement (SDj...SDn) and associated agedays (agedaysj...agedaysn) for the
7575
# same subject and parameter
7676
# i. (delta)Agej=agedaysj-agedaysi
7777
# ii. EWMAZ=SDi=[(sigma)j->n(SDj*((5+(delta)Agej)^-1.5))]/[ (sigma)j->n((5+(delta)Agej)^-1.5)]
7878
# iii. For most EWMASD calculations, there are 3 EWMASDs that need to be calculated. I will note if not all of these need to be done for a given step.
7979
# 1. EWMASDall calculated as above
8080
# 2. EWMAZbef calculated excluding the SD-score just before the SD-score of interest (sorted by agedays). For the first observation for a parameter for a
8181
# subject, this should be identical to EWMASDall rather than missing.
82-
# 3. EWMAZaft calculated excluding the z-score just after the SD-score of interest (sorted by agedays). For the lastobservation for a parameter for a subject,
82+
# 3. EWMAZaft calculated excluding the measurement just after the SD-score of interest (sorted by agedays). For the lastobservation for a parameter for a subject,
8383
# this should be identical to EWMASDall rather than missing.
8484
# iv. For each of the three EWMASDs, calculate the dewma_*=SD-EWMASD
8585
# d. EWMASDs and (delta)EWMASDs will change if a value is excluded or manipulated using one of the methods below, therefore EWMASDs and (delta)EWMASDs be recalculated for each
@@ -102,16 +102,16 @@ ewma_dn <- function(agedays, z, ewma.exp = 5, ewma.adjacent = T) {
102102
delta <- ifelse(delta == 0, 0, (delta) ^ ewma.exp)
103103

104104
# calculate EWMAs, and return in order of original data
105-
ewma.all[index] <- delta %*% z / apply(delta, 1, sum)
105+
ewma.all[index] <- delta %*% meas / apply(delta, 1, sum)
106106

107107
if (ewma.adjacent) {
108108
if (n > 2) {
109109
delta2 = delta
110110
delta2[col(delta2) == row(delta2) - 1] = 0
111-
ewma.before[index] = delta2 %*% z / apply(delta2, 1, sum)
111+
ewma.before[index] = delta2 %*% meas / apply(delta2, 1, sum)
112112
delta3 = delta
113113
delta3[col(delta3) == row(delta3) + 1] = 0
114-
ewma.after[index] = delta3 %*% z / apply(delta3, 1, sum)
114+
ewma.after[index] = delta3 %*% meas / apply(delta3, 1, sum)
115115
} else {
116116
ewma.before <- ewma.after <- ewma.all
117117
}
@@ -237,7 +237,6 @@ temp_sde <- function(subj_df, ptype = "height"){
237237
if (sum(!as.character(subj_df$age_days) %in% dup_days) > 0){
238238
# get the median without duplicate days
239239
median(subj_df$measurement[
240-
!as.character(subj_df$age_days) %in% dup_days &
241240
if (ptype == "weight"){
242241
!subj_df$is_rv
243242
} else {
@@ -307,8 +306,8 @@ redo_identify_rv <- function(w_subj_df){
307306
#' @noRd
308307
rem_hundreds <- function(inc_df, dewma, meas_col, hundreds, ptype = "weight"){
309308
# calculate difference between values -- ENDS ARE PROTECTED ON EITHER SIDE
310-
inc_df$diff_prev <- c(NA, diff(inc_df[,..meas_col]))
311-
inc_df$diff_next <- c(diff(inc_df[,..meas_col]), NA)
309+
inc_df$diff_prev <- c(NA, diff(unlist(inc_df[,..meas_col])))
310+
inc_df$diff_next <- c(diff(unlist(inc_df[,..meas_col])), NA)
312311

313312
# state upper and lower limits (hundreds +/- 2)
314313
# modifier for height vs weight
@@ -384,13 +383,13 @@ rem_hundreds <- function(inc_df, dewma, meas_col, hundreds, ptype = "weight"){
384383
#' @noRd
385384
rem_unit_errors <- function(inc_df, ptype = "height"){
386385
# add "unit error": metric encoded as imperial
387-
inc_df$ue <- inc_df$meas_m * (if (ptype == "height"){ 2.54 } else {1/2.2046226})
386+
inc_df$ue <- inc_df$meas_m * (if (ptype == "height"){ 2.54 } else {2.2046226})
388387

389388
# calculate ewma (using metric)
390389
ewma_res <- ewma_dn(inc_df$age_days, inc_df$meas_m)
391390
dewma <- (inc_df$meas_m- ewma_res)
392391
# delta ewma with unit error
393-
absdewma_ue <- abs(ewma_res-inc_df$ue)
392+
absdewma_ue <- abs(inc_df$ue - ewma_res)
394393
colnames(dewma) <- colnames(absdewma_ue) <-
395394
paste0("d",colnames(ewma_res))
396395

@@ -506,10 +505,11 @@ rem_transpositions <- function(inc_df, ptype = "height"){
506505
inc_df$transpo <- switch_tens_ones(
507506
unlist(inc_df[, paste0("meas_", mtype), with = F])
508507
)
508+
509509
# if imperial, we want to convert to metric
510510
if (mtype == "im"){
511-
inc_df$transpo <- inc_df$transpo *
512-
(if (ptype == "height"){ 2.54 } else {1/2.2046226})
511+
inc_df$transpo <- inc_df$transpo /
512+
(if (ptype == "height"){ 2.54 } else {2.2046226})
513513
}
514514

515515
inc_df$ones <- get_num_places(
@@ -518,6 +518,7 @@ rem_transpositions <- function(inc_df, ptype = "height"){
518518
inc_df$tens <- get_num_places(
519519
unlist(inc_df[, paste0("meas_", mtype), with = F]), "tens"
520520
)
521+
521522
absdewma_transpo <- abs(inc_df$transpo - ewma_res)
522523
colnames(absdewma_transpo) <- paste0("d",colnames(ewma_res))
523524

@@ -654,7 +655,7 @@ ht_3d_growth_compare <- function(mean_ht, min_age, glist,
654655
#' Function to remove data based on exponentially-weighted moving average
655656
#' (Daymont, et al.) for WEIGHT. Cutoff defaults adjusted for adults.
656657
#' inputs:
657-
#' subj_df: subject data frame, which has age in days and z-score
658+
#' subj_df: subject data frame, which has age in days and measurement
658659
#' ewma_cutoff: EWMA past which considered invalid (center value). left and right
659660
#' are .5 less.
660661
#' outputs:

R/growth.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ cleangrowth <- function(subjid,
161161
param,
162162
agedays = as.integer(agedays),
163163
v = ifelse(measurement == 0, NaN, measurement),
164+
v_adult = measurement,
164165
sex = as.integer(ifelse(
165166
sex %in% c(0, 'm', 'M'), 0, ifelse(sex %in% c(1, 'f', 'F'), 1, NA)
166167
))
@@ -609,7 +610,7 @@ cleangrowth <- function(subjid,
609610
# add age in years
610611
data.adult[, age_years := agedays/365.25]
611612
# rename for ease of use
612-
data.adult[, measurement := v]
613+
data.adult[, measurement := v_adult]
613614
data.adult[, id := line]
614615

615616
if (!quietly)

0 commit comments

Comments
 (0)