Merge pull request #22 from mitre/fix-bugs

dchud · web-flow · commit 0469964c293f · 2021-05-28T09:50:38.000-04:00
Fix various bugs throughout
diff --git a/R/adult_clean.R b/R/adult_clean.R
@@ -377,7 +377,8 @@ cleanadult <- function(df, weight_cap = Inf){
       # if it's a repeated value, we want to get rid of it as well
       rv_impl_ids <- as.character(
         w_subj_df$id[w_subj_df$meas_m %in% inc_df$meas_m[criteria &
-                                                           inc_df$is_first_rv]]
+                                                           inc_df$is_first_rv] &
+                       w_subj_df$is_rv]
       )
 
       # update and remove
@@ -414,7 +415,8 @@ cleanadult <- function(df, weight_cap = Inf){
       # if it's a repeated value, we want to get rid of it as well
       rv_impl_ids <- as.character(
         w_subj_df$id[w_subj_df$meas_m %in% inc_df$meas_m[criteria &
-                                                           inc_df$is_first_rv]]
+                                                           inc_df$is_first_rv] &
+                       w_subj_df$is_rv]
       )
 
       # update and remove
@@ -451,7 +453,8 @@ cleanadult <- function(df, weight_cap = Inf){
       # if it's a repeated value, we want to get rid of it as well
       rv_impl_ids <- as.character(
         w_subj_df$id[w_subj_df$meas_m %in% inc_df$meas_m[criteria &
-                                                           inc_df$is_first_rv]]
+                                                           inc_df$is_first_rv] &
+                       w_subj_df$is_rv]
       )
 
       # update and remove
@@ -576,25 +579,22 @@ cleanadult <- function(df, weight_cap = Inf){
 
         # implausible ids from the step
         impl_ids <- as.character(comb_df$id.w)[criteria]
-        # if it's a repeated value, we want to get rid of it as well
-        rv_impl_ids <- as.character(
-          w_subj_df$id[w_subj_df$meas_m %in% comb_df$meas_m.w[criteria &
-                                                                comb_df$is_first_rv]]
-        )
+        # do not remove repeated values
 
         # update and remove -- weight
         w_subj_keep[impl_ids] <- step
-        w_subj_keep[rv_impl_ids] <- paste0(step, "-RV")
 
         # don't get rid of extraneous just yet -- shouldn't be in
-        w_subj_df <- w_subj_df[!w_subj_df$id %in% c(impl_ids, rv_impl_ids),]
+        w_subj_df <- w_subj_df[!w_subj_df$id %in% c(impl_ids) |
+                                 !w_subj_df$id %in% comb_df$id.w,]
 
         # update and remove -- height
         h_subj_keep[as.character(comb_df$id.h)][criteria] <- step
 
         # don't get rid of extraneous just yet
         h_subj_df <- h_subj_df[h_subj_df$id %in% comb_df$id.h[!criteria] |
-                                 h_subj_df$extraneous,]
+                                 h_subj_df$extraneous |
+                                 !h_subj_df$id %in% comb_df$id.h,]
 
         # reevaluate temp same day -- don't need to reevaluate if nothing has
         # changed
@@ -707,15 +707,16 @@ cleanadult <- function(df, weight_cap = Inf){
       }
 
       # if dup ratio is too high, or any adjacent same days, we exclude all
+      # same day extraneous
       criteria <-
         if ((dup_ratio > .25) | adjacent){
-          rep(T, nrow(h_subj_df))
+          !is.na(h_subj_df$diff)
         } else {
           rep(F, nrow(h_subj_df))
         }
 
       # if criteria didn't catch it, we now compare with medians
-      if (!all(criteria) & any(h_subj_df$extraneous)){
+      if (!any(criteria) & any(h_subj_df$extraneous)){
         med <- median(h_subj_df$meas_m[
           !h_subj_df$age_days %in% dup_days
         ])
@@ -800,7 +801,7 @@ cleanadult <- function(df, weight_cap = Inf){
 
       # check if pairs outside two inch range
       # imperial will also be unique
-      exc_2d <- abs(ht_1_imp - ht_2_imp) > 2
+      exc_2d <- round(abs(ht_1_imp - ht_2_imp), 2) > 2
 
       # only if outside the range
       if (exc_2d){
@@ -947,14 +948,13 @@ cleanadult <- function(df, weight_cap = Inf){
           })
 
           # check g2 v g1 -- true indicates use the original exclusions
-          # TODO: CHECK
           g2_g1_check <-
             if (!is.na(mean_ht[2])){
               (mean_ht[2] - mean_ht[1]) < 0 &
                 ((min_age[2] < 50 &
-                    (mean_ht[2] - mean_ht[1]) > ((-5 * 2.54) +.001)) |
+                    (mean_ht[2] - mean_ht[1]) < ((-5 * 2.54) +.001)) |
                    (min_age[2] >= 50 &
-                      (mean_ht[2] - mean_ht[1]) > ((-7 * 2.54) +.001)))
+                      (mean_ht[2] - mean_ht[1]) < ((-7 * 2.54) +.001)))
             } else {
               F
             }
@@ -1140,10 +1140,12 @@ cleanadult <- function(df, weight_cap = Inf){
       rv_impl_ids <- as.character(
         w_subj_df$id[w_subj_df$meas_m %in%
                        inc_df_first$meas_m[criteria_first &
-                                             inc_df_first$is_first_rv]],
+                                             inc_df_first$is_first_rv] &
+                       w_subj_df$is_rv],
         w_subj_df$id[w_subj_df$meas_m %in%
                        inc_df_rv$meas_m[criteria_rv &
-                                          inc_df_rv$is_first_rv]]
+                                          inc_df_rv$is_first_rv] &
+                       w_subj_df$is_rv]
       )
 
       # update and remove
@@ -1262,15 +1264,16 @@ cleanadult <- function(df, weight_cap = Inf){
       }
 
       # if dup ratio is too high, or any adjacent same days, we exclude all
+      # same day extraneous
       criteria <-
         if ((dup_ratio > .25) | adjacent){
-          rep(T, nrow(w_subj_df))
+          !is.na(w_subj_df$diff)
         } else {
           rep(F, nrow(w_subj_df))
         }
 
       # if criteria didn't catch it, we now compare with medians
-      if (!all(criteria) & any(w_subj_df$extraneous)){
+      if (!any(criteria) & any(w_subj_df$extraneous)){
         # calculate ewma
         # calculate ewma (using metric)
         ewma_res <- ewma_dn(w_subj_df$age_days, w_subj_df$meas_m,
@@ -1644,14 +1647,18 @@ cleanadult <- function(df, weight_cap = Inf){
         # within the limits AND it's 1D, we exclude
         h_bmi_out <-
           all(!check_between(comb_df$bmi, 16, 60)) &
+          !check_between(comb_df$meas_m.h, 139, 206) &
           length(unique(h_subj_df$meas_m)) == 1
         w_bmi_out <-
           all(!check_between(comb_df$bmi, 16, 60)) &
+          !check_between(comb_df$meas_m.w, 40, 225) &
           length(unique(w_subj_df$meas_m)) == 1
 
+        # if any are true for the above or below, remove all for that parameter
+
         # remove based on above criteria
-        rem_ids_ht <- comb_df$id.h[h_exc_btw | rep(h_bmi_out, nrow(comb_df))]
-        rem_ids_wt <- comb_df$id.w[w_exc_btw | rep(w_bmi_out, nrow(comb_df))]
+        rem_ids_ht <- h_subj_df$id[any(h_exc_btw | h_bmi_out)]
+        rem_ids_wt <- w_subj_df$id[any(w_exc_btw | w_bmi_out)]
 
         # update and remove
         h_subj_keep[rem_ids_ht] <- step
@@ -1673,7 +1680,8 @@ cleanadult <- function(df, weight_cap = Inf){
         comb_df <- data.table()
       }
       # no bmis available -- no matches
-      if (nrow(comb_df) == 0){
+      if (nrow(comb_df) == 0) {
+        # no bmis available
         if (nrow(h_subj_df) > 0){
           exc_ht <-
             !check_between(h_subj_df$meas_m, 139, 206) &
diff --git a/R/adult_support.R b/R/adult_support.R
@@ -50,7 +50,7 @@ as.matrix.delta_dn <- function(agedays) {
 #'
 #' @param agedays Vector of age in days for each z score (potentially transformed to adjust weighting).
 #'
-#' @param z Input vector of numeric MEASUREMENT data.
+#' @param meas Input vector of numeric MEASUREMENT data.
 #'
 #' @param ewma.exp Exponent to use for weighting.
 #'
@@ -65,21 +65,21 @@ as.matrix.delta_dn <- function(agedays) {
 #'   and the subsequent observation.
 #' @keywords internal
 #' @noRd
-ewma_dn <- function(agedays, z, ewma.exp = 5, ewma.adjacent = T) {
+ewma_dn <- function(agedays, meas, ewma.exp = -5, ewma.adjacent = T) {
   # 6.  EWMA calculation description: Most of the next steps will involve calculating the exponentially weighted moving average for each subject and parameter. I will
   #     describe how to calculate EWMASDs, and will describe how it needs to be varied in subsequent steps.
   # a.	The overall goal of the EWMASD calculation is to identify the difference between the SD-score and what we might predict that DS-score should be, in order to
   #     determine whether it should be excluded.
   # b.	Only nonmissing SD-scores for a parameter that are not designated for exclusion are included in the following calculations.
-  # c.	For each SD-score SDi and associated agedaysi calculate the following for every other z-score (SDj...SDn) and associated agedays (agedaysj...agedaysn)  for the
+  # c.	For each SD-score SDi and associated agedaysi calculate the following for every other measurement (SDj...SDn) and associated agedays (agedaysj...agedaysn)  for the
   #     same subject and parameter
   #   i.	(delta)Agej=agedaysj-agedaysi
   #   ii.	EWMAZ=SDi=[(sigma)j->n(SDj*((5+(delta)Agej)^-1.5))]/[ (sigma)j->n((5+(delta)Agej)^-1.5)]
   #   iii.	For most EWMASD calculations, there are 3 EWMASDs that need to be calculated. I will note if not all of these need to be done for a given step.
   #     1.	EWMASDall calculated as above
   #     2.	EWMAZbef calculated excluding the SD-score just before the SD-score of interest (sorted by agedays). For the first observation for a parameter for a
   #         subject, this should be identical to EWMASDall rather than missing.
-  #     3.	EWMAZaft calculated excluding the z-score just after the SD-score of interest (sorted by agedays). For the lastobservation for a parameter for a subject,
+  #     3.	EWMAZaft calculated excluding the measurement just after the SD-score of interest (sorted by agedays). For the last observation for a parameter for a subject,
   #         this should be identical to EWMASDall rather than missing.
   #   iv.	For each of the three EWMASDs, calculate the dewma_*=SD-EWMASD
   # d.	EWMASDs and (delta)EWMASDs will change if a value is excluded or manipulated using one of the methods below, therefore EWMASDs and (delta)EWMASDs be recalculated for each
@@ -102,16 +102,16 @@ ewma_dn <- function(agedays, z, ewma.exp = 5, ewma.adjacent = T) {
     delta <- ifelse(delta == 0, 0, (delta) ^ ewma.exp)
 
     # calculate EWMAs, and return in order of original data
-    ewma.all[index] <- delta %*% z / apply(delta, 1, sum)
+    ewma.all[index] <- delta %*% meas / apply(delta, 1, sum)
 
     if (ewma.adjacent) {
       if (n > 2) {
         delta2 = delta
         delta2[col(delta2) == row(delta2) - 1] = 0
-        ewma.before[index] = delta2 %*% z / apply(delta2, 1, sum)
+        ewma.before[index] = delta2 %*% meas / apply(delta2, 1, sum)
         delta3 = delta
         delta3[col(delta3) == row(delta3) + 1] = 0
-        ewma.after[index] = delta3 %*% z / apply(delta3, 1, sum)
+        ewma.after[index] = delta3 %*% meas / apply(delta3, 1, sum)
       } else {
         ewma.before <- ewma.after <- ewma.all
       }
@@ -237,7 +237,6 @@ temp_sde <- function(subj_df, ptype = "height"){
       if (sum(!as.character(subj_df$age_days) %in% dup_days) > 0){
         # get the median without duplicate days
         median(subj_df$measurement[
-          !as.character(subj_df$age_days) %in% dup_days &
             if (ptype == "weight"){
               !subj_df$is_rv
             } else {
@@ -307,8 +306,8 @@ redo_identify_rv <- function(w_subj_df){
 #' @noRd
 rem_hundreds <- function(inc_df, dewma, meas_col, hundreds, ptype = "weight"){
   # calculate difference between values -- ENDS ARE PROTECTED ON EITHER SIDE
-  inc_df$diff_prev <- c(NA, diff(inc_df[,..meas_col]))
-  inc_df$diff_next <- c(diff(inc_df[,..meas_col]), NA)
+  inc_df$diff_prev <- c(NA, diff(unlist(inc_df[,..meas_col])))
+  inc_df$diff_next <- c(diff(unlist(inc_df[,..meas_col])), NA)
 
   # state upper and lower limits (hundreds +/- 2)
   # modifier for height vs weight
@@ -384,13 +383,13 @@ rem_hundreds <- function(inc_df, dewma, meas_col, hundreds, ptype = "weight"){
 #' @noRd
 rem_unit_errors <- function(inc_df, ptype = "height"){
   # add "unit error": metric encoded as imperial
-  inc_df$ue <- inc_df$meas_m * (if (ptype == "height"){ 2.54 } else {1/2.2046226})
+  inc_df$ue <- inc_df$meas_m * (if (ptype == "height"){ 2.54 } else {2.2046226})
 
   # calculate ewma (using metric)
   ewma_res <- ewma_dn(inc_df$age_days, inc_df$meas_m)
   dewma <- (inc_df$meas_m- ewma_res)
   # delta ewma with unit error
-  absdewma_ue <- abs(ewma_res-inc_df$ue)
+  absdewma_ue <- abs(inc_df$ue - ewma_res)
   colnames(dewma) <- colnames(absdewma_ue) <-
     paste0("d",colnames(ewma_res))
 
@@ -506,10 +505,11 @@ rem_transpositions <- function(inc_df, ptype = "height"){
     inc_df$transpo <- switch_tens_ones(
       unlist(inc_df[, paste0("meas_", mtype), with = F])
       )
+
     # if imperial, we want to convert to metric
     if (mtype == "im"){
-      inc_df$transpo <- inc_df$transpo *
-        (if (ptype == "height"){ 2.54 } else {1/2.2046226})
+      inc_df$transpo <- inc_df$transpo /
+        (if (ptype == "height"){ 2.54 } else {2.2046226})
     }
 
     inc_df$ones <- get_num_places(
@@ -518,6 +518,7 @@ rem_transpositions <- function(inc_df, ptype = "height"){
     inc_df$tens <- get_num_places(
       unlist(inc_df[, paste0("meas_", mtype), with = F]), "tens"
     )
+
     absdewma_transpo <- abs(inc_df$transpo - ewma_res)
     colnames(absdewma_transpo) <- paste0("d",colnames(ewma_res))
 
@@ -654,7 +655,7 @@ ht_3d_growth_compare <- function(mean_ht, min_age, glist,
 #' Function to remove data based on exponentially-weighted moving average
 #' (Daymont, et al.) for WEIGHT. Cutoff defaults adjusted for adults.
 #' inputs:
-#' subj_df: subject data frame, which has age in days and z-score
+#' subj_df: subject data frame, which has age in days and measurement
 #' ewma_cutoff: EWMA past which considered invalid (center value). left and right
 #'   are .5 less.
 #' outputs:
diff --git a/R/growth.R b/R/growth.R
@@ -161,6 +161,7 @@ cleangrowth <- function(subjid,
     param,
     agedays = as.integer(agedays),
     v = ifelse(measurement == 0, NaN, measurement),
+    v_adult = measurement,
     sex = as.integer(ifelse(
       sex %in% c(0, 'm', 'M'), 0, ifelse(sex %in% c(1, 'f', 'F'), 1, NA)
     ))
@@ -609,7 +610,7 @@ cleangrowth <- function(subjid,
     # add age in years
     data.adult[, age_years := agedays/365.25]
     # rename for ease of use
-    data.adult[, measurement := v]
+    data.adult[, measurement := v_adult]
     data.adult[, id := line]
 
     if (!quietly)