From e9e56c4c9ce3c2ad3e2175ff27d53b6d7b19fff0 Mon Sep 17 00:00:00 2001
From: enryh
Date: Fri, 27 Dec 2024 14:29:55 +0100
Subject: [PATCH] :sparkles: test if adding packages one by one works

- should also fail faster as installations are done first
- add msImpute again
---
 project/01_0_install_R_packages.R          | 176 +++++++++++++++++
 project/01_0_install_R_packages.ipynb      | 219 +++++++++++++++++++++
 project/config/alzheimer_study/config.yaml |   4 +-
 project/workflow/Snakefile_v2.smk          |  17 +-
 4 files changed, 413 insertions(+), 3 deletions(-)
 create mode 100644 project/01_0_install_R_packages.R
 create mode 100644 project/01_0_install_R_packages.ipynb

diff --git a/project/01_0_install_R_packages.R b/project/01_0_install_R_packages.R
new file mode 100644
index 000000000..e2517e894
--- /dev/null
+++ b/project/01_0_install_R_packages.R
@@ -0,0 +1,176 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,R:light
+#     text_representation:
+#       extension: .R
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: R
+#     language: R
+#     name: ir
+# ---
+
+# + tags=["parameters"] vscode={"languageId": "r"}
+methods = 'KNN_IMPUTE,msImpute'
+
+# + tags=["hide-input"] vscode={"languageId": "r"}
+# options("install.lock"=FALSE)
+
+packages_base_R <-
+  c("BiocManager", "reshape2", "data.table", "readr", "tibble")
+
+install_rpackage <- function(pkg) {
+  # Try to load pkg; if that fails, install it with install.packages() and load it
+  if (!require(pkg, character.only = TRUE)) {
+    install.packages(pkg)
+    library(pkg, character.only = TRUE)
+  }
+
+}
+
+# Same as install_rpackage, but installs through BiocManager::install()
+install_bioconductor <- function(pkg) {
+  # Try to load pkg; if that fails, install it via BiocManager and load it
+  if (!require(pkg, character.only = TRUE)) {
+    BiocManager::install(pkg)
+    library(pkg, character.only = TRUE)
+  }
+
+}
+
+# Install (or source) what the given NAGuideR method needs; name is case-insensitive
+nafunctions <- function(method = "zero") {
+  method <- tolower(method)
+  if (method == "zero") {
+  }
+  else if (method == "minimum") {
+  }
+  else if (method == "colmedian") {
+    install_rpackage('e1071')
+  }
+  else if (method == "rowmedian") {
+    install_rpackage('e1071')
+  }
+  else if (method == "knn_impute") {
+    install_bioconductor('impute')
+  }
+  else if (method == "seqknn") {
+    if (!require(SeqKnn)) {
+      install.packages("src/R_NAGuideR/SeqKnn_1.0.1.tar.gz",
+                       repos = NULL,
+                       type = "source")
+    }
+  }
+  else if (method == "bpca") {
+    install_bioconductor('pcaMethods')
+  }
+  else if (method == "svdmethod") {
+    install_bioconductor('pcaMethods')
+  }
+  else if (method == "lls") {
+    install_bioconductor('pcaMethods')
+  }
+  else if (method == "mle") {
+    install_rpackage('norm')
+  }
+  else if (method == "qrilc") {
+    install_bioconductor("impute")
+    install_bioconductor("pcaMethods")
+    install_rpackage('gmm')
+    install_rpackage('imputeLCMD')
+  }
+  else if (method == "mindet") {
+    install_bioconductor("impute")
+    install_bioconductor("pcaMethods")
+    install_rpackage('gmm')
+    install_rpackage('imputeLCMD')
+  }
+  else if (method == "minprob") {
+    install_bioconductor("impute")
+    install_bioconductor("pcaMethods")
+    install_rpackage('gmm')
+    install_rpackage('imputeLCMD')
+  }
+  else if (method == "irm") {
+    install_rpackage('VIM')
+  }
+  else if (method == "impseq") {
+    install_rpackage('rrcovNA')
+  }
+  else if (method == "impseqrob") {
+    install_rpackage('rrcovNA')
+  }
+  else if (method == "mice-norm") {
+    install_rpackage('mice')
+  }
+  else if (method == "mice-cart") {
+    install_rpackage('mice')
+  }
+  else if (method == "trknn") {
+    source('src/R_NAGuideR/Imput_funcs.r')
+  }
+  else if (method == "rf") {
+    install_rpackage("missForest")
+  }
+  else if (method == "pi") {
+  }
+  # else if(method=="grr"){
+  #   library(DreamAI)
+  #   df<-impute.RegImpute(data=as.matrix(df1), fillmethod = "row_mean", maxiter_RegImpute = 10,conv_nrmse = 1e-03)
+  # }
+  else if (method == "gms") {
+    # install.packages('GMSimpute')
+    if (!require(GMSimpute)) {
+      install.packages(
+        "src/R_NAGuideR/GMSimpute_0.0.1.1.tar.gz",
+        repos = NULL,
+        type = "source"
+      )
+    }
+  }
+  else if (method == "msimpute") {
+    install_bioconductor("msImpute")
+  }
+  else if (method == "msimpute_mnar") {
+    install_bioconductor("msImpute")
+  }
+  else if (method == "gsimp") {
+    options(stringsAsFactors = FALSE)
+    # dependencies partly for sourced file
+
+    install_bioconductor("impute")
+    install_bioconductor("pcaMethods")
+    install_rpackage('gmm')
+    install_rpackage('imputeLCMD')
+    install_rpackage("magrittr")
+    install_rpackage("glmnet")
+    install_rpackage("abind")
+    install_rpackage("foreach")
+    install_rpackage("doParallel")
+    source('src/R_NAGuideR/GSimp.R')
+
+  }
+  else{
+    stop(paste("Unspported methods so far: ", method))
+  }
+  # This script only installs packages: there is no data frame to return
+  invisible(NULL)
+}
+
+
+for (package in packages_base_R) {
+  # Load the base package, installing it first if it is missing
+  install_rpackage(pkg = package)
+}
+
+
+# + vscode={"languageId": "r"}
+methods = unlist(strsplit(methods, split = ","))
+for (package in methods) {
+  # Check if the package is already installed, otherwise install
+  nafunctions(method = package)
+}
+
diff --git a/project/01_0_install_R_packages.ipynb b/project/01_0_install_R_packages.ipynb
new file mode 100644
index 000000000..7fd518f90
--- /dev/null
+++ b/project/01_0_install_R_packages.ipynb
@@ -0,0 +1,219 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e470a0a5",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ],
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "methods = 'KNN_IMPUTE,msImpute'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ffb0a1b",
+   "metadata": {
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-input"
+    ],
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# options(\"install.lock\"=FALSE)\n",
+    "\n",
+    "packages_base_R <-\n",
+    "  c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n",
+    "\n",
+    "install_rpackage <- function(pkg) {\n",
+    "  # Try to load pkg; if that fails, install it with install.packages() and load it\n",
+    "  if (!require(pkg, character.only = TRUE)) {\n",
+    "    install.packages(pkg)\n",
+    "    library(pkg, character.only = TRUE)\n",
+    "  }\n",
+    "  \n",
+    "}\n",
+    "\n",
+    "# Same as install_rpackage, but installs through BiocManager::install()\n",
+    "install_bioconductor <- function(pkg) {\n",
+    "  # Try to load pkg; if that fails, install it via BiocManager and load it\n",
+    "  if (!require(pkg, character.only = TRUE)) {\n",
+    "    BiocManager::install(pkg)\n",
+    "    library(pkg, character.only = TRUE)\n",
+    "  }\n",
+    "  \n",
+    "}\n",
+    "\n",
+    "# Install (or source) what the given NAGuideR method needs; name is case-insensitive\n",
+    "nafunctions <- function(method = \"zero\") {\n",
+    "  method <- tolower(method)\n",
+    "  if (method == \"zero\") {\n",
+    "  }\n",
+    "  else if (method == \"minimum\") {\n",
+    "  }\n",
+    "  else if (method == \"colmedian\") {\n",
+    "    install_rpackage('e1071')\n",
+    "  }\n",
+    "  else if (method == \"rowmedian\") {\n",
+    "    install_rpackage('e1071')\n",
+    "  }\n",
+    "  else if (method == \"knn_impute\") {\n",
+    "    install_bioconductor('impute')\n",
+    "  }\n",
+    "  else if (method == \"seqknn\") {\n",
+    "    if (!require(SeqKnn)) {\n",
+    "      install.packages(\"src/R_NAGuideR/SeqKnn_1.0.1.tar.gz\",\n",
+    "                       repos = NULL,\n",
+    "                       type = \"source\")\n",
+    "    }\n",
+    "  }\n",
+    "  else if (method == \"bpca\") {\n",
+    "    install_bioconductor('pcaMethods')\n",
+    "  }\n",
+    "  else if (method == \"svdmethod\") {\n",
+    "    install_bioconductor('pcaMethods')\n",
+    "  }\n",
+    "  else if (method == \"lls\") {\n",
+    "    install_bioconductor('pcaMethods')\n",
+    "  }\n",
+    "  else if (method == \"mle\") {\n",
+    "    install_rpackage('norm')\n",
+    "  }\n",
+    "  else if (method == \"qrilc\") {\n",
+    "    install_bioconductor(\"impute\")\n",
+    "    install_bioconductor(\"pcaMethods\")\n",
+    "    install_rpackage('gmm')\n",
+    "    install_rpackage('imputeLCMD')\n",
+    "  }\n",
+    "  else if (method == \"mindet\") {\n",
+    "    install_bioconductor(\"impute\")\n",
+    "    install_bioconductor(\"pcaMethods\")\n",
+    "    install_rpackage('gmm')\n",
+    "    install_rpackage('imputeLCMD')\n",
+    "  }\n",
+    "  else if (method == \"minprob\") {\n",
+    "    install_bioconductor(\"impute\")\n",
+    "    install_bioconductor(\"pcaMethods\")\n",
+    "    install_rpackage('gmm')\n",
+    "    install_rpackage('imputeLCMD')\n",
+    "  }\n",
+    "  else if (method == \"irm\") {\n",
+    "    install_rpackage('VIM')\n",
+    "  }\n",
+    "  else if (method == \"impseq\") {\n",
+    "    install_rpackage('rrcovNA')\n",
+    "  }\n",
+    "  else if (method == \"impseqrob\") {\n",
+    "    install_rpackage('rrcovNA')\n",
+    "  }\n",
+    "  else if (method == \"mice-norm\") {\n",
+    "    install_rpackage('mice')\n",
+    "  }\n",
+    "  else if (method == \"mice-cart\") {\n",
+    "    install_rpackage('mice')\n",
+    "  }\n",
+    "  else if (method == \"trknn\") {\n",
+    "    source('src/R_NAGuideR/Imput_funcs.r')\n",
+    "  }\n",
+    "  else if (method == \"rf\") {\n",
+    "    install_rpackage(\"missForest\")\n",
+    "  }\n",
+    "  else if (method == \"pi\") {\n",
+    "  }\n",
+    "  # else if(method==\"grr\"){\n",
+    "  #   library(DreamAI)\n",
+    "  #   df<-impute.RegImpute(data=as.matrix(df1), fillmethod = \"row_mean\", maxiter_RegImpute = 10,conv_nrmse = 1e-03)\n",
+    "  # }\n",
+    "  else if (method == \"gms\") {\n",
+    "    # install.packages('GMSimpute')\n",
+    "    if (!require(GMSimpute)) {\n",
+    "      install.packages(\n",
+    "        \"src/R_NAGuideR/GMSimpute_0.0.1.1.tar.gz\",\n",
+    "        repos = NULL,\n",
+    "        type = \"source\"\n",
+    "      )\n",
+    "    }\n",
+    "  }\n",
+    "  else if (method == \"msimpute\") {\n",
+    "    install_bioconductor(\"msImpute\")\n",
+    "  }\n",
+    "  else if (method == \"msimpute_mnar\") {\n",
+    "    install_bioconductor(\"msImpute\")\n",
+    "  }\n",
+    "  else if (method == \"gsimp\") {\n",
+    "    options(stringsAsFactors = FALSE)\n",
+    "    # dependencies partly for sourced file\n",
+    "    \n",
+    "    install_bioconductor(\"impute\")\n",
+    "    install_bioconductor(\"pcaMethods\")\n",
+    "    install_rpackage('gmm')\n",
+    "    install_rpackage('imputeLCMD')\n",
+    "    install_rpackage(\"magrittr\")\n",
+    "    install_rpackage(\"glmnet\")\n",
+    "    install_rpackage(\"abind\")\n",
+    "    install_rpackage(\"foreach\")\n",
+    "    install_rpackage(\"doParallel\")\n",
+    "    source('src/R_NAGuideR/GSimp.R')\n",
+    "    \n",
+    "  } \n",
+    "  else{\n",
+    "    stop(paste(\"Unspported methods so far: \", method))\n",
+    "  }\n",
+    "  # This script only installs packages: there is no data frame to return\n",
+    "  invisible(NULL)\n",
+    "}\n",
+    "\n",
+    "\n",
+    "for (package in packages_base_R) {\n",
+    "  # Load the base package, installing it first if it is missing\n",
+    "  install_rpackage(pkg = package)\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d65b5d9",
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "methods = unlist(strsplit(methods, split = \",\"))\n",
+    "for (package in methods) {\n",
+    "  # Check if the package is already installed, otherwise install\n",
+    "  nafunctions(method = package)\n",
+    "}\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,R:light"
+  },
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "name": "R"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/project/config/alzheimer_study/config.yaml b/project/config/alzheimer_study/config.yaml
index 41eacff96..66e00ee0c 100644
--- a/project/config/alzheimer_study/config.yaml
+++ b/project/config/alzheimer_study/config.yaml
@@ -67,8 +67,8 @@ NAGuideR_methods:
   - MINIMUM
   - MINPROB
   - MLE
-# - MSIMPUTE
-# - MSIMPUTE_MNAR
+  - MSIMPUTE
+  - MSIMPUTE_MNAR
   - PI
   - QRILC
   - RF
diff --git a/project/workflow/Snakefile_v2.smk b/project/workflow/Snakefile_v2.smk
index d007df946..2cecf94eb 100644
--- a/project/workflow/Snakefile_v2.smk
+++ b/project/workflow/Snakefile_v2.smk
@@ -121,11 +121,26 @@ rule transform_NAGuideR_predictions:
         " -p dumps {params.dumps_as_str}"
         " && jupyter nbconvert --to html {output.nb}"
 
-
+rule install_R_package:
+    input:
+        nb="01_0_install_R_packages.ipynb",
+        # methods=','.join(config["NAGuideR_methods"]),
+    output:
+        nb="{folder_experiment}/01_0_install_R_packages.ipynb",
+    conda:
+        "envs/trainRmodels.yaml"
+    params:
+        methods=','.join(config["NAGuideR_methods"]),
+    shell:
+        "papermill {input.nb} {output.nb}"
+        " -r methods {params.methods}"
+
+
 rule train_NAGuideR_model:
     input:
         nb="01_1_train_NAGuideR_methods.ipynb",
         train_split="{folder_experiment}/data/data_wide_sample_cols.csv",
+        nb_setup="{folder_experiment}/01_0_install_R_packages.ipynb"
     output:
         nb="{folder_experiment}/01_1_train_NAGuideR_{method}.ipynb",
         dump="{folder_experiment}/preds/pred_all_{method}.csv",