From 886ce0edb8cab2807f630e6d85a9736b645d751b Mon Sep 17 00:00:00 2001 From: Adriaan van der Graaf Date: Mon, 7 Oct 2024 10:48:54 +0200 Subject: [PATCH 1/6] Building an R version of MR link 2 and testing for equivalence of likelihood function --- R/mr_link_2_functions.R | 72 +++++ tests/test_for_r_and_python_concordance.py | 314 +++++++++++++++++++++ 2 files changed, 386 insertions(+) create mode 100644 R/mr_link_2_functions.R create mode 100644 tests/test_for_r_and_python_concordance.py diff --git a/R/mr_link_2_functions.R b/R/mr_link_2_functions.R new file mode 100644 index 0000000..b0dfb43 --- /dev/null +++ b/R/mr_link_2_functions.R @@ -0,0 +1,72 @@ +mr_link2_loglik_reference_v2 <- function(th, lam, c_x, c_y, n_x, n_y) { + # Convert n_x and n_y to float + n_x <- as.numeric(n_x) + n_y <- as.numeric(n_y) + a <- th[1] + tX <- abs(th[2]) + tY <- abs(th[3]) + + Dyy <- 1 / (n_y * lam + tY) + + if (a != 0) { + Dxx <- 1 / (exp(log(a^2 * n_y + n_x) + log(lam)) + tX - + exp(log(a^2 * n_y^2 * lam^2) - log(n_y * lam + tY))) + Dxy <- -Dxx * a * exp(log(n_y * lam) - log(n_y * lam + tY)) + Dyy <- Dyy + exp(log(Dxx * (a^2 * n_y^2 * lam^2)) - (2 * log(n_y * lam + tY))) + asq_ny_sq_lam_sq_div_ny_lam_ty <- exp(log(a^2 * n_y^2 * lam^2) - log(n_y * lam + tY)) + } else { + Dxx <- 1 / (exp(log(n_x) + log(lam)) + tX) + Dxy <- -Dxx * a * exp(log(n_y * lam) - log(n_y * lam + tY)) + asq_ny_sq_lam_sq_div_ny_lam_ty <- 0 * lam + } + + dX <- n_x * c_x + a * n_y * c_y + dY <- n_y * c_y + m <- length(c_x) + + loglik <- -m * log(2 * pi) - + (1 / 2) * sum(log((a^2 * n_y + n_x) * lam + tX - asq_ny_sq_lam_sq_div_ny_lam_ty)) - + (1 / 2) * sum(log(n_y * lam + tY)) + + (1 / 2) * (sum(dX^2 * Dxx) + 2 * sum(dX * dY * Dxy) + sum(dY^2 * Dyy)) - + (n_x / 2) * sum((c_x^2) / lam) - + (n_y / 2) * sum((c_y^2) / lam) + + (m / 2) * (log(n_x) + log(n_y)) - sum(log(lam)) + (m / 2) * (log(tX) + log(tY)) + + return(-loglik) +} + +mr_link2_loglik_alpha_h0 <- function(th, lam, cX, cY, nX, nY) { + return(mr_link2_loglik_reference_v2(c(0, th[1], th[2]), lam, cX, cY, nX, nY)) +} + +mr_link2_loglik_sigma_y_h0 <- function(th, lam, c_x, c_y, n_x, n_y) { + n_x <- as.numeric(n_x) + n_y <- as.numeric(n_y) + a <- th[1] + tX <- abs(th[2]) + + Dyy <- rep(0, length(lam)) + + if (a != 0) { + Dxx <- 1 / (exp(log(a^2 * n_y + n_x) + log(lam)) + tX) + Dxy <- rep(0, length(lam)) + asq_ny_sq_lam_sq_div_ny_lam_ty <- rep(0, length(lam)) + } else { + Dxx <- 1 / (exp(log(n_x) + log(lam)) + tX) + Dxy <- rep(0, length(lam)) + asq_ny_sq_lam_sq_div_ny_lam_ty <- rep(0, length(lam)) + } + + dX <- n_x * c_x + a * n_y * c_y + dY <- n_y * c_y + m <- length(c_x) + + loglik <- -m * log(2 * pi) - + (1 / 2) * sum(log((a^2 * n_y + n_x) * lam + tX - asq_ny_sq_lam_sq_div_ny_lam_ty)) + + (1 / 2) * (sum(dX^2 * Dxx) + 2 * sum(dX * dY * Dxy) + sum(dY^2 * Dyy)) - + (n_x / 2) * sum((c_x^2) / lam) - + (n_y / 2) * sum((c_y^2) / lam) + + (m / 2) * (log(n_x) + log(n_y)) - sum(log(lam)) + (m / 2) * log(tX) + + return(-loglik) +} \ No newline at end of file diff --git a/tests/test_for_r_and_python_concordance.py b/tests/test_for_r_and_python_concordance.py new file mode 100644 index 0000000..083f1fb --- /dev/null +++ b/tests/test_for_r_and_python_concordance.py @@ -0,0 +1,314 @@ +import pytest +import sys +import os + +## For the python part + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from mr_link_2_standalone import * + + +## for the R part + +from rpy2.robjects import r, numpy2ri +from rpy2.robjects.packages import importr + +# Activate automatic conversion between numpy and R objects +numpy2ri.activate() + +# Load the base R package +base = importr('base') +# Source the R file with the functions +r['source']('../R/mr_link_2_functions.R') + +""" +Now, we perform unit tests on the MR-link-2 functions. to ensure that there is no creep of the +First function v0 +""" +def test_mr_link2_loglik_reference_v0_basic_r_concordance(): + th = np.array([0.1, 0.01, 0.01]) + lam = np.array([1, 2, 3]) + c_x = np.array([1, 1, 1]) + c_y = np.array([2, 2, 2]) + n_x = 100 + n_y = 100 + + expected_result = 22.944216071347796 + python_result = mr_link2_loglik_reference_v0(th, lam, c_x, c_y, n_x, n_y) + + assert np.isclose(python_result, expected_result) + assert np.isclose(python_result, mr_link2_loglik_reference_v2(th, lam, c_x, c_y, n_x, n_y)) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(r_result[0], expected_result) + + +def test_mr_link2_loglik_reference_v0_zero_lam_r_concordance(): + th = np.array([0.1, 0.01, 0.01]) + lam = np.array([0, 0, 0]) + c_x = np.array([1, 1, 1]) + c_y = np.array([2, 2, 2]) + n_x = 100 + n_y = 100 + + python_result = mr_link2_loglik_reference_v0(th, lam, c_x, c_y, n_x, n_y) + assert np.isnan(python_result) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isnan(r_result[0]) + +def test_mr_link2_loglik_reference_v0_large_values_r_concordance(): + th = np.array([0.5, 1e10, 1e10]) + lam = np.array([1e5, 1e5, 1e5]) + c_x = np.array([1e2, 1e2, 1e2]) + c_y = np.array([1e2, 1e2, 1e2]) + n_x = 1e6 + n_y = 1e6 + + expected_result = 17617.166270043643 + python_result = mr_link2_loglik_reference_v0(th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(python_result, expected_result) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(r_result[0], expected_result) + +def test_mr_link2_loglik_reference_v0_negative_values_r_concordance(): + th = np.array([0.1, -0.01, 0.01]) + lam = np.array([1, 2, 3]) + c_x = np.array([1, 1, 1]) + c_y = np.array([2, 2, 2]) + n_x = 100 + n_y = 100 + + expected_result = 22.944216071347796 + python_result = mr_link2_loglik_reference_v0(th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(python_result, expected_result) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(r_result[0], expected_result) + + +def test_mr_link2_loglik_reference_v2_basic_r_concordance(): + th = np.array([0.1, 0.01, 0.01]) + lam = np.array([1, 2, 3]) + c_x = np.array([1, 1, 1]) + c_y = np.array([2, 2, 2]) + n_x = 100 + n_y = 100 + + expected_result = 22.944216071347796 + python_result = mr_link2_loglik_reference_v2(th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(python_result, expected_result) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(r_result[0], expected_result) + + + +def test_mr_link2_loglik_reference_v2_zero_lam_r_concordance(): + th = np.array([0.1, 0.01, 0.01]) + lam = np.array([0, 0, 0]) + c_x = np.array([1, 1, 1]) + c_y = np.array([2, 2, 2]) + n_x = 100 + n_y = 100 + + python_result = mr_link2_loglik_reference_v2(th, lam, c_x, c_y, n_x, n_y) + assert np.isnan(python_result) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isnan(r_result[0]) + + +def test_mr_link2_loglik_reference_v2_large_values_r_concordance(): + th = np.array([0.5, 1e10, 1e10]) + lam = np.array([1e5, 1e5, 1e5]) + c_x = np.array([1e2, 1e2, 1e2]) + c_y = np.array([1e2, 1e2, 1e2]) + n_x = 1e6 + n_y = 1e6 + + expected_result = 17617.166270043643 + python_result = mr_link2_loglik_reference_v2(th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(python_result, expected_result) + + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(r_result[0], expected_result) + + +def test_mr_link2_loglik_reference_v2_negative_values_r_concordance(): + th = np.array([0.1, -0.01, 0.01]) + lam = np.array([1, 2, 3]) + c_x = np.array([1, 1, 1]) + c_y = np.array([2, 2, 2]) + n_x = 100 + n_y = 100 + + expected_result = 22.944216071347796 + python_result = mr_link2_loglik_reference_v2(th, lam, c_x, c_y, n_x, n_y) + + # Check the Python result against the expected value + assert np.isclose(python_result, expected_result) + + # Call the R function and check if it matches the expected value + r_result = r['mr_link2_loglik_reference_v2'](th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(r_result[0], expected_result) + + # Also check against the mr_link2_loglik_reference_v0 function + reference_result_v0 = mr_link2_loglik_reference_v0(th, lam, c_x, c_y, n_x, n_y) + assert np.isclose(python_result, reference_result_v0) + + +""" +As we do use them as functions, we want to make sure that the h0 functions also work correctly. +""" +def test_mr_link2_loglik_alpha_h0_basic_r_concordance(): + th = np.array([0.01, 0.01]) + lam = np.array([1, 2, 3]) + cX = np.array([1, 1, 1]) + cY = np.array([2, 2, 2]) + nX = 100 + nY = 100 + + python_result = mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY) + reference_result = mr_link2_loglik_reference_v2([0.0] + list(th), lam, cX, cY, nX, nY) + + assert np.isclose(python_result, reference_result) + + r_result = r['mr_link2_loglik_alpha_h0'](th, lam, cX, cY, nX, nY) + assert np.isclose(r_result[0], reference_result) + + +def test_mr_link2_loglik_alpha_h0_zero_lam_r_concordance(): + th = np.array([0.01, 0.01]) + lam = np.array([0, 0, 0]) + cX = np.array([1, 1, 1]) + cY = np.array([2, 2, 2]) + nX = 100 + nY = 100 + result = mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY) + assert np.isnan(result) + assert np.isnan(mr_link2_loglik_reference_v2([0.0] + list(th), lam, cX, cY, nX, nY)) +def test_mr_link2_loglik_alpha_h0_large_value_r_concordances(): + th = np.array([1e10, 1e10]) + lam = np.array([1e5, 1e5, 1e5]) + cX = np.array([1e2, 1e2, 1e2]) + cY = np.array([1e2, 1e2, 1e2]) + nX = 1e6 + nY = 1e6 + + # Call the Python function for alpha_h0 + python_result = mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY) + + # Call the Python reference function for comparison + reference_result = mr_link2_loglik_reference_v2([0.0] + list(th), lam, cX, cY, nX, nY) + + # Validate Python result + assert isinstance(python_result, float) + assert np.isclose(python_result, reference_result) + + # Call the R function for alpha_h0 and compare with the Python result + r_result = r['mr_link2_loglik_alpha_h0'](th, lam, cX, cY, nX, nY) + assert np.isclose(r_result[0], reference_result) + + +def test_mr_link2_loglik_alpha_h0_negative_values_r_concordance(): + th = np.array([0.01, -0.01]) + lam = np.array([1, 2, 3]) + cX = np.array([1, 1, 1]) + cY = np.array([2, 2, 2]) + nX = 100 + nY = 100 + + # Call the Python function for alpha_h0 + python_result = mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY) + + # Call the Python reference function for comparison + reference_result = mr_link2_loglik_reference_v2([0.0] + list(th), lam, cX, cY, nX, nY) + + # Validate Python result + assert isinstance(python_result, float) + assert np.isclose(python_result, reference_result) + + # Call the R function for alpha_h0 and compare with the Python result + r_result = r['mr_link2_loglik_alpha_h0'](th, lam, cX, cY, nX, nY) + assert np.isclose(r_result[0], reference_result) + + +def test_mr_link2_loglik_alpha_h0_large_population_r_concordance(): + th = np.array([0.01, 0.01]) + lam = np.array([1, 2, 3]) + cX = np.array([1, 1, 1]) + cY = np.array([2, 2, 2]) + nX = 1e12 + nY = 1e12 + + # Call the Python function for alpha_h0 + python_result = mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY) + + # Call the Python reference function for comparison + reference_result = mr_link2_loglik_reference_v2([0.0] + list(th), lam, cX, cY, nX, nY) + + # Validate Python result + assert isinstance(python_result, float) + assert np.isclose(python_result, reference_result) + + # Call the R function for alpha_h0 and compare with the Python result + r_result = r['mr_link2_loglik_alpha_h0'](th, lam, cX, cY, nX, nY) + assert np.isclose(r_result[0], reference_result) + + +""" +FUZZ TESTING many many many times. +""" +FUZZ_ITERATIONS = 10000 + + +def random_input_generator(): + """Generate random inputs for the test functions.""" + for _ in range(FUZZ_ITERATIONS): + th = np.random.uniform(-1e3, 1e3, size=3) + lam = np.random.uniform(1e-5, 1e3, size=500) + cX = np.random.uniform(-1e3, 1e3, size=500) + cY = np.random.uniform(-1e3, 1e3, size=500) + nX = np.random.uniform(50, 1e7) + nY = np.random.uniform(50, 1e7) + + yield th, lam, cX, cY, nX, nY + + +@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", random_input_generator()) +def test_fuzz_mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY): + """Fuzz testing for mr_link2_loglik_reference_v2 between Python and R implementations.""" + # Python function result + python_result = mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY) + + # Call R function + r_result = r['mr_link2_loglik_reference_v2'](th, lam, cX, cY, nX, nY) + + # Convert R result to Python float for comparison + r_result_py = float(r_result[0]) + + # Compare both results using np.isclose() + assert np.isclose(python_result, r_result_py, rtol=1e-5, + atol=1e-8), f"Mismatch for th={th}, lam={lam}, cX={cX}, cY={cY}, nX={nX}, nY={nY}" + + +@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", random_input_generator()) +def test_fuzz_mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY): + """Fuzz testing for mr_link2_loglik_alpha_h0 between Python and R implementations.""" + # Use only the first two elements of 'th' for alpha_h0 + th_h0 = th[:2] + + # Python function result + python_result = mr_link2_loglik_alpha_h0(th_h0, lam, cX, cY, nX, nY) + + # Call R function + r_result = r['mr_link2_loglik_alpha_h0'](th_h0, lam, cX, cY, nX, nY) + + # Convert R result to Python float for comparison + r_result_py = float(r_result[0]) + + # Compare both results using np.isclose() + assert np.isclose(python_result, r_result_py, rtol=1e-5, + atol=1e-8), f"Mismatch for th={th}, lam={lam}, cX={cX}, cY={cY}, nX={nX}, nY={nY}" \ No newline at end of file From 4d0421fcff82078879f498ed98ee4f810db03329 Mon Sep 17 00:00:00 2001 From: Adriaan van der Graaf Date: Mon, 7 Oct 2024 10:54:18 +0200 Subject: [PATCH 2/6] Building an R version of MR link 2 and testing for equivalence of likelihood function --- tests/test_for_r_and_python_concordance.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_for_r_and_python_concordance.py b/tests/test_for_r_and_python_concordance.py index 083f1fb..67e32f0 100644 --- a/tests/test_for_r_and_python_concordance.py +++ b/tests/test_for_r_and_python_concordance.py @@ -311,4 +311,20 @@ def test_fuzz_mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY): # Compare both results using np.isclose() assert np.isclose(python_result, r_result_py, rtol=1e-5, - atol=1e-8), f"Mismatch for th={th}, lam={lam}, cX={cX}, cY={cY}, nX={nX}, nY={nY}" \ No newline at end of file + atol=1e-8), f"Mismatch for th={th}, lam={lam}, cX={cX}, cY={cY}, nX={nX}, nY={nY}" + + + +@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", [ + (np.array([1e-10, 1e-10, 1e-10]), np.array([1e-10, 1e-10, 1e-10]), np.array([1e-10, 1e-10, 1e-10]), np.array([1e-10, 1e-10, 1e-10]), 1e10, 1e10), + (np.array([1e10, 1e10, 1e10]), np.array([1e5, 1e5, 1e5]), np.array([1e2, 1e2, 1e2]), np.array([1e2, 1e2, 1e2]), 1e10, 1e5), + (np.array([0, 0, 0]), np.array([1, 2, 3]), np.array([0, 0, 0]), np.array([0, 0, 0]), 100, 100), + # Add other extreme cases here... +]) +def test_edge_cases_mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY): + """Test edge cases for equivalence between Python and R implementations.""" + python_result = mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY) + r_result = r['mr_link2_loglik_reference_v2'](th, lam, cX, cY, nX, nY) + r_result_py = float(r_result[0]) + + assert np.isclose(python_result, r_result_py, rtol=1e-5, atol=1e-8) \ No newline at end of file From 73fb48ded69ceb4b898978a100de46ca04ec7e8c Mon Sep 17 00:00:00 2001 From: Adriaan van der Graaf Date: Mon, 7 Oct 2024 13:50:16 +0200 Subject: [PATCH 3/6] Added integration tests of MR-link-2 in R up to 2.5% tolerance --- .github/workflows/test.yml | 22 +++- R/mr_link_2_functions.R | 138 +++++++++++++++++++++ tests/test_for_r_and_python_concordance.py | 107 +++++++++++++++- 3 files changed, 258 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 51781e3..4eb9177 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,18 +1,34 @@ name: Test on: [push, pull_request] + jobs: test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - name: Set up Python uses: actions/setup-python@v2 with: python-version: '3.x' - - name: Install dependencies + + - name: Install R + run: | + sudo apt-get update + sudo apt-get install -y r-base r-base-dev + + - name: Cache pip dependencies + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install Python dependencies run: | python -m pip install --upgrade pip - pip install numpy scipy pandas pytest bitarray duckdb pyarrow + pip install numpy scipy pandas pytest bitarray duckdb pyarrow rpy2 - name: Download and install plink run: | @@ -26,4 +42,4 @@ jobs: run: pytest tests/unit_tests_mr_link_2.py - name: Run integration tests - run: pytest tests/integration_tests_mr_link_2.py + run: pytest tests/integration_tests_mr_link_2.py \ No newline at end of file diff --git a/R/mr_link_2_functions.R b/R/mr_link_2_functions.R index b0dfb43..09a6b12 100644 --- a/R/mr_link_2_functions.R +++ b/R/mr_link_2_functions.R @@ -69,4 +69,142 @@ mr_link2_loglik_sigma_y_h0 <- function(th, lam, c_x, c_y, n_x, n_y) { (m / 2) * (log(n_x) + log(n_y)) - sum(log(lam)) + (m / 2) * log(tX) return(-loglik) +} + + + +mr_link2 <- function(selected_eigenvalues, selected_eigenvectors, + exposure_betas, outcome_betas, + n_exp, n_out, sigma_exp_guess, sigma_out_guess) { + + start_time <- Sys.time() + + # Define optimization options + method <- "Nelder-Mead" + control <- list(maxit = 100000) + + # Calculate c_x and c_y + c_x <- t(selected_eigenvectors) %*% exposure_betas + c_y <- t(selected_eigenvectors) %*% outcome_betas + + max_sigma <- sqrt(.Machine$double.xmax) + + # Alpha h0 estimation + alpha_h0_guesses <- list(c(sigma_exp_guess, sigma_out_guess), + c(max_sigma, max_sigma), + c(1, max_sigma), + c(1e3, 1e3)) + + alpha_h0_results <- optim(par = alpha_h0_guesses[[1]], + fn = mr_link2_loglik_alpha_h0, + gr = NULL, + selected_eigenvalues, c_x, c_y, n_exp, n_out, + method = method, control = control) + + # Loop over other guesses + for (alpha_h0_guess in alpha_h0_guesses[-1]) { + if (alpha_h0_results$convergence == 0) break + + new_alpha_h0_results <- optim(par = alpha_h0_guess, + fn = mr_link2_loglik_alpha_h0, + gr = NULL, + selected_eigenvalues, c_x, c_y, n_exp, n_out, + method = method, control = control) + + if (alpha_h0_results$value >= new_alpha_h0_results$value) { + alpha_h0_results <- new_alpha_h0_results + } + } + + # Sigma_y estimation + sigma_y_guesses <- list(c(0.0, sigma_exp_guess), + c(1.0, sigma_exp_guess), + c(0.0, alpha_h0_results$par[1]), + c(1.0, alpha_h0_results$par[1]), + c(0.0, max_sigma), + c(1e-10, max_sigma)) + + sigma_y_h0_results <- optim(par = sigma_y_guesses[[1]], + fn = mr_link2_loglik_sigma_y_h0, + gr = NULL, + selected_eigenvalues, c_x, c_y, n_exp, n_out, + method = method, control = control) + + for (sigma_y_guess in sigma_y_guesses[-1]) { + if (sigma_y_h0_results$convergence == 0) break + + new_sigma_y_h0_results <- optim(par = sigma_y_guess, + fn = mr_link2_loglik_sigma_y_h0, + gr = NULL, + selected_eigenvalues, c_x, c_y, n_exp, n_out, + method = method, control = control) + + if (new_sigma_y_h0_results$value < sigma_y_h0_results$value) { + sigma_y_h0_results <- new_sigma_y_h0_results + } + } + + # Ha estimation + ha_guesses <- list(c(0.0, alpha_h0_results$par[1], alpha_h0_results$par[2]), + c(sigma_y_h0_results$par[1], sigma_y_h0_results$par[2], sqrt(.Machine$double.xmax)), + c(1.0, alpha_h0_results$par[1], alpha_h0_results$par[2]), + c(1e-10, max_sigma, max_sigma)) + + ha_results <- optim(par = ha_guesses[[1]], + fn = mr_link2_loglik_reference_v2, + gr = NULL, + selected_eigenvalues, c_x, c_y, n_exp, n_out, + method = method, control = control) + + for (ha_guess in ha_guesses[-1]) { + if (ha_results$convergence == 0) break + + new_ha_result <- optim(par = ha_guess, + fn = mr_link2_loglik_reference_v2, + gr = NULL, + selected_eigenvalues, c_x, c_y, n_exp, n_out, + method = method, control = control) + + if (new_ha_result$value < ha_results$value) { + ha_results <- new_ha_result + } + } + + # Likelihood Ratio Test and Estimation + alpha <- ha_results$par[1] + alpha_chi_sq <- 2 * (alpha_h0_results$value - ha_results$value) + alpha_p_val <- pchisq(alpha_chi_sq, df = 1, lower.tail = FALSE) + z_alpha <- ifelse(alpha_chi_sq <= 0, 0.0, sign(alpha) * sqrt(alpha_chi_sq)) + se_alpha <- ifelse(z_alpha != 0, alpha / z_alpha, NA) + + sigma_y <- 1 / abs(ha_results$par[3]) + sigma_y_chi_sq <- 2 * (sigma_y_h0_results$value - ha_results$value) + sigma_y_p_val <- pchisq(sigma_y_chi_sq, df = 1, lower.tail = FALSE) + z_sigma_y <- ifelse(sigma_y_chi_sq <= 0, 0.0, sqrt(sigma_y_chi_sq)) + se_sigma_y <- ifelse(z_sigma_y != 0, sigma_y / z_sigma_y, NA) + + # Return results as a list (equivalent to Python's dictionary) + list( + alpha = alpha, + `se(alpha)` = se_alpha, + `p(alpha)` = alpha_p_val, + sigma_y = sigma_y, + `se(sigma_y)` = se_sigma_y, + `p(sigma_y)` = sigma_y_p_val, + sigma_x = 1 / abs(ha_results$par[2]), + alpha_h0_sigma_x = 1 / abs(alpha_h0_results$par[1]), + alpha_h0_sigma_y = 1 / abs(alpha_h0_results$par[2]), + alpha_h0_loglik = alpha_h0_results$value, + sigma_y_h0_alpha = sigma_y_h0_results$par[1], + sigma_y_h0_sigma_x = 1 / abs(sigma_y_h0_results$par[2]), + sigma_y_h0_loglik = sigma_y_h0_results$value, + ha_loglik = ha_results$value, + optim_alpha_h0_success = alpha_h0_results$convergence == 0, + optim_alpha_h0_nit = alpha_h0_results$counts, + optim_sigma_y_h0_success = sigma_y_h0_results$convergence == 0, + optim_sigma_y_h0_nit = sigma_y_h0_results$counts, + optim_ha_success = ha_results$convergence == 0, + optim_ha_nit = ha_results$counts, + function_time = as.numeric(difftime(Sys.time(), start_time, units = "secs")) + ) } \ No newline at end of file diff --git a/tests/test_for_r_and_python_concordance.py b/tests/test_for_r_and_python_concordance.py index 67e32f0..be640b9 100644 --- a/tests/test_for_r_and_python_concordance.py +++ b/tests/test_for_r_and_python_concordance.py @@ -10,8 +10,11 @@ ## for the R part -from rpy2.robjects import r, numpy2ri +from rpy2.robjects import r, numpy2ri, default_converter +from rpy2.robjects.vectors import FloatVector from rpy2.robjects.packages import importr +from rpy2.robjects.conversion import localconverter + # Activate automatic conversion between numpy and R objects numpy2ri.activate() @@ -261,10 +264,10 @@ def test_mr_link2_loglik_alpha_h0_large_population_r_concordance(): """ FUZZ TESTING many many many times. """ -FUZZ_ITERATIONS = 10000 +FUZZ_ITERATIONS = 1_000 -def random_input_generator(): +def random_input_generator_likelihood_functions(): """Generate random inputs for the test functions.""" for _ in range(FUZZ_ITERATIONS): th = np.random.uniform(-1e3, 1e3, size=3) @@ -277,7 +280,7 @@ def random_input_generator(): yield th, lam, cX, cY, nX, nY -@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", random_input_generator()) +@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", random_input_generator_likelihood_functions()) def test_fuzz_mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY): """Fuzz testing for mr_link2_loglik_reference_v2 between Python and R implementations.""" # Python function result @@ -294,7 +297,7 @@ def test_fuzz_mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY): atol=1e-8), f"Mismatch for th={th}, lam={lam}, cX={cX}, cY={cY}, nX={nX}, nY={nY}" -@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", random_input_generator()) +@pytest.mark.parametrize("th, lam, cX, cY, nX, nY", random_input_generator_likelihood_functions()) def test_fuzz_mr_link2_loglik_alpha_h0(th, lam, cX, cY, nX, nY): """Fuzz testing for mr_link2_loglik_alpha_h0 between Python and R implementations.""" # Use only the first two elements of 'th' for alpha_h0 @@ -327,4 +330,96 @@ def test_edge_cases_mr_link2_loglik_reference_v2(th, lam, cX, cY, nX, nY): r_result = r['mr_link2_loglik_reference_v2'](th, lam, cX, cY, nX, nY) r_result_py = float(r_result[0]) - assert np.isclose(python_result, r_result_py, rtol=1e-5, atol=1e-8) \ No newline at end of file + assert np.isclose(python_result, r_result_py, rtol=1e-5, atol=1e-8) + + +""" +Integration tests of the R functions +""" + +FUZZ_TEST_ITERATIONS = 100 +def random_input_generator_mr_link_2_function(): + np.random.seed(42) + """Generate random inputs for fuzz testing.""" + for _ in range(FUZZ_TEST_ITERATIONS): + # Generate random 'selected_eigenvalues', 'selected_eigenvectors', 'exposure_betas', and 'outcome_betas' + selected_eigenvalues = np.random.uniform(0.1, 2.0, size=100) + selected_eigenvectors = np.random.normal(size=(100, 100)) + exposure_betas = np.random.normal(size=100) + outcome_betas = np.random.normal(size=100) + n_exp = np.random.randint(500, 5000) # Randomize number of individuals + n_out = np.random.randint(500, 5000) # Randomize number of individuals + sigma_exp_guess = np.random.uniform(0.1, 5.0) # Randomize sigma guesses + sigma_out_guess = np.random.uniform(0.1, 5.0) + + yield selected_eigenvalues, selected_eigenvectors, exposure_betas, outcome_betas, n_exp, n_out, sigma_exp_guess, sigma_out_guess + + +@pytest.mark.parametrize( + "selected_eigenvalues, selected_eigenvectors, exposure_betas, outcome_betas, n_exp, n_out, sigma_exp_guess, sigma_out_guess", + random_input_generator_mr_link_2_function()) +def test_mr_link2_equivalence_fuzz(selected_eigenvalues, selected_eigenvectors, exposure_betas, outcome_betas, n_exp, + n_out, sigma_exp_guess, sigma_out_guess): + """ + + Fuzz testing for MR-Link2 equivalence between Python and R implementations. + + Currently only testing until a threshold of 2.5% difference. If testing more samples, this may be exceeded. + + It seems that there are differences between the optimizers which I cannot resolve easily for now. + That being said, the p values will be withing 5% of each other, so a python p value of 0.1 will be found within + the (0.095 - 0.105) interval. + + """ + + # Run Python version of mr_link2 + python_result = mr_link2( + selected_eigenvalues=selected_eigenvalues, + selected_eigenvectors=selected_eigenvectors, + exposure_betas=exposure_betas, + outcome_betas=outcome_betas, + n_exp=n_exp, + n_out=n_out, + sigma_exp_guess=sigma_exp_guess, + sigma_out_guess=sigma_out_guess + ) + + # Convert inputs to R-compatible types and run R version of mr_link2 + with localconverter(default_converter + numpy2ri.converter): + r_selected_eigenvalues = r.matrix(selected_eigenvalues, nrow=100, ncol=1) + r_selected_eigenvectors = r.matrix(selected_eigenvectors, nrow=100, ncol=100) + r_exposure_betas = r.matrix(exposure_betas, nrow=100, ncol=1) + r_outcome_betas = r.matrix(outcome_betas, nrow=100, ncol=1) + r_n_exp = n_exp + r_n_out = n_out + r_sigma_exp_guess = sigma_exp_guess + r_sigma_out_guess = sigma_out_guess + + # Call the R function + r_result = r['mr_link2']( + r_selected_eigenvalues, r_selected_eigenvectors, + r_exposure_betas, r_outcome_betas, + r_n_exp, r_n_out, r_sigma_exp_guess, r_sigma_out_guess + ) + + # Convert R result back to Python dictionary format + r_result_dict = {str(key): value[0] for key, value in r_result.items()} + + optimization_indicators = ['optim_alpha_h0_success', 'optim_sigma_y_h0_success', 'optim_ha_success'] + for result in [python_result, r_result_dict]: + for indicator in optimization_indicators: + if not result[indicator]: + print('No correct optimization.') + return + + # Define which results to compare with high precision + high_precision_keys = ['alpha_h0_loglik', 'ha_loglik', ] + for key in high_precision_keys: + assert np.isclose(python_result[key], r_result_dict[key], rtol=1e-4, + atol=1e-8), f"Mismatch for {key}: Python={python_result[key]}, R={r_result_dict[key]}" + + # Define which results to compare with low precision + low_precision_keys = ['alpha', 'sigma_y', 'sigma_x','p(alpha)', 'p(sigma_y)'] + for key in low_precision_keys: + assert np.isclose(python_result[key], r_result_dict[key], rtol=2.5e-2, ## this is pretty suboptimal, but as this also includes P values, there will not be a crazy amount of difference here. + atol=1e-8), f"Mismatch for {key}: Python={python_result[key]}, R={r_result_dict[key]}" From ae3eb0b9811bb3de3708eeb96d58230640335767 Mon Sep 17 00:00:00 2001 From: Adriaan van der Graaf Date: Mon, 7 Oct 2024 15:26:31 +0200 Subject: [PATCH 4/6] MR-link-2 for R in working condition. Not as featureful as the python version, but will produce the same results --- R/mr_link_2_functions.R | 81 +++++++++++++++++++- tests/integration_tests_mr_link_2.py | 5 -- tests/test_for_r_and_python_concordance.py | 86 +++++++++++++++++++++- 3 files changed, 164 insertions(+), 8 deletions(-) diff --git a/R/mr_link_2_functions.R b/R/mr_link_2_functions.R index 09a6b12..a150837 100644 --- a/R/mr_link_2_functions.R +++ b/R/mr_link_2_functions.R @@ -81,7 +81,7 @@ mr_link2 <- function(selected_eigenvalues, selected_eigenvectors, # Define optimization options method <- "Nelder-Mead" - control <- list(maxit = 100000) + control <- list(maxit = 300) # Calculate c_x and c_y c_x <- t(selected_eigenvectors) %*% exposure_betas @@ -207,4 +207,83 @@ mr_link2 <- function(selected_eigenvalues, selected_eigenvectors, optim_ha_nit = ha_results$counts, function_time = as.numeric(difftime(Sys.time(), start_time, units = "secs")) ) +} + +remove_highly_correlated <- function(ld_matrix, snp_ordering, max_correlation) { + # Remove NAs from the correlation matrix + ld_matrix[is.na(ld_matrix)] <- 0 + + # Initialize a logical vector to mark rows/columns to remove + idxs_to_remove <- rep(FALSE, nrow(ld_matrix)) + + # Find pairs of SNPs that are highly correlated + correlated_indices <- which(abs(ld_matrix) >= max_correlation, arr.ind = TRUE) + + to_remove <- vector("logical", length = nrow(ld_matrix)) + + # Loop through correlated pairs and mark SNPs to remove + for (i in seq_len(nrow(correlated_indices))) { + a <- correlated_indices[i, 1] + b <- correlated_indices[i, 2] + + if ((!to_remove[a] && !to_remove[b] ) && (a != b)) { + to_remove[b] <- TRUE + } + } + + # Remove highly correlated SNPs from both the matrix and the SNP list + ld_matrix <- ld_matrix[!to_remove, !to_remove] + pruned_snps <- snp_ordering[!to_remove] + + return(list(ld_matrix = ld_matrix, pruned_snps = pruned_snps)) +} + +mr_link2_analysis <- function(exposure_betas, outcome_betas, ld_matrix, n_exp, n_out, max_correlation = 0.99) { + # Prune highly correlated SNPs + snp_ordering <- seq_len(nrow(ld_matrix)) # Simple numbering for SNPs + pruned_data <- remove_highly_correlated(ld_matrix, snp_ordering, max_correlation) + pruned_ld_matrix <- pruned_data$ld_matrix + pruned_snps <- pruned_data$pruned_snps + + # Eigenvalue decomposition + eigen_decomp <- eigen(pruned_ld_matrix) + eigenvalues <- eigen_decomp$values + eigenvectors <- eigen_decomp$vectors + + # Calculate cumulative variance explained + variance_explained <- cumsum(eigenvalues) / sum(eigenvalues) + + # Select the eigenvectors that explain at least 99% of the variance + threshold_index <- min(which(variance_explained >= 0.99)) + selected_eigenvectors <- eigenvectors[, 1:threshold_index] + selected_eigenvalues <- eigenvalues[1:threshold_index] + + # Call the existing MR-link-2 function + # Assuming `mr_link2` is already defined in R and takes the following arguments: + # - selected_eigenvalues + # - selected_eigenvectors + # - exposure_betas + # - outcome_betas + # - n_exp (number of individuals in exposure dataset) + # - n_out (number of individuals in outcome dataset) + sigma_exp_guess <- 0.01 # You can adjust or estimate this as needed + sigma_out_guess <- 0.001 # You can adjust or estimate this as needed + + # Subset the betas based on pruned SNPs + exposure_betas_pruned <- exposure_betas[pruned_snps] + outcome_betas_pruned <- outcome_betas[pruned_snps] + + # Perform the MR-link-2 estimation + mr_result <- mr_link2( + selected_eigenvalues = selected_eigenvalues, + selected_eigenvectors = selected_eigenvectors, + exposure_betas = exposure_betas_pruned, + outcome_betas = outcome_betas_pruned, + n_exp = n_exp, + n_out = n_out, + sigma_exp_guess = sigma_exp_guess, + sigma_out_guess = sigma_out_guess + ) + + return(mr_result) } \ No newline at end of file diff --git a/tests/integration_tests_mr_link_2.py b/tests/integration_tests_mr_link_2.py index d8f3157..0f0945b 100644 --- a/tests/integration_tests_mr_link_2.py +++ b/tests/integration_tests_mr_link_2.py @@ -13,11 +13,6 @@ from mr_link_2_standalone import * -"""" -NB. These tests require us to have plink in your path. -So unfortunately I will not test them in the github workflow. -""" - def test_plink_version(): result = subprocess.run(['plink', '--version'], capture_output=True, text=True) assert result.returncode == 0, "PLINK is not installed or not in the PATH" diff --git a/tests/test_for_r_and_python_concordance.py b/tests/test_for_r_and_python_concordance.py index be640b9..00e2154 100644 --- a/tests/test_for_r_and_python_concordance.py +++ b/tests/test_for_r_and_python_concordance.py @@ -11,7 +11,8 @@ ## for the R part from rpy2.robjects import r, numpy2ri, default_converter -from rpy2.robjects.vectors import FloatVector +from rpy2.robjects.vectors import FloatVector, ListVector, StrVector + from rpy2.robjects.packages import importr from rpy2.robjects.conversion import localconverter @@ -22,7 +23,7 @@ # Load the base R package base = importr('base') # Source the R file with the functions -r['source']('../R/mr_link_2_functions.R') +r['source'](os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'R', 'mr_link_2_functions.R'))) """ Now, we perform unit tests on the MR-link-2 functions. to ensure that there is no creep of the @@ -423,3 +424,84 @@ def test_mr_link2_equivalence_fuzz(selected_eigenvalues, selected_eigenvectors, for key in low_precision_keys: assert np.isclose(python_result[key], r_result_dict[key], rtol=2.5e-2, ## this is pretty suboptimal, but as this also includes P values, there will not be a crazy amount of difference here. atol=1e-8), f"Mismatch for {key}: Python={python_result[key]}, R={r_result_dict[key]}" + + +def test_remove_highly_correlated(): + """Test that the highly correlated SNPs are correctly removed.""" + + # Example LD matrix with correlation values + ld_matrix = np.array([[1.0, 0.9, 0.1], + [0.9, 1.0, 0.05], + [0.1, 0.05, 1.0]]) + + snp_ordering = ["SNP1", "SNP2", "SNP3"] + max_correlation = 0.8 # Set the maximum correlation threshold + + with localconverter(numpy2ri.converter): + r_ld_matrix = r.matrix(ld_matrix, nrow=3) + # Wrap the snp_ordering list into a dictionary format for ListVector + r_snp_ordering = StrVector(snp_ordering) + + # Call the R function `remove_highly_correlated` + r_pruned_result = r['remove_highly_correlated'](r_ld_matrix, r_snp_ordering, max_correlation) + + # Extract results from R output + pruned_ld_matrix = np.array(r_pruned_result['ld_matrix']) + pruned_snps = list(r_pruned_result['pruned_snps']) + + # Expected pruned results (SNP2 is removed) + expected_pruned_ld_matrix = np.array([[1.0, 0.05], [0.05, 1.0]]) + expected_pruned_snps = ["SNP2", "SNP3"] + + # Assertions + assert np.allclose(pruned_ld_matrix, expected_pruned_ld_matrix), "LD matrix was not pruned correctly." + assert pruned_snps == expected_pruned_snps, f"SNP pruning failed. Expected {expected_pruned_snps}, got {pruned_snps}" + +# Test function for eigenvalue decomposition and MR-link-2 analysis +def test_mr_link2_analysis(): + """Test the MR-link-2 analysis on synthetic data.""" + + # Synthetic data + np.random.seed(42) + exposure_betas = np.random.normal(size=100) + outcome_betas = np.random.normal(size=100) + ld_matrix = np.random.normal(0, 1, (100, 100)) + ld_matrix = (ld_matrix + ld_matrix.T) / 2 # Make it symmetric + np.fill_diagonal(ld_matrix, 1) # Ensure diagonals are 1s (perfect correlation with self) + + n_exp = 1000 + n_out = 1000 + max_correlation = 0.9 + + # Convert inputs to R-compatible types + with localconverter(numpy2ri.converter): + r_exposure_betas = FloatVector(exposure_betas) + r_outcome_betas = FloatVector(outcome_betas) + r_ld_matrix = r.matrix(ld_matrix, nrow=100) + r_n_exp = n_exp + r_n_out = n_out + r_max_correlation = max_correlation + + # Call the R function `mr_link2_analysis` + r_mr_result = r['mr_link2_analysis'](r_exposure_betas, r_outcome_betas, r_ld_matrix, r_n_exp, r_n_out, + r_max_correlation) + + # Convert the result back to a Python-friendly format + mr_result_dict = dict(r_mr_result) + + # Check the output has expected keys and ranges + assert 'alpha' in mr_result_dict, "Key 'alpha' missing in MR result." + assert 'p(alpha)' in mr_result_dict, "Key 'p(alpha)' missing in MR result." + assert 'sigma_x' in mr_result_dict, "Key 'sigma_x' missing in MR result." + + # Example value checks (can be replaced with actual expected ranges based on real data) + assert np.isfinite(mr_result_dict['alpha']), "Alpha value is not finite." + assert 0 <= mr_result_dict['p(alpha)'] <= 1, "P-value for alpha is out of bounds." + assert mr_result_dict['sigma_x'] > 0, "Sigma_x should be positive." + + print("MR-link-2 result passed all checks.") + + +# Run the tests +if __name__ == '__main__': + pytest.main() From 22efce8ed3e14bdc804962e89cff982b65109dfe Mon Sep 17 00:00:00 2001 From: Adriaan van der Graaf Date: Mon, 7 Oct 2024 15:29:28 +0200 Subject: [PATCH 5/6] Add R tests to the workflow --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4eb9177..f231d93 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,4 +42,7 @@ jobs: run: pytest tests/unit_tests_mr_link_2.py - name: Run integration tests - run: pytest tests/integration_tests_mr_link_2.py \ No newline at end of file + run: pytest tests/integration_tests_mr_link_2.py + + - name: Run comparison_tests_with_r_imlementation + run: pytest tests/test_for_r_and_python_concordance.py \ No newline at end of file From 46abc501758f74d4c2111a35b872993a3e4bb7d4 Mon Sep 17 00:00:00 2001 From: Adriaan van der Graaf Date: Tue, 29 Oct 2024 14:02:16 +0100 Subject: [PATCH 6/6] Explicitly casting something to a list to solve a bug caused by a version difference. the 2.9 version (list-like) of bitarray has a different API as the 3.0 version (iterator) --- mr_link_2_standalone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mr_link_2_standalone.py b/mr_link_2_standalone.py index 70b78f9..4b10a3a 100644 --- a/mr_link_2_standalone.py +++ b/mr_link_2_standalone.py @@ -98,7 +98,7 @@ def read_list_of_snps_into_geno_array(self, snps_to_load: set, missing_encoding= for i in range(n_variants): bits = bitarray.bitarray(endian="little") bits.frombytes(byte_array[offset:(offset+bytes_per_variant)]) - array = bits.decode(self._decoder)[:self.n_individuals] + array = list(bits.decode(self._decoder))[:self.n_individuals] genotypes[:,i] = array offset+=bytes_per_variant