From b880194243959e521f0e21b0313fb710501aa895 Mon Sep 17 00:00:00 2001 From: Takanori Hayashi Date: Fri, 6 Dec 2024 11:42:03 +0900 Subject: [PATCH] Remove ddof parameter for pl.corr in Rust --- crates/polars-compute/src/var_cov.rs | 3 +-- crates/polars-lazy/src/tests/arity.rs | 4 ++-- crates/polars-ops/src/chunked_array/cov.rs | 4 ++-- .../src/dsl/function_expr/correlation.rs | 24 +++++++++---------- .../src/dsl/functions/correlation.rs | 14 ++++------- crates/polars-python/src/functions/lazy.rs | 8 +++---- 6 files changed, 24 insertions(+), 33 deletions(-) diff --git a/crates/polars-compute/src/var_cov.rs b/crates/polars-compute/src/var_cov.rs index 5bace1688bc6..6468580da38b 100644 --- a/crates/polars-compute/src/var_cov.rs +++ b/crates/polars-compute/src/var_cov.rs @@ -192,8 +192,7 @@ impl PearsonState { self.mean_y = new_mean_y; } - pub fn finalize(&self, _ddof: u8) -> f64 { - // The division by sample_weight - ddof on both sides cancels out. + pub fn finalize(&self) -> f64 { let denom = (self.dp_xx * self.dp_yy).sqrt(); if denom == 0.0 { f64::NAN diff --git a/crates/polars-lazy/src/tests/arity.rs b/crates/polars-lazy/src/tests/arity.rs index 740678af0af4..73450259c667 100644 --- a/crates/polars-lazy/src/tests/arity.rs +++ b/crates/polars-lazy/src/tests/arity.rs @@ -15,7 +15,7 @@ fn test_pearson_corr() -> PolarsResult<()> { .lazy() .group_by_stable([col("uid")]) // a double aggregation expression. - .agg([pearson_corr(col("day"), col("cumcases"), 1).alias("pearson_corr")]) + .agg([pearson_corr(col("day"), col("cumcases")).alias("pearson_corr")]) .collect()?; let s = out.column("pearson_corr")?.f64()?; assert!((s.get(0).unwrap() - 0.997176).abs() < 0.000001); @@ -25,7 +25,7 @@ fn test_pearson_corr() -> PolarsResult<()> { .lazy() .group_by_stable([col("uid")]) // a double aggregation expression. - .agg([pearson_corr(col("day"), col("cumcases"), 1) + .agg([pearson_corr(col("day"), col("cumcases")) .pow(2.0) .alias("pearson_corr")]) .collect() diff --git a/crates/polars-ops/src/chunked_array/cov.rs b/crates/polars-ops/src/chunked_array/cov.rs index dbfa6b48f4fb..5eac4fcdb4de 100644 --- a/crates/polars-ops/src/chunked_array/cov.rs +++ b/crates/polars-ops/src/chunked_array/cov.rs @@ -19,7 +19,7 @@ where } /// Compute the pearson correlation between two columns. -pub fn pearson_corr(a: &ChunkedArray, b: &ChunkedArray, ddof: u8) -> Option +pub fn pearson_corr(a: &ChunkedArray, b: &ChunkedArray) -> Option where T: PolarsNumericType, T::Native: AsPrimitive, @@ -30,5 +30,5 @@ where for (a, b) in a.downcast_iter().zip(b.downcast_iter()) { out.combine(&polars_compute::var_cov::pearson_corr(a, b)) } - Some(out.finalize(ddof)) + Some(out.finalize()) } diff --git a/crates/polars-plan/src/dsl/function_expr/correlation.rs b/crates/polars-plan/src/dsl/function_expr/correlation.rs index 0413bac9dc01..116375921b5a 100644 --- a/crates/polars-plan/src/dsl/function_expr/correlation.rs +++ b/crates/polars-plan/src/dsl/function_expr/correlation.rs @@ -27,11 +27,9 @@ impl Display for CorrelationMethod { pub(super) fn corr(s: &[Column], ddof: u8, method: CorrelationMethod) -> PolarsResult { match method { - CorrelationMethod::Pearson => pearson_corr(s, ddof), + CorrelationMethod::Pearson => pearson_corr(s), #[cfg(all(feature = "rank", feature = "propagate_nans"))] - CorrelationMethod::SpearmanRank(propagate_nans) => { - spearman_rank_corr(s, ddof, propagate_nans) - }, + CorrelationMethod::SpearmanRank(propagate_nans) => spearman_rank_corr(s, propagate_nans), CorrelationMethod::Covariance => covariance(s, ddof), } } @@ -61,7 +59,7 @@ fn covariance(s: &[Column], ddof: u8) -> PolarsResult { Ok(Column::new(name, &[ret])) } -fn pearson_corr(s: &[Column], ddof: u8) -> PolarsResult { +fn pearson_corr(s: &[Column]) -> PolarsResult { let a = &s[0]; let b = &s[1]; let name = PlSmallStr::from_static("pearson_corr"); @@ -69,24 +67,24 @@ fn pearson_corr(s: &[Column], ddof: u8) -> PolarsResult { use polars_ops::chunked_array::cov::pearson_corr; let ret = match a.dtype() { DataType::Float32 => { - let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32); + let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap()).map(|v| v as f32); return Ok(Column::new(name.clone(), &[ret])); }, - DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof), - DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap(), ddof), - DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap(), ddof), - DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap(), ddof), + DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap()), + DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap()), + DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap()), + DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap()), _ => { let a = a.cast(&DataType::Float64)?; let b = b.cast(&DataType::Float64)?; - pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof) + pearson_corr(a.f64().unwrap(), b.f64().unwrap()) }, }; Ok(Column::new(name, &[ret])) } #[cfg(all(feature = "rank", feature = "propagate_nans"))] -fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsResult { +fn spearman_rank_corr(s: &[Column], propagate_nans: bool) -> PolarsResult { use polars_core::utils::coalesce_nulls_columns; use polars_ops::chunked_array::nan_propagating_aggregate::nan_max_s; let a = &s[0]; @@ -134,5 +132,5 @@ fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsRes ) .into(); - pearson_corr(&[a_rank, b_rank], ddof) + pearson_corr(&[a_rank, b_rank]) } diff --git a/crates/polars-plan/src/dsl/functions/correlation.rs b/crates/polars-plan/src/dsl/functions/correlation.rs index 97e14f5df2f8..c3481783961f 100644 --- a/crates/polars-plan/src/dsl/functions/correlation.rs +++ b/crates/polars-plan/src/dsl/functions/correlation.rs @@ -20,15 +20,11 @@ pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr { } /// Compute the pearson correlation between two columns. -/// -/// # Arguments -/// * ddof -/// Delta degrees of freedom -pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr { +pub fn pearson_corr(a: Expr, b: Expr) -> Expr { let input = vec![a, b]; let function = FunctionExpr::Correlation { method: CorrelationMethod::Pearson, - ddof, + ddof: 0u8, }; Expr::Function { input, @@ -45,18 +41,16 @@ pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr { /// Compute the spearman rank correlation between two columns. /// Missing data will be excluded from the computation. /// # Arguments -/// * ddof -/// Delta degrees of freedom /// * propagate_nans /// If `true` any `NaN` encountered will lead to `NaN` in the output. /// If to `false` then `NaN` are regarded as larger than any finite number /// and thus lead to the highest rank. #[cfg(all(feature = "rank", feature = "propagate_nans"))] -pub fn spearman_rank_corr(a: Expr, b: Expr, ddof: u8, propagate_nans: bool) -> Expr { +pub fn spearman_rank_corr(a: Expr, b: Expr, propagate_nans: bool) -> Expr { let input = vec![a, b]; let function = FunctionExpr::Correlation { method: CorrelationMethod::SpearmanRank(propagate_nans), - ddof, + ddof: 0u8, }; Expr::Function { input, diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index 3a437b3281d3..5bc8819f3f85 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -504,8 +504,8 @@ pub fn map_mul( } #[pyfunction] -pub fn pearson_corr(a: PyExpr, b: PyExpr, ddof: u8) -> PyExpr { - dsl::pearson_corr(a.inner, b.inner, ddof).into() +pub fn pearson_corr(a: PyExpr, b: PyExpr) -> PyExpr { + dsl::pearson_corr(a.inner, b.inner).into() } #[pyfunction] @@ -537,10 +537,10 @@ pub fn repeat(value: PyExpr, n: PyExpr, dtype: Option>) -> PyResu } #[pyfunction] -pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, ddof: u8, propagate_nans: bool) -> PyExpr { +pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, propagate_nans: bool) -> PyExpr { #[cfg(feature = "propagate_nans")] { - dsl::spearman_rank_corr(a.inner, b.inner, ddof, propagate_nans).into() + dsl::spearman_rank_corr(a.inner, b.inner, propagate_nans).into() } #[cfg(not(feature = "propagate_nans"))] {