Skip to content

Commit 2a731bb

Browse files
authored
updating the rust clustering to be on par as Java (aws#373)
1 parent 3765c20 commit 2a731bb

12 files changed

+2264
-300
lines changed

Rust/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "rcf"
3-
version = "3.0.1"
3+
version = "3.3.0"
44
edition = "2021"
55
license = "Apache-2.0"
66

Rust/src/common/cluster.rs

+1,042-221
Large diffs are not rendered by default.

Rust/src/common/conditionalfieldsummarizer.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ impl FieldSummarizer {
135135
upper[j] = y[third.0].0;
136136
}
137137

138-
let summary = summarize(&vec, self.distance, self.max_number, false);
138+
let summary = summarize(&vec, self.distance, self.max_number, false).unwrap();
139139
SampleSummary {
140140
summary_points: summary.summary_points.clone(),
141141
relative_weight: summary.relative_weight.clone(),

Rust/src/common/multidimdatawithkey.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ fn next_element(mean: f32, scale: f32, rng: &mut ChaCha20Rng) -> f32 {
148148
}
149149
}
150150

151-
fn new_vec(mean: &[f32], scale: &[f32], rng: &mut ChaCha20Rng) -> Vec<f32> {
151+
pub fn new_vec(mean: &[f32], scale: &[f32], rng: &mut ChaCha20Rng) -> Vec<f32> {
152152
let dimensions = mean.len();
153153
let mut answer = Vec::new();
154154
for i in 0..dimensions {

Rust/src/common/samplesummary.rs

+126-52
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
use std::cmp::min;
2-
2+
use std::ops::Index;
3+
use std::slice::SliceIndex;
4+
use crate::types::Result;
5+
use crate::errors;
6+
use crate::util::check_argument;
37
use rand::{Rng, SeedableRng};
48
use rand_chacha::ChaCha20Rng;
9+
use rayon::range;
510

6-
use crate::common::cluster::{iterative_clustering, Center, Cluster};
11+
use crate::common::cluster::{Center, multi_cluster_as_weighted_obj, multi_cluster_as_weighted_ref, single_centroid_cluster_weighted_vec_with_distance_over_slices};
712

813
///
914
/// The goal of the summarization below is as follows: on being provided a collection of sampled weighted points
@@ -98,27 +103,27 @@ impl SampleSummary {
98103
}
99104

100105

101-
pub fn from_points(points: &[(Vec<f32>, f32)], lower_fraction: f64, upper_fraction:f64) -> Self {
102-
assert!(points.len() > 0, "cannot be empty list");
103-
assert!(lower_fraction < 0.5, " has to be less than half");
104-
assert!(upper_fraction > 0.5, "has to be larger than half");
105-
let dimensions = points[0].0.len();
106-
assert!(dimensions > 0, " cannot have 0 dimensions");
106+
pub fn from_points<Q>(dimensions: usize,points: &[(Q, f32)], lower_fraction: f64, upper_fraction:f64) -> Result<Self>
107+
where Q: Index<usize, Output = f32>
108+
{
109+
check_argument(points.len() > 0, "cannot be empty list")?;
110+
check_argument(lower_fraction < 0.5, " has to be less than half")?;
111+
check_argument(upper_fraction > 0.5, "has to be larger than half")?;
112+
check_argument(dimensions > 0, " cannot have 0 dimensions")?;
107113
let total_weight: f64 = points.iter().map(|x| x.1 as f64).sum();
108-
assert!(total_weight > 0.0, "weights cannot be all zero");
109-
assert!(total_weight.is_finite(), " cannot have infinite weights");
114+
check_argument(total_weight > 0.0, "weights cannot be all zero")?;
115+
check_argument(total_weight.is_finite(), " cannot have infinite weights")?;
110116
let mut mean = vec![0.0f32; dimensions];
111117
let mut deviation = vec![0.0f32; dimensions];
112118
let mut sum_values_sq = vec![0.0f64; dimensions];
113119
let mut sum_values = vec![0.0f64; dimensions];
114120
for i in 0..points.len() {
115-
assert!(points[i].0.len() == dimensions, "incorrect dimensions");
116-
assert!(points[i].1 >= 0.0, "point weights have to be non-negative");
121+
check_argument(points[i].1 >= 0.0, "point weights have to be non-negative")?;
117122
for j in 0..dimensions {
118-
assert!(
123+
check_argument(
119124
points[i].0[j].is_finite() && !points[i].0[j].is_nan(),
120125
" cannot have NaN or infinite values"
121-
);
126+
)?;
122127
sum_values[j] += points[i].1 as f64 * points[i].0[j] as f64;
123128
sum_values_sq[j] +=
124129
points[i].1 as f64 * points[i].0[j] as f64 * points[i].0[j] as f64;
@@ -147,7 +152,7 @@ impl SampleSummary {
147152
upper_vec[j] = y[third.0].0;
148153
}
149154

150-
SampleSummary {
155+
Ok(SampleSummary {
151156
summary_points: Vec::new(),
152157
relative_weight: Vec::new(),
153158
total_weight: total_weight as f32,
@@ -156,7 +161,68 @@ impl SampleSummary {
156161
lower: lower_vec,
157162
median,
158163
deviation,
164+
})
165+
}
166+
167+
pub fn from_references<Q>(dimensions: usize, points: &[(&Q, f32)], lower_fraction: f64, upper_fraction:f64) -> Result<Self>
168+
where Q:?Sized + Index<usize, Output = f32>
169+
{
170+
check_argument(points.len() > 0, "cannot be empty list")?;
171+
check_argument(lower_fraction < 0.5, " has to be less than half")?;
172+
check_argument(upper_fraction > 0.5, "has to be larger than half")?;
173+
check_argument(dimensions > 0, " cannot have 0 dimensions")?;
174+
let total_weight: f64 = points.iter().map(|x| x.1 as f64).sum();
175+
check_argument(total_weight > 0.0, "weights cannot be all zero")?;
176+
check_argument(total_weight.is_finite(), " cannot have infinite weights")?;
177+
let mut mean = vec![0.0f32; dimensions];
178+
let mut deviation = vec![0.0f32; dimensions];
179+
let mut sum_values_sq = vec![0.0f64; dimensions];
180+
let mut sum_values = vec![0.0f64; dimensions];
181+
for i in 0..points.len() {
182+
check_argument(points[i].1 >= 0.0, "point weights have to be non-negative")?;
183+
for j in 0..dimensions {
184+
check_argument(
185+
points[i].0[j].is_finite() && !points[i].0[j].is_nan(),
186+
" cannot have NaN or infinite values"
187+
)?;
188+
sum_values[j] += points[i].1 as f64 * points[i].0[j] as f64;
189+
sum_values_sq[j] +=
190+
points[i].1 as f64 * points[i].0[j] as f64 * points[i].0[j] as f64;
191+
}
192+
}
193+
for j in 0..dimensions {
194+
mean[j] = (sum_values[j] / total_weight) as f32;
195+
let t: f64 = sum_values_sq[j] / total_weight
196+
- sum_values[j] * sum_values[j] / (total_weight * total_weight);
197+
deviation[j] = f64::sqrt(if t > 0.0 { t } else { 0.0 }) as f32;
159198
}
199+
let mut median = vec![0.0f32; dimensions];
200+
let mut upper_vec = vec![0.0f32;dimensions];
201+
let mut lower_vec = vec![0.0f32;dimensions];
202+
let num = total_weight/2.0;
203+
let lower = total_weight * lower_fraction;
204+
let upper = total_weight * upper_fraction;
205+
for j in 0..dimensions {
206+
let mut y: Vec<(f32,f32)> = points.iter().map(|x| (x.0[j],x.1)).collect();
207+
y.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
208+
let first = Self::pick(&y,lower,0,0.0);
209+
lower_vec[j] = y[first.0].0;
210+
let second = Self::pick(&y,num,first.0,first.1);
211+
median[j] = y[second.0].0;
212+
let third = Self::pick(&y,upper,second.0,second.1);
213+
upper_vec[j] = y[third.0].0;
214+
}
215+
216+
Ok(SampleSummary {
217+
summary_points: Vec::new(),
218+
relative_weight: Vec::new(),
219+
total_weight: total_weight as f32,
220+
mean,
221+
upper: upper_vec,
222+
lower: lower_vec,
223+
median,
224+
deviation,
225+
})
160226
}
161227
}
162228

@@ -165,58 +231,66 @@ pub fn summarize(
165231
distance: fn(&[f32], &[f32]) -> f64,
166232
max_number: usize,
167233
parallel_enabled: bool,
168-
) -> SampleSummary {
169-
assert!(max_number < 51, " for large number of clusters, other methods may be better, consider recursively removing clusters");
170-
let mut summary = SampleSummary::from_points(&points,LOWER_FRACTION,UPPER_FRACTION);
234+
) -> Result<SampleSummary> {
235+
let dimensions = points[0].0.len();
236+
let mut summary = SampleSummary::from_points(dimensions,&points,LOWER_FRACTION,UPPER_FRACTION)?;
171237

172238
if max_number > 0 {
173-
let dimensions = points[0].0.len();
174239
let max_allowed = min(dimensions * MAX_NUMBER_PER_DIMENSION, max_number);
175-
let total_weight: f64 = points.iter().map(|x| x.1 as f64).sum();
176-
let mut rng = ChaCha20Rng::seed_from_u64(max_allowed as u64);
177240

178-
let mut sampled_points: Vec<(&[f32], f32)> = Vec::new();
179-
if points.len() < 5 * LENGTH_BOUND {
180-
for j in 0..points.len() {
181-
sampled_points.push((&points[j].0, points[j].1));
182-
}
183-
} else {
184-
let mut remainder = 0.0f64;
185-
for j in 0..points.len() {
186-
if points[j].1 > (total_weight / LENGTH_BOUND as f64) as f32 {
187-
sampled_points.push((&points[j].0, points[j].1));
188-
} else {
189-
remainder += points[j].1 as f64;
190-
}
191-
}
192-
for j in 0..points.len() {
193-
if points[j].1 <= (total_weight / LENGTH_BOUND as f64) as f32
194-
&& rng.gen::<f64>() < 5.0 * (LENGTH_BOUND as f64) / (points.len() as f64)
195-
{
196-
let t = points[j].1 as f64
197-
* (points.len() as f64 / (5.0 * LENGTH_BOUND as f64))
198-
* (remainder / total_weight);
199-
sampled_points.push((&points[j].0, t as f32));
200-
}
201-
}
241+
let mut list: Vec<Center> = single_centroid_cluster_weighted_vec_with_distance_over_slices(
242+
&points,
243+
distance,
244+
max_allowed,
245+
parallel_enabled,
246+
)?;
247+
list.sort_by(|o1, o2| o2.weight().partial_cmp(&o1.weight()).unwrap()); // decreasing order
248+
let mut summary_points: Vec<Vec<f32>> = Vec::new();
249+
let mut relative_weight: Vec<f32> = Vec::new();
250+
let center_sum: f64 = list.iter().map(|x| x.weight()).sum();
251+
for i in 0..list.len() {
252+
summary_points.push(list[i].representative().clone());
253+
relative_weight.push((list[i].weight() / center_sum) as f32);
202254
}
255+
summary.add_typical(summary_points, relative_weight);
256+
}
257+
return Ok(summary);
258+
}
203259

204-
let mut list: Vec<Center> = iterative_clustering(
205-
max_allowed,
206-
&sampled_points,
207-
Center::new,
260+
261+
pub fn multi_summarize_ref(
262+
points: &[(&[f32], f32)],
263+
distance: fn(&[f32], &[f32]) -> f64,
264+
number_of_representatives: usize,
265+
shrinkage : f32,
266+
max_number: usize,
267+
parallel_enabled: bool,
268+
) -> Result<SampleSummary> {
269+
let dimensions = points[0].0.len();
270+
let mut summary = SampleSummary::from_references(dimensions,points,LOWER_FRACTION,UPPER_FRACTION)?;
271+
272+
if max_number > 0 {
273+
let max_allowed = min(dimensions * MAX_NUMBER_PER_DIMENSION, max_number);
274+
275+
let mut list= multi_cluster_as_weighted_ref(
276+
&points,
208277
distance,
278+
number_of_representatives,
279+
shrinkage,
280+
false,
281+
max_allowed,
209282
parallel_enabled,
210-
);
283+
)?;
211284
list.sort_by(|o1, o2| o2.weight().partial_cmp(&o1.weight()).unwrap()); // decreasing order
212285
let mut summary_points: Vec<Vec<f32>> = Vec::new();
213286
let mut relative_weight: Vec<f32> = Vec::new();
214287
let center_sum: f64 = list.iter().map(|x| x.weight()).sum();
215288
for i in 0..list.len() {
216-
summary_points.push(list[i].primary_representative(distance).clone());
289+
summary_points.push(Vec::from(list[i].representatives()[0].0));
217290
relative_weight.push((list[i].weight() / center_sum) as f32);
218291
}
219292
summary.add_typical(summary_points, relative_weight);
220293
}
221-
return summary;
294+
295+
return Ok(summary);
222296
}

0 commit comments

Comments
 (0)