1
1
use std:: cmp:: min;
2
-
2
+ use std:: ops:: Index ;
3
+ use std:: slice:: SliceIndex ;
4
+ use crate :: types:: Result ;
5
+ use crate :: errors;
6
+ use crate :: util:: check_argument;
3
7
use rand:: { Rng , SeedableRng } ;
4
8
use rand_chacha:: ChaCha20Rng ;
9
+ use rayon:: range;
5
10
6
- use crate :: common:: cluster:: { iterative_clustering , Center , Cluster } ;
11
+ use crate :: common:: cluster:: { Center , multi_cluster_as_weighted_obj , multi_cluster_as_weighted_ref , single_centroid_cluster_weighted_vec_with_distance_over_slices } ;
7
12
8
13
///
9
14
/// The goal of the summarization below is as follows: on being provided a collection of sampled weighted points
@@ -98,27 +103,27 @@ impl SampleSummary {
98
103
}
99
104
100
105
101
- pub fn from_points ( points : & [ ( Vec < f32 > , f32 ) ] , lower_fraction : f64 , upper_fraction : f64 ) -> Self {
102
- assert ! ( points. len( ) > 0 , "cannot be empty list" ) ;
103
- assert ! ( lower_fraction < 0.5 , " has to be less than half" ) ;
104
- assert ! ( upper_fraction > 0.5 , "has to be larger than half" ) ;
105
- let dimensions = points[ 0 ] . 0 . len ( ) ;
106
- assert ! ( dimensions > 0 , " cannot have 0 dimensions" ) ;
106
+ pub fn from_points < Q > ( dimensions : usize , points : & [ ( Q , f32 ) ] , lower_fraction : f64 , upper_fraction : f64 ) -> Result < Self >
107
+ where Q : Index < usize , Output = f32 >
108
+ {
109
+ check_argument ( points. len ( ) > 0 , "cannot be empty list" ) ?;
110
+ check_argument ( lower_fraction < 0.5 , " has to be less than half" ) ?;
111
+ check_argument ( upper_fraction > 0.5 , "has to be larger than half" ) ?;
112
+ check_argument ( dimensions > 0 , " cannot have 0 dimensions" ) ?;
107
113
let total_weight: f64 = points. iter ( ) . map ( |x| x. 1 as f64 ) . sum ( ) ;
108
- assert ! ( total_weight > 0.0 , "weights cannot be all zero" ) ;
109
- assert ! ( total_weight. is_finite( ) , " cannot have infinite weights" ) ;
114
+ check_argument ( total_weight > 0.0 , "weights cannot be all zero" ) ? ;
115
+ check_argument ( total_weight. is_finite ( ) , " cannot have infinite weights" ) ? ;
110
116
let mut mean = vec ! [ 0.0f32 ; dimensions] ;
111
117
let mut deviation = vec ! [ 0.0f32 ; dimensions] ;
112
118
let mut sum_values_sq = vec ! [ 0.0f64 ; dimensions] ;
113
119
let mut sum_values = vec ! [ 0.0f64 ; dimensions] ;
114
120
for i in 0 ..points. len ( ) {
115
- assert ! ( points[ i] . 0 . len( ) == dimensions, "incorrect dimensions" ) ;
116
- assert ! ( points[ i] . 1 >= 0.0 , "point weights have to be non-negative" ) ;
121
+ check_argument ( points[ i] . 1 >= 0.0 , "point weights have to be non-negative" ) ?;
117
122
for j in 0 ..dimensions {
118
- assert ! (
123
+ check_argument (
119
124
points[ i] . 0 [ j] . is_finite ( ) && !points[ i] . 0 [ j] . is_nan ( ) ,
120
125
" cannot have NaN or infinite values"
121
- ) ;
126
+ ) ? ;
122
127
sum_values[ j] += points[ i] . 1 as f64 * points[ i] . 0 [ j] as f64 ;
123
128
sum_values_sq[ j] +=
124
129
points[ i] . 1 as f64 * points[ i] . 0 [ j] as f64 * points[ i] . 0 [ j] as f64 ;
@@ -147,7 +152,7 @@ impl SampleSummary {
147
152
upper_vec[ j] = y[ third. 0 ] . 0 ;
148
153
}
149
154
150
- SampleSummary {
155
+ Ok ( SampleSummary {
151
156
summary_points : Vec :: new ( ) ,
152
157
relative_weight : Vec :: new ( ) ,
153
158
total_weight : total_weight as f32 ,
@@ -156,7 +161,68 @@ impl SampleSummary {
156
161
lower : lower_vec,
157
162
median,
158
163
deviation,
164
+ } )
165
+ }
166
+
167
+ pub fn from_references < Q > ( dimensions : usize , points : & [ ( & Q , f32 ) ] , lower_fraction : f64 , upper_fraction : f64 ) -> Result < Self >
168
+ where Q : ?Sized + Index < usize , Output = f32 >
169
+ {
170
+ check_argument ( points. len ( ) > 0 , "cannot be empty list" ) ?;
171
+ check_argument ( lower_fraction < 0.5 , " has to be less than half" ) ?;
172
+ check_argument ( upper_fraction > 0.5 , "has to be larger than half" ) ?;
173
+ check_argument ( dimensions > 0 , " cannot have 0 dimensions" ) ?;
174
+ let total_weight: f64 = points. iter ( ) . map ( |x| x. 1 as f64 ) . sum ( ) ;
175
+ check_argument ( total_weight > 0.0 , "weights cannot be all zero" ) ?;
176
+ check_argument ( total_weight. is_finite ( ) , " cannot have infinite weights" ) ?;
177
+ let mut mean = vec ! [ 0.0f32 ; dimensions] ;
178
+ let mut deviation = vec ! [ 0.0f32 ; dimensions] ;
179
+ let mut sum_values_sq = vec ! [ 0.0f64 ; dimensions] ;
180
+ let mut sum_values = vec ! [ 0.0f64 ; dimensions] ;
181
+ for i in 0 ..points. len ( ) {
182
+ check_argument ( points[ i] . 1 >= 0.0 , "point weights have to be non-negative" ) ?;
183
+ for j in 0 ..dimensions {
184
+ check_argument (
185
+ points[ i] . 0 [ j] . is_finite ( ) && !points[ i] . 0 [ j] . is_nan ( ) ,
186
+ " cannot have NaN or infinite values"
187
+ ) ?;
188
+ sum_values[ j] += points[ i] . 1 as f64 * points[ i] . 0 [ j] as f64 ;
189
+ sum_values_sq[ j] +=
190
+ points[ i] . 1 as f64 * points[ i] . 0 [ j] as f64 * points[ i] . 0 [ j] as f64 ;
191
+ }
192
+ }
193
+ for j in 0 ..dimensions {
194
+ mean[ j] = ( sum_values[ j] / total_weight) as f32 ;
195
+ let t: f64 = sum_values_sq[ j] / total_weight
196
+ - sum_values[ j] * sum_values[ j] / ( total_weight * total_weight) ;
197
+ deviation[ j] = f64:: sqrt ( if t > 0.0 { t } else { 0.0 } ) as f32 ;
159
198
}
199
+ let mut median = vec ! [ 0.0f32 ; dimensions] ;
200
+ let mut upper_vec = vec ! [ 0.0f32 ; dimensions] ;
201
+ let mut lower_vec = vec ! [ 0.0f32 ; dimensions] ;
202
+ let num = total_weight/2.0 ;
203
+ let lower = total_weight * lower_fraction;
204
+ let upper = total_weight * upper_fraction;
205
+ for j in 0 ..dimensions {
206
+ let mut y: Vec < ( f32 , f32 ) > = points. iter ( ) . map ( |x| ( x. 0 [ j] , x. 1 ) ) . collect ( ) ;
207
+ y. sort_by ( |a, b| a. 0 . partial_cmp ( & b. 0 ) . unwrap ( ) ) ;
208
+ let first = Self :: pick ( & y, lower, 0 , 0.0 ) ;
209
+ lower_vec[ j] = y[ first. 0 ] . 0 ;
210
+ let second = Self :: pick ( & y, num, first. 0 , first. 1 ) ;
211
+ median[ j] = y[ second. 0 ] . 0 ;
212
+ let third = Self :: pick ( & y, upper, second. 0 , second. 1 ) ;
213
+ upper_vec[ j] = y[ third. 0 ] . 0 ;
214
+ }
215
+
216
+ Ok ( SampleSummary {
217
+ summary_points : Vec :: new ( ) ,
218
+ relative_weight : Vec :: new ( ) ,
219
+ total_weight : total_weight as f32 ,
220
+ mean,
221
+ upper : upper_vec,
222
+ lower : lower_vec,
223
+ median,
224
+ deviation,
225
+ } )
160
226
}
161
227
}
162
228
@@ -165,58 +231,66 @@ pub fn summarize(
165
231
distance : fn ( & [ f32 ] , & [ f32 ] ) -> f64 ,
166
232
max_number : usize ,
167
233
parallel_enabled : bool ,
168
- ) -> SampleSummary {
169
- assert ! ( max_number < 51 , " for large number of clusters, other methods may be better, consider recursively removing clusters" ) ;
170
- let mut summary = SampleSummary :: from_points ( & points, LOWER_FRACTION , UPPER_FRACTION ) ;
234
+ ) -> Result < SampleSummary > {
235
+ let dimensions = points [ 0 ] . 0 . len ( ) ;
236
+ let mut summary = SampleSummary :: from_points ( dimensions , & points, LOWER_FRACTION , UPPER_FRACTION ) ? ;
171
237
172
238
if max_number > 0 {
173
- let dimensions = points[ 0 ] . 0 . len ( ) ;
174
239
let max_allowed = min ( dimensions * MAX_NUMBER_PER_DIMENSION , max_number) ;
175
- let total_weight: f64 = points. iter ( ) . map ( |x| x. 1 as f64 ) . sum ( ) ;
176
- let mut rng = ChaCha20Rng :: seed_from_u64 ( max_allowed as u64 ) ;
177
240
178
- let mut sampled_points: Vec < ( & [ f32 ] , f32 ) > = Vec :: new ( ) ;
179
- if points. len ( ) < 5 * LENGTH_BOUND {
180
- for j in 0 ..points. len ( ) {
181
- sampled_points. push ( ( & points[ j] . 0 , points[ j] . 1 ) ) ;
182
- }
183
- } else {
184
- let mut remainder = 0.0f64 ;
185
- for j in 0 ..points. len ( ) {
186
- if points[ j] . 1 > ( total_weight / LENGTH_BOUND as f64 ) as f32 {
187
- sampled_points. push ( ( & points[ j] . 0 , points[ j] . 1 ) ) ;
188
- } else {
189
- remainder += points[ j] . 1 as f64 ;
190
- }
191
- }
192
- for j in 0 ..points. len ( ) {
193
- if points[ j] . 1 <= ( total_weight / LENGTH_BOUND as f64 ) as f32
194
- && rng. gen :: < f64 > ( ) < 5.0 * ( LENGTH_BOUND as f64 ) / ( points. len ( ) as f64 )
195
- {
196
- let t = points[ j] . 1 as f64
197
- * ( points. len ( ) as f64 / ( 5.0 * LENGTH_BOUND as f64 ) )
198
- * ( remainder / total_weight) ;
199
- sampled_points. push ( ( & points[ j] . 0 , t as f32 ) ) ;
200
- }
201
- }
241
+ let mut list: Vec < Center > = single_centroid_cluster_weighted_vec_with_distance_over_slices (
242
+ & points,
243
+ distance,
244
+ max_allowed,
245
+ parallel_enabled,
246
+ ) ?;
247
+ list. sort_by ( |o1, o2| o2. weight ( ) . partial_cmp ( & o1. weight ( ) ) . unwrap ( ) ) ; // decreasing order
248
+ let mut summary_points: Vec < Vec < f32 > > = Vec :: new ( ) ;
249
+ let mut relative_weight: Vec < f32 > = Vec :: new ( ) ;
250
+ let center_sum: f64 = list. iter ( ) . map ( |x| x. weight ( ) ) . sum ( ) ;
251
+ for i in 0 ..list. len ( ) {
252
+ summary_points. push ( list[ i] . representative ( ) . clone ( ) ) ;
253
+ relative_weight. push ( ( list[ i] . weight ( ) / center_sum) as f32 ) ;
202
254
}
255
+ summary. add_typical ( summary_points, relative_weight) ;
256
+ }
257
+ return Ok ( summary) ;
258
+ }
203
259
204
- let mut list: Vec < Center > = iterative_clustering (
205
- max_allowed,
206
- & sampled_points,
207
- Center :: new,
260
+
261
+ pub fn multi_summarize_ref (
262
+ points : & [ ( & [ f32 ] , f32 ) ] ,
263
+ distance : fn ( & [ f32 ] , & [ f32 ] ) -> f64 ,
264
+ number_of_representatives : usize ,
265
+ shrinkage : f32 ,
266
+ max_number : usize ,
267
+ parallel_enabled : bool ,
268
+ ) -> Result < SampleSummary > {
269
+ let dimensions = points[ 0 ] . 0 . len ( ) ;
270
+ let mut summary = SampleSummary :: from_references ( dimensions, points, LOWER_FRACTION , UPPER_FRACTION ) ?;
271
+
272
+ if max_number > 0 {
273
+ let max_allowed = min ( dimensions * MAX_NUMBER_PER_DIMENSION , max_number) ;
274
+
275
+ let mut list= multi_cluster_as_weighted_ref (
276
+ & points,
208
277
distance,
278
+ number_of_representatives,
279
+ shrinkage,
280
+ false ,
281
+ max_allowed,
209
282
parallel_enabled,
210
- ) ;
283
+ ) ? ;
211
284
list. sort_by ( |o1, o2| o2. weight ( ) . partial_cmp ( & o1. weight ( ) ) . unwrap ( ) ) ; // decreasing order
212
285
let mut summary_points: Vec < Vec < f32 > > = Vec :: new ( ) ;
213
286
let mut relative_weight: Vec < f32 > = Vec :: new ( ) ;
214
287
let center_sum: f64 = list. iter ( ) . map ( |x| x. weight ( ) ) . sum ( ) ;
215
288
for i in 0 ..list. len ( ) {
216
- summary_points. push ( list[ i] . primary_representative ( distance ) . clone ( ) ) ;
289
+ summary_points. push ( Vec :: from ( list[ i] . representatives ( ) [ 0 ] . 0 ) ) ;
217
290
relative_weight. push ( ( list[ i] . weight ( ) / center_sum) as f32 ) ;
218
291
}
219
292
summary. add_typical ( summary_points, relative_weight) ;
220
293
}
221
- return summary;
294
+
295
+ return Ok ( summary) ;
222
296
}
0 commit comments