@@ -105,8 +105,8 @@ public static <R> void assignAndRecompute(List<Weighted<Integer>> sampledPoints,
105
105
double minDist = Double .MAX_VALUE ;
106
106
int minDistNbr = -1 ;
107
107
for (int i = 0 ; i < clusters .size (); i ++) {
108
+ // will check for negative distances
108
109
dist [i ] = clusters .get (i ).distance (getPoint .apply (point .index ), distance );
109
- checkArgument (dist [i ] >= 0 , "distance cannot be negative" );
110
110
if (minDist > dist [i ]) {
111
111
minDist = dist [i ];
112
112
minDistNbr = i ;
@@ -201,10 +201,18 @@ public static <R> List<ICluster<R>> iterativeClustering(int maxAllowed, int init
201
201
boolean phase2GlobalReassign , double overlapParameter , List <ICluster <R >> previousClustering ) {
202
202
203
203
checkArgument (refs .size () > 0 , "empty list, nothing to do" );
204
- checkArgument (maxAllowed >= stopAt && stopAt > 0 , "incorrect bounds on number of clusters" );
204
+ checkArgument (stopAt > 0 , "has to stop at 1 cluster" );
205
+ checkArgument (stopAt <= maxAllowed , "cannot stop before achieving the limit" );
205
206
206
207
Random rng = new Random (seed );
207
- double sampledSum = refs .stream ().map (e -> (double ) e .weight ).reduce (Double ::sum ).get ();
208
+ double sampledSum = refs .stream ().map (e -> {
209
+ checkArgument (!Double .isNaN (e .weight ), " weights have to be non-NaN" );
210
+ checkArgument (Double .isFinite (e .weight ), " weights have to be finite" );
211
+ checkArgument (e .weight >= 0.0 , () -> "negative weights are not meaningful" + e .weight );
212
+ return (double ) e .weight ;
213
+ }).reduce (0.0 , Double ::sum );
214
+ checkArgument (sampledSum > 0 , " total weight has to be positive" );
215
+
208
216
ArrayList <ICluster <R >> centers = new ArrayList <>();
209
217
if (refs .size () < 10 * (initial + 5 )) {
210
218
for (Weighted <Integer > point : refs ) {
@@ -294,6 +302,8 @@ public static <R> List<ICluster<R>> iterativeClustering(int maxAllowed, int init
294
302
}
295
303
centers .sort (Comparator .comparingDouble (ICluster ::getWeight ));
296
304
while (centers .get (0 ).getWeight () == 0.0 ) {
305
+ // this line is reachable via zeroTest() in
306
+ // SampleSummaryTest
297
307
centers .remove (0 );
298
308
}
299
309
if (inital < 1.2 * maxAllowed + 1 ) {
@@ -345,14 +355,14 @@ public static <R> List<ICluster<R>> summarize(List<Weighted<R>> points, int maxA
345
355
List <ICluster <R >> previousClustering ) {
346
356
checkArgument (maxAllowed < 100 , "are you sure you want more elements in the summary?" );
347
357
checkArgument (maxAllowed <= initial , "initial parameter should be at least maximum allowed in final result" );
348
- checkArgument (stopAt > 0 && stopAt <= maxAllowed , "lower bound set incorrectly" );
349
358
350
359
double totalWeight = points .stream ().map (e -> {
351
- checkArgument (e .weight >= 0.0 , "negative weights are not meaningful" );
360
+ checkArgument (!Double .isNaN (e .weight ), " weights have to be non-NaN" );
361
+ checkArgument (Double .isFinite (e .weight ), " weights have to be finite" );
362
+ checkArgument (e .weight >= 0.0 , () -> "negative weights are not meaningful" + e .weight );
352
363
return (double ) e .weight ;
353
364
}).reduce (0.0 , Double ::sum );
354
- checkArgument (!Double .isNaN (totalWeight ) && Double .isFinite (totalWeight ),
355
- " weights have to finite and non-NaN" );
365
+ checkArgument (totalWeight > 0 , " total weight has to be positive" );
356
366
Random rng = new Random (seed );
357
367
// the following list is explicity copied and sorted for potential efficiency
358
368
List <Weighted <R >> sampledPoints = createSample (points , rng .nextLong (), 5 * LENGTH_BOUND , 0.005 , 1.0 );
@@ -363,8 +373,6 @@ public static <R> List<ICluster<R>> summarize(List<Weighted<R>> points, int maxA
363
373
}
364
374
365
375
Function <Integer , R > getPoint = (i ) -> sampledPoints .get (i ).index ;
366
- checkArgument (sampledPoints .size () > 0 , "empty list, nothing to do" );
367
- double sampledSum = sampledPoints .stream ().map (e -> (double ) e .weight ).reduce (Double ::sum ).get ();
368
376
369
377
return iterativeClustering (maxAllowed , initial , stopAt , refs , getPoint , distance , clusterInitializer ,
370
378
rng .nextLong (), parallelEnabled , phase2GlobalReassign , overlapParameter , previousClustering );
@@ -403,11 +411,13 @@ public static SampleSummary summarize(List<Weighted<float[]>> points, int maxAll
403
411
checkArgument (maxAllowed <= initial , "initial parameter should be at least maximum allowed in final result" );
404
412
405
413
double totalWeight = points .stream ().map (e -> {
406
- checkArgument (e .weight >= 0.0 , "negative weights are not meaningful" );
414
+ checkArgument (!Double .isNaN (e .weight ), " weights have to be non-NaN" );
415
+ checkArgument (Double .isFinite (e .weight ), " weights have to be finite" );
416
+ checkArgument (e .weight >= 0.0 , () -> "negative weights are not meaningful" + e .weight );
407
417
return (double ) e .weight ;
408
418
}).reduce (0.0 , Double ::sum );
409
- checkArgument (! Double . isNaN ( totalWeight ) && Double . isFinite ( totalWeight ),
410
- " weights have to finite and non-NaN" );
419
+ checkArgument (totalWeight > 0 , " total weight has to be positive" );
420
+
411
421
Random rng = new Random (seed );
412
422
// the following list is explicity copied and sorted for potential efficiency
413
423
List <Weighted <float []>> sampledPoints = createSample (points , rng .nextLong (), 5 * LENGTH_BOUND , 0.005 , 1.0 );
@@ -458,24 +468,24 @@ public static SampleSummary summarize(float[][] points, int maxAllowed, int init
458
468
* @param maxAllowed maximum number of groups/clusters
459
469
* @param initial a parameter controlling the initialization
460
470
* @param reassignPerStep if reassignment is to be performed each step
471
+ * @param seed random seed
461
472
* @return a summarization
462
473
*/
463
- public static SampleSummary summarize (List <Weighted <float []>> points , int maxAllowed , int initial ,
464
- boolean reassignPerStep ) {
465
- return summarize (points , maxAllowed , initial , reassignPerStep , Summarizer ::L2distance , new Random ().nextLong (),
466
- false );
474
+ public static SampleSummary l2summarize (List <Weighted <float []>> points , int maxAllowed , int initial ,
475
+ boolean reassignPerStep , long seed ) {
476
+ return summarize (points , maxAllowed , initial , reassignPerStep , Summarizer ::L2distance , seed , false );
467
477
}
468
478
469
479
/**
470
480
* Same as above, with the most common use cases filled in
471
481
*
472
482
* @param points points in float[][], each of weight 1.0
473
483
* @param maxAllowed maximum number of clusters one is interested in
484
+ * @param seed random seed
474
485
* @return a summarization
475
486
*/
476
- public static SampleSummary summarize (float [][] points , int maxAllowed ) {
477
- return summarize (points , maxAllowed , 4 * maxAllowed , false , Summarizer ::L2distance , new Random ().nextLong (),
478
- false );
487
+ public static SampleSummary l2summarize (float [][] points , int maxAllowed , long seed ) {
488
+ return summarize (points , maxAllowed , 4 * maxAllowed , false , Summarizer ::L2distance , seed , false );
479
489
}
480
490
481
491
/**
@@ -529,9 +539,9 @@ public static <R> List<ICluster<R>> multiSummarize(R[] points, int maxAllowed, i
529
539
clusterInitializer , seed , parallelEnabled , null );
530
540
}
531
541
532
- // same as above, with defaults
542
+ // same as above, with multicenter instead of generic
533
543
public static List <ICluster <float []>> multiSummarize (float [][] points , int maxAllowed , double shrinkage ,
534
- int numberOfRepresentatives ) {
544
+ int numberOfRepresentatives , long seed ) {
535
545
536
546
ArrayList <Weighted <float []>> weighted = new ArrayList <>();
537
547
for (float [] point : points ) {
@@ -540,7 +550,7 @@ public static List<ICluster<float[]>> multiSummarize(float[][] points, int maxAl
540
550
BiFunction <float [], Float , ICluster <float []>> clusterInitializer = (a , b ) -> MultiCenter .initialize (a , b ,
541
551
shrinkage , numberOfRepresentatives );
542
552
return summarize (weighted , maxAllowed , 4 * maxAllowed , 1 , true , DEFAULT_SEPARATION_RATIO_FOR_MERGE ,
543
- Summarizer ::L2distance , clusterInitializer , new Random (). nextLong () , true , null );
553
+ Summarizer ::L2distance , clusterInitializer , seed , true , null );
544
554
}
545
555
546
556
}
0 commit comments