aws
diff --git a/‎Java/README.md
+49-28 b/‎Java/README.md
+49-28
diff --git a/‎Java/core/src/main/java/com/amazon/randomcutforest/RandomCutForest.java
+6 b/‎Java/core/src/main/java/com/amazon/randomcutforest/RandomCutForest.java
+6
diff --git a/‎Java/core/src/main/java/com/amazon/randomcutforest/anomalydetection/AbstractAttributionVisitor.java
+6-5 b/‎Java/core/src/main/java/com/amazon/randomcutforest/anomalydetection/AbstractAttributionVisitor.java
+6-5
diff --git a/‎Java/core/src/main/java/com/amazon/randomcutforest/anomalydetection/DynamicAttributionVisitor.java
-17 b/‎Java/core/src/main/java/com/amazon/randomcutforest/anomalydetection/DynamicAttributionVisitor.java
-17
diff --git a/‎Java/core/src/main/java/com/amazon/randomcutforest/config/CorrectionMode.java
+64 b/‎Java/core/src/main/java/com/amazon/randomcutforest/config/CorrectionMode.java
+64
diff --git a/‎Java/core/src/main/java/com/amazon/randomcutforest/imputation/ConditionalSampleSummarizer.java
+5-7 b/‎Java/core/src/main/java/com/amazon/randomcutforest/imputation/ConditionalSampleSummarizer.java
+5-7
@@ -2,8 +2,10 @@
 
 This directory contains a Java implementation of the Random Cut Forest data structure and algorithms
 for anomaly detection, density estimation, imputation, and forecast. The goal of this library 
-is to be easy to use and to strike a balance between efficiency and extensibility. Please see randomcutforest-examples 
-for a few detailed examples and extensions.
+is to be easy to use and to strike a balance between efficiency and extensibility. Please do not forget 
+to look into the ParkServices package, which provides many augmented functionalities such as explicit determination 
+of anomaly grade based on a first-hand understanding of the core algorithm. Please also see randomcutforest-examples 
+for a few detailed examples and extensions. Please do not hesitate to create an issue for any discussion item.
 
 ## Basic operations
 
@@ -13,20 +15,26 @@ To create a RandomCutForest instance with all parameters set to defaults:
 int dimensions = 5; // The number of dimensions in the input data, required
 RandomCutForest forest = RandomCutForest.defaultForest(dimensions);
 ```
-
+We recommend using a shingle size, which corresponds to contextual analysis of the data; 
+RCF uses ideas not dissimilar from higher-order Markov chains to improve its 
+accuracy. An option is provided to have the shingles constructed internally. 
 To explicitly set optional parameters like number of trees in the forest or 
-sample size, RandomCutForest provides a builder:
+sample size, RandomCutForest provides a builder (for example with 4 input dimensions for 
+a 4-way multivariate analysis):
 
 ```java
 RandomCutForest forest = RandomCutForest.builder()
-    .numberOfTrees(90)
-    .sampleSize(200)
-    .dimensions(2) // still required!
-    .lambda(0.2)
-    .randomSeed(123)
-    .storeSequenceIndexesEnabled(true)
-    .centerOfMassEnabled(true)
-    .build();
+        .numberOfTrees(90)
+        .sampleSize(200) // use this to cover the phenomenon of interest
+                         // for analysis of 5 minute aggregations, a week has
+                         // about 12 * 24 * 7 starting points of interest
+                         // larger sample sizes will be larger models 
+        .dimensions(inputDimension * 7) // still required! input dimensions * shingleSize
+        .timeDecay(0.2) // determines half life of data
+        .randomSeed(123)
+        .internalShinglingEnabled(true)
+        .shingleSize(7)
+        .build();
 ```
 
 Typical usage of a forest is to compute a statistic on an input data point and then update the forest with that point 
@@ -53,27 +61,40 @@ while (true) {
 
 The following parameters can be configured in the RandomCutForest builder. 
 
-| Parameter Name | Type | Description | Default Value|
-| --- | --- | --- | --- |
-| centerOfMassEnabled | boolean | If true, then tree nodes in the forest will compute their center of mass as part of tree update operations. | false |
-| dimensions | int | The number of dimensions in the input data. | Required, no default value |
-| lambda | double | The decay factor used by stream samplers in this forest. See the next section for guidance. | 1 / (10 * sampleSize) |
-| numberOfTrees | int | The number of trees in this forest. | 50 |
-| outputAfter | int | The number of points required by stream samplers before results are returned. | 0.25 * sampleSize |
-| parallelExecutionEnabled | boolean | If true, then the forest will create an internal threadpool. Forest updates and traversals will be submitted to this threadpool, and individual trees will be updated or traversed in parallel. For larger shingle sizes, dimensions, and number of trees, parallelization may improve throughput. We recommend users benchmark against their target use case. | false |
-| randomSeed | long | A seed value used to initialize the random number generators in this forest. | |
-| sampleSize | int | The sample size used by stream samplers in this forest | 256 |
-| storeSequenceIndexesEnabled | boolean | If true, then sequence indexes (ordinals indicating when a point was added to a tree) will be stored in the forest along with poitn values. | false |
-| threadPoolSize | int | The number of threads to use in the internal threadpool. | Number of available processors - 1 |
-
-## Choosing a `lambda` value for your application
+| Parameter Name              | Type    | Description                                                                                                                                                                                                                                                                                                                                                    | Default Value                                                                         |
+|-----------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------|
+| dimensions                  | int     | The number of dimensions in the input data.                                                                                                                                                                                                                                                                                                                    | Required, no default value. Should be the product of input dimensions and shingleSize |
+| shingleSize                 | int     | The number of contiguous observations across all the input variables that would be used for analysis                                                                                                                                                                                                                                                           | Strongly recommended for contextual anomalies. Required for Forecast/Extrapolate      |
+| lambda                      | double  | The decay factor used by stream samplers in this forest. See the next section for guidance.                                                                                                                                                                                                                                                                    | 1 / (10 * sampleSize)                                                                 |
+| numberOfTrees               | int     | The number of trees in this forest.                                                                                                                                                                                                                                                                                                                            | 50                                                                                    |
+| outputAfter                 | int     | The number of points required by stream samplers before results are returned.                                                                                                                                                                                                                                                                                  | 0.25 * sampleSize                                                                     |
+| internalShinglingEnabled    | boolean | Whether the shingling is performed by RCF itself since it has already seen previous values.                                                                                                                                                                                                                                                                    | false (for historical reasons). Recommended : true, will result in smaller models.    |
+| parallelExecutionEnabled    | boolean | If true, then the forest will create an internal threadpool. Forest updates and traversals will be submitted to this threadpool, and individual trees will be updated or traversed in parallel. For larger shingle sizes, dimensions, and number of trees, parallelization may improve throughput. We recommend users benchmark against their target use case. | false                                                                                 |
+| randomSeed                  | long    | A seed value used to initialize the random number generators in this forest.                                                                                                                                                                                                                                                                                   |                                                                                       |
+| sampleSize                  | int     | The sample size used by stream samplers in this forest                                                                                                                                                                                                                                                                                                         | 256                                                                                   |
+| centerOfMassEnabled         | boolean | If true, then tree nodes in the forest will compute their center of mass as part of tree update operations.                                                                                                                                                                                                                                                    | false                                                                                 |
+| storeSequenceIndexesEnabled | boolean | If true, then sequence indexes (ordinals indicating when a point was added to a tree) will be stored in the forest along with poitn values.                                                                                                                                                                                                                    | false                                                                                 |
+| threadPoolSize              | int     | The number of threads to use in the internal threadpool.                                                                                                                                                                                                                                                                                                       | Number of available processors - 1                                                    |
+
+The above parameters are the most common and historical. Please use the issues to request additions/discussions of other parameters of interest.
+
+RandomCutForest primarily provides an estimation (say anomaly score, or extrapolation over a forecast horizon) and using that raw estimation can be challenging. The ParkServices package provides 
+several capabilities (ThresholdedRandomCutForest, RCFCaster, respectively) for distilling the scores to a determination of 
+anomaly/otherwise (an assessment of grade) or calibrated conformal forecasts. These have natural parameter choices that are different 
+from the core RandomCutForest -- for example internalShinglingEnabled defaults to true since that is more natural in those contexts.
+The examples package provides a collection of examples and uses of parameters; we draw attention to ThresholdedMultiDimensionalExample 
+and RCFCasterExample. If one is interested in sequential analysis of a series of consecutive inputs, check out SequentialAnomalyExample. 
+ParkServices also exposes many other functionalities of RCF that were previously hidden, such as clustering (including multi-centroid representations) 
+-- see NumericGLADExample for instance. 
+
+## Choosing a `timeDecay` value for your application
 
 When we submit a point to the sampler, it is included into the sample with some probability, and 
 it will remain in the sample for some number of steps before being replaced. Call the number of steps that
 a point is included in the sample the "lifetime" of the point (which may be 0). Over a finite time
 window, the distribution of the lifetime of a point is approximately exponential with parameter
-`lambda`. Thus, `1 / lambda` is approximately the average number of steps that a point will be included
-in the sample. By default, we set `lambda` equal to `1 / (10 * sampleSize)`.
+`lambda` (the `timeDecay` setting). Thus, `1 / timeDecay` is approximately the average number of steps that a point will be included
+in the sample. By default, we set `timeDecay` equal to `1 / (10 * sampleSize)`.
 
 Alternatively, if you want the probability that a point survives longer than n steps to be 0.05,
 you can solve for `lambda` in the equation `exp(-lambda * n) = 0.05`.
 
@@ -1228,6 +1228,7 @@ public float[] extrapolateFromCurrentTime(int horizon) {
      *                          considered a neighbor.
      * @return a list of Neighbors, ordered from closest to furthest.
      */
+    @Deprecated
     public List<Neighbor> getNearNeighborsInSample(double[] point, double distanceThreshold) {
         return getNearNeighborsInSample(toFloatArray(point), distanceThreshold);
     }
@@ -1258,7 +1259,12 @@ public List<Neighbor> getNearNeighborsInSample(float[] point, double distanceThr
      * @param point A point whose neighbors we want to find.
      * @return a list of Neighbors, ordered from closest to furthest.
      */
+    @Deprecated
     public List<Neighbor> getNearNeighborsInSample(double[] point) {
+        return getNearNeighborsInSample(toFloatArray(point));
+    }
+
+    public List<Neighbor> getNearNeighborsInSample(float[] point) {
         return getNearNeighborsInSample(point, Double.POSITIVE_INFINITY);
     }
 
 
@@ -156,7 +156,8 @@ public void accept(INodeView node, int depthOfNode) {
             }
         }
 
-        if ((hitDuplicates || ignoreLeaf) && (pointInsideBox || depthOfNode == 0)) {
+        boolean capture = (pointInsideBox || depthOfNode == 0);
+        if ((hitDuplicates || ignoreLeaf) && capture) {
             // final rescaling; this ensures agreement with the ScalarScoreVector
             // the scoreUnseen/scoreSeen should be the same as scoring; other uses need
             // caution.
@@ -170,7 +171,9 @@ public void acceptLeaf(INodeView leafNode, int depthOfNode) {
 
         updateRangesForScoring(leafNode.getBoundingBox(), leafNode.getBoundingBox().getMergedBox(pointToScore));
 
-        if (Arrays.equals(leafNode.getLeafPoint(), pointToScore)) {
+        // sumOfNewRange == 0 corresponds to equality of points and is faster than
+        // Arrays.equals
+        if (sumOfNewRange <= 0) {
             hitDuplicates = true;
         }
 
@@ -180,9 +183,7 @@ public void acceptLeaf(INodeView leafNode, int depthOfNode) {
             savedScore = scoreUnseen(depthOfNode, leafNode.getMass());
         }
 
-        if ((hitDuplicates) || ((ignoreLeaf) && (leafNode.getMass() <= ignoreLeafMassThreshold))
-                || sumOfNewRange <= 0) {
-
+        if ((hitDuplicates) || ((ignoreLeaf) && (leafNode.getMass() <= ignoreLeafMassThreshold))) {
             Arrays.fill(directionalAttribution.high, savedScore / (2 * pointToScore.length));
             Arrays.fill(directionalAttribution.low, savedScore / (2 * pointToScore.length));
             /* in this case do not have a better option than an equal attribution */
 
@@ -66,23 +66,6 @@ public DynamicAttributionVisitor(float[] point, int treeMass, int ignoreLeafMass
         this.damp = damp;
     }
 
-    /**
-     * Same as above with a default non-dampening
-     * 
-     * @param point                   to be scored
-     * @param treeMass                mass of the tree
-     * @param ignoreLeafMassThreshold mass of the leaves to be ignored
-     * @param scoreSeen               score when point has been seen
-     * @param scoreUnseen             score when point has not been seen
-     */
-    public DynamicAttributionVisitor(float[] point, int treeMass, int ignoreLeafMassThreshold,
-            BiFunction<Double, Double, Double> scoreSeen, BiFunction<Double, Double, Double> scoreUnseen) {
-        super(point, treeMass, ignoreLeafMassThreshold);
-        this.scoreSeen = scoreSeen;
-        this.scoreUnseen = scoreUnseen;
-        this.damp = (x, y) -> 1.0;
-    }
-
     @Override
     protected double scoreSeen(int depth, int leafMass) {
         return scoreSeen.apply((double) depth, (double) leafMass);
 
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License").
+ * You may not use this file except in compliance with the License.
+ * A copy of the License is located at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * or in the "license" file accompanying this file. This file is distributed
+ * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package com.amazon.randomcutforest.config;
+
+/**
+ * Options for using RCF, especially with thresholds
+ */
+public enum CorrectionMode {
+
+    /**
+     * default behavior, no correction
+     */
+    NONE,
+
+    /**
+     * due to transforms, or due to input noise
+     */
+    NOISE,
+
+    /**
+     * elimination due to multi mode operation
+     */
+
+    MULTI_MODE,
+
+    /**
+     * effect of an anomaly in shingle
+     */
+
+    ANOMALY_IN_SHINGLE,
+
+    /**
+     * conditional forecast, using conditional fields
+     */
+
+    CONDITIONAL_FORECAST,
+
+    /**
+     * forecasted value was not very different
+     */
+
+    FORECAST,
+
+    /**
+     * data drifts and level shifts, will not be corrected unless level shifts are
+     * turned on
+     */
+
+    DATA_DRIFT
+
+}
@@ -137,9 +137,12 @@ public SampleSummary summarize(List<ConditionalTreeSample> alist, boolean addTyp
         double currentWeight = 0;
         int alwaysInclude = 0;
         double remainderWeight = totalWeight;
-        while (alwaysInclude < newList.size() && newList.get(alwaysInclude).distance == 0) {
+        while (newList.get(alwaysInclude).distance == 0) {
             remainderWeight -= newList.get(alwaysInclude).weight;
             ++alwaysInclude;
+            if (alwaysInclude == newList.size()) {
+                break;
+            }
         }
         for (int j = 1; j < newList.size(); j++) {
             if ((currentWeight < remainderWeight / 3 && currentWeight + newList.get(j).weight >= remainderWeight / 3)
@@ -161,7 +164,6 @@ public SampleSummary summarize(List<ConditionalTreeSample> alist, boolean addTyp
         ArrayList<Weighted<float[]>> typicalPoints = new ArrayList<>();
         for (int j = 0; j < num; j++) {
             ConditionalTreeSample e = newList.get(j);
-
             float[] values;
             if (project) {
                 values = new float[missingDimensions.length];
@@ -171,11 +173,7 @@ public SampleSummary summarize(List<ConditionalTreeSample> alist, boolean addTyp
             } else {
                 values = Arrays.copyOf(e.leafPoint, dimensions);
             }
-            // weight is changed for clustering,
-            // based on the distance of the sample from the query point
-            double weight = (e.distance <= threshold) ? e.weight : e.weight * threshold / e.distance;
-            typicalPoints.add(new Weighted<>(values, (float) weight));
-
+            typicalPoints.add(new Weighted<>(values, (float) e.weight));
         }
         int maxAllowed = min(queryPoint.length * MAX_NUMBER_OF_TYPICAL_PER_DIMENSION, MAX_NUMBER_OF_TYPICAL_ELEMENTS);
         maxAllowed = min(maxAllowed, num);