Merge branch 'utpilla/Update-OTLP-Exporter-Trace-Pipeline' of https://github.com/utpilla/opentelemetry-rust into utpilla/Update-OTLP-Exporter-Trace-Pipeline

utpilla · utpilla · commit cc4a9c065d1c · 2024-05-24T11:56:51.000-07:00
diff --git a/opentelemetry-sdk/benches/metric_counter.rs b/opentelemetry-sdk/benches/metric_counter.rs
@@ -19,7 +19,7 @@ use opentelemetry::{
 };
 use opentelemetry_sdk::metrics::{ManualReader, SdkMeterProvider};
 use rand::{
-    rngs::{self, SmallRng},
+    rngs::{self},
     Rng, SeedableRng,
 };
 use std::cell::RefCell;
@@ -107,14 +107,35 @@ fn counter_add(c: &mut Criterion) {
         });
     });
 
-    c.bench_function("Random_Generator_5", |b| {
+    // Cause overflow.
+    for v in 0..2001 {
+        counter.add(100, &[KeyValue::new("A", v.to_string())]);
+    }
+    c.bench_function("Counter_Overflow", |b| {
         b.iter(|| {
-            let mut rng = SmallRng::from_entropy();
-            let _i1 = rng.gen_range(0..4);
-            let _i2 = rng.gen_range(0..4);
-            let _i3 = rng.gen_range(0..10);
-            let _i4 = rng.gen_range(0..10);
-            let _i5 = rng.gen_range(0..10);
+            // 4*4*10*10 = 1600 time series.
+            let rands = CURRENT_RNG.with(|rng| {
+                let mut rng = rng.borrow_mut();
+                [
+                    rng.gen_range(0..4),
+                    rng.gen_range(0..4),
+                    rng.gen_range(0..10),
+                    rng.gen_range(0..10),
+                ]
+            });
+            let index_first_attribute = rands[0];
+            let index_second_attribute = rands[1];
+            let index_third_attribute = rands[2];
+            let index_forth_attribute = rands[3];
+            counter.add(
+                1,
+                &[
+                    KeyValue::new("attribute1", attribute_values[index_first_attribute]),
+                    KeyValue::new("attribute2", attribute_values[index_second_attribute]),
+                    KeyValue::new("attribute3", attribute_values[index_third_attribute]),
+                    KeyValue::new("attribute4", attribute_values[index_forth_attribute]),
+                ],
+            );
         });
     });
 
diff --git a/opentelemetry-sdk/src/metrics/internal/aggregate.rs b/opentelemetry-sdk/src/metrics/internal/aggregate.rs
@@ -24,7 +24,7 @@ pub(crate) static STREAM_OVERFLOW_ATTRIBUTE_SET: Lazy<AttributeSet> = Lazy::new(
 
 /// Checks whether aggregator has hit cardinality limit for metric streams
 pub(crate) fn is_under_cardinality_limit(size: usize) -> bool {
-    size < STREAM_CARDINALITY_LIMIT as usize - 1
+    size < STREAM_CARDINALITY_LIMIT as usize
 }
 
 /// Receives measurements to be aggregated.
diff --git a/opentelemetry-sdk/src/metrics/internal/sum.rs b/opentelemetry-sdk/src/metrics/internal/sum.rs
@@ -55,12 +55,12 @@ impl<T: Number<T>> ValueMap<T> {
                 Entry::Vacant(vacant_entry) => {
                     if is_under_cardinality_limit(size) {
                         vacant_entry.insert(measurement);
+                    } else if let Some(val) = values.get_mut(&STREAM_OVERFLOW_ATTRIBUTE_SET) {
+                        *val += measurement;
+                        return;
                     } else {
-                        values
-                            .entry(STREAM_OVERFLOW_ATTRIBUTE_SET.clone())
-                            .and_modify(|val| *val += measurement)
-                            .or_insert(measurement);
-                        global::handle_error(MetricsError::Other("Warning: Maximum data points for metric stream exceeded. Entry added to overflow.".into()));
+                        values.insert(STREAM_OVERFLOW_ATTRIBUTE_SET.clone(), measurement);
+                        global::handle_error(MetricsError::Other("Warning: Maximum data points for metric stream exceeded. Entry added to overflow. Subsequent overflows to same metric until next collect will not be logged.".into()));
                     }
                 }
             }
diff --git a/opentelemetry-sdk/src/metrics/mod.rs b/opentelemetry-sdk/src/metrics/mod.rs
@@ -161,6 +161,35 @@ mod tests {
     // "multi_thread" tokio flavor must be used else flush won't
     // be able to make progress!
 
+    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+    async fn counter_overflow_delta() {
+        // Arrange
+        let mut test_context = TestContext::new(Temporality::Delta);
+        let counter = test_context.u64_counter("test", "my_counter", None);
+
+        // Act
+        // Record measurements with A:0, A:1, ..., A:1999, which just fit within the 2000 limit
+        for v in 0..2000 {
+            counter.add(100, &[KeyValue::new("A", v.to_string())]);
+        }
+
+        // All of the below will now go into overflow.
+        counter.add(100, &[KeyValue::new("A", "foo")]);
+        counter.add(100, &[KeyValue::new("A", "another")]);
+        counter.add(100, &[KeyValue::new("A", "yet_another")]);
+        test_context.flush_metrics();
+
+        let sum = test_context.get_aggregation::<data::Sum<u64>>("my_counter", None);
+
+        // Expecting 2001 metric points. (2000 + 1 overflow)
+        assert_eq!(sum.data_points.len(), 2001);
+
+        let data_point =
+            find_datapoint_with_key_value(&sum.data_points, "otel.metric.overflow", "true")
+                .expect("overflow point expected");
+        assert_eq!(data_point.value, 300);
+    }
+
     #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
     async fn counter_aggregation_cumulative() {
         // Run this test with stdout enabled to see output.
diff --git a/stress/Cargo.toml b/stress/Cargo.toml
@@ -9,6 +9,11 @@ name = "metrics"
 path = "src/metrics.rs"
 doc = false
 
+[[bin]] # Bin to run the metrics overflow stress tests
+name = "metrics_overflow"
+path = "src/metrics_overflow.rs"
+doc = false
+
 [[bin]] # Bin to run the logs stress tests
 name = "logs"
 path = "src/logs.rs"
diff --git a/stress/src/metrics_overflow.rs b/stress/src/metrics_overflow.rs
@@ -0,0 +1,49 @@
+/*
+    Stress test results:
+    OS: Ubuntu 22.04.3 LTS (5.15.146.1-microsoft-standard-WSL2)
+    Hardware: AMD EPYC 7763 64-Core Processor - 2.44 GHz, 16vCPUs,
+    RAM: 64.0 GB
+    4.5M /sec
+*/
+
+use lazy_static::lazy_static;
+use opentelemetry::{
+    metrics::{Counter, MeterProvider as _},
+    KeyValue,
+};
+use opentelemetry_sdk::metrics::{ManualReader, SdkMeterProvider};
+use rand::{
+    rngs::{self},
+    Rng, SeedableRng,
+};
+use std::{borrow::Cow, cell::RefCell};
+
+mod throughput;
+
+lazy_static! {
+    static ref PROVIDER: SdkMeterProvider = SdkMeterProvider::builder()
+        .with_reader(ManualReader::builder().build())
+        .build();
+    static ref COUNTER: Counter<u64> = PROVIDER
+        .meter(<&str as Into<Cow<'static, str>>>::into("test"))
+        .u64_counter("hello")
+        .init();
+}
+
+thread_local! {
+    /// Store random number generator for each thread
+    static CURRENT_RNG: RefCell<rngs::SmallRng> = RefCell::new(rngs::SmallRng::from_entropy());
+}
+
+fn main() {
+    throughput::test_throughput(test_counter);
+}
+
+fn test_counter() {
+    // The main goal of this test is to ensure that the OTel SDK does not grow its
+    // memory usage indefinitely even when user code misbehaves by producing
+    // unbounded metric points (unique time series).
+    // It also checks that the SDK's internal logging is done in a bounded way.
+    let rand = CURRENT_RNG.with(|rng| rng.borrow_mut().gen_range(0..100000000));
+    COUNTER.add(1, &[KeyValue::new("A", rand)]);
+}

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ pub(crate) static STREAM_OVERFLOW_ATTRIBUTE_SET: Lazy<AttributeSet> = Lazy::new(`
`24`	`24`
`25`	`25`	`/// Checks whether aggregator has hit cardinality limit for metric streams`
`26`	`26`	`pub(crate) fn is_under_cardinality_limit(size: usize) -> bool {`
`27`		`- size < STREAM_CARDINALITY_LIMIT as usize - 1`
	`27`	`+ size < STREAM_CARDINALITY_LIMIT as usize`
`28`	`28`	`}`
`29`	`29`
`30`	`30`	`/// Receives measurements to be aggregated.`
Original file line number	Diff line number	Diff line change
`@@ -55,12 +55,12 @@ impl<T: Number<T>> ValueMap<T> {`
`55`	`55`	`Entry::Vacant(vacant_entry) => {`
`56`	`56`	`if is_under_cardinality_limit(size) {`
`57`	`57`	`vacant_entry.insert(measurement);`
	`58`	`+ } else if let Some(val) = values.get_mut(&STREAM_OVERFLOW_ATTRIBUTE_SET) {`
	`59`	`+ *val += measurement;`
	`60`	`+ return;`
`58`	`61`	`} else {`
`59`		`- values`
`60`		`- .entry(STREAM_OVERFLOW_ATTRIBUTE_SET.clone())`
`61`		`- .and_modify(\|val\| *val += measurement)`
`62`		`- .or_insert(measurement);`
`63`		`- global::handle_error(MetricsError::Other("Warning: Maximum data points for metric stream exceeded. Entry added to overflow.".into()));`
	`62`	`+ values.insert(STREAM_OVERFLOW_ATTRIBUTE_SET.clone(), measurement);`
	`63`	`+ global::handle_error(MetricsError::Other("Warning: Maximum data points for metric stream exceeded. Entry added to overflow. Subsequent overflows to same metric until next collect will not be logged.".into()));`
`64`	`64`	`}`
`65`	`65`	`}`
`66`	`66`	`}`