Skip to content

Commit e0ca417

Browse files
authored
[oximeter] Write to both single node and cluster (#7565)
As part of phase one of rolling out the replicated ClickHouse cluster, oximeter will write to both the single-node installation and the replicated cluster whenever the cluster exists. In our dogfood rack we'll enable the replicated cluster to perform long-running tests. Closes: #7419
1 parent f8aacda commit e0ca417

File tree

9 files changed

+168
-24
lines changed

9 files changed

+168
-24
lines changed

internal-dns/types/src/config.rs

+5
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,11 @@ impl DnsConfigBuilder {
478478
);
479479
let zone = self.host_zone(zone_id, *http_address.ip())?;
480480
self.service_backend_zone(http_service, &zone, http_address.port())?;
481+
self.service_backend_zone(
482+
ServiceName::ClickhouseClusterNative,
483+
&zone,
484+
CLICKHOUSE_TCP_PORT,
485+
)?;
481486
self.service_backend_zone(
482487
ServiceName::ClickhouseAdminServer,
483488
&zone,

internal-dns/types/src/names.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@ pub enum ServiceName {
3333
ClickhouseAdminSingleServer,
3434
/// The native TCP interface to a ClickHouse server.
3535
///
36-
/// NOTE: This is used for either single-node or a replicated cluster.
36+
/// NOTE: This is used for a single-node ClickHouse installation.
3737
ClickhouseNative,
38+
/// The native TCP interface to a ClickHouse server.
39+
///
40+
/// NOTE: This is used for a replicated cluster ClickHouse installation.
41+
ClickhouseClusterNative,
3842
/// The TCP interface to a ClickHouse Keeper server.
3943
ClickhouseKeeper,
4044
/// The HTTP interface to a replicated ClickHouse server.
@@ -67,6 +71,7 @@ impl ServiceName {
6771
"clickhouse-admin-single-server"
6872
}
6973
ServiceName::ClickhouseNative => "clickhouse-native",
74+
ServiceName::ClickhouseClusterNative => "clickhouse-cluster-native",
7075
ServiceName::ClickhouseKeeper => "clickhouse-keeper",
7176
ServiceName::ClickhouseServer => "clickhouse-server",
7277
ServiceName::Cockroach => "cockroach",
@@ -97,6 +102,7 @@ impl ServiceName {
97102
| ServiceName::ClickhouseAdminServer
98103
| ServiceName::ClickhouseAdminSingleServer
99104
| ServiceName::ClickhouseNative
105+
| ServiceName::ClickhouseClusterNative
100106
| ServiceName::ClickhouseKeeper
101107
| ServiceName::ClickhouseServer
102108
| ServiceName::Cockroach

internal-dns/types/tests/output/internal-dns-zone.txt

+11
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,17 @@ builder: "non_trivial"
113113
}
114114
}
115115
],
116+
"_clickhouse-cluster-native._tcp": [
117+
{
118+
"type": "SRV",
119+
"data": {
120+
"prio": 0,
121+
"weight": 0,
122+
"port": 9000,
123+
"target": "001de000-c04e-4000-8000-000000000006.host.control-plane.oxide.internal"
124+
}
125+
}
126+
],
116127
"_clickhouse-native._tcp": [
117128
{
118129
"type": "SRV",

oximeter/collector/src/agent.rs

+121-12
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use oximeter_api::ProducerDetails;
2626
use oximeter_db::Client;
2727
use oximeter_db::DbWrite;
2828
use qorb::claim::Handle;
29+
use qorb::policy::Policy;
2930
use qorb::pool::Pool;
3031
use qorb::resolver::BoxedResolver;
3132
use slog::Logger;
@@ -56,11 +57,12 @@ pub struct OximeterAgent {
5657
log: Logger,
5758
// Oximeter target used by this agent to produce metrics about itself.
5859
collection_target: self_stats::OximeterCollector,
59-
// Handle to the TX-side of a channel for collecting results from the collection tasks
60-
result_sender: mpsc::Sender<CollectionTaskOutput>,
60+
// Wrapper of the two handles to the TX-side of the single-node and cluster
61+
// channels for collecting results from the collection tasks.
62+
result_sender: CollectionTaskSenderWrapper,
6163
// Handle to each Tokio task collecting from a single producer.
6264
collection_tasks: Arc<Mutex<BTreeMap<Uuid, CollectionTaskHandle>>>,
63-
// The interval on which we refresh our list of producers from Nexus
65+
// The interval on which we refresh our list of producers from Nexus.
6466
refresh_interval: Duration,
6567
// Handle to the task used to periodically refresh the list of producers.
6668
refresh_task: Arc<StdMutex<Option<tokio::task::JoinHandle<()>>>>,
@@ -70,22 +72,31 @@ pub struct OximeterAgent {
7072

7173
impl OximeterAgent {
7274
/// Construct a new agent with the given ID and logger.
75+
// TODO: Remove this linter exception once we only write to a
76+
// single database.
77+
#[allow(clippy::too_many_arguments)]
7378
pub async fn with_id(
7479
id: Uuid,
7580
address: SocketAddrV6,
7681
refresh_interval: Duration,
7782
db_config: DbConfig,
7883
native_resolver: BoxedResolver,
84+
// Temporary resolver to write to a replicated ClickHouse
85+
// cluster as well as a single-node installation.
86+
cluster_resolver: BoxedResolver,
7987
log: &Logger,
8088
replicated: bool,
8189
) -> Result<Self, Error> {
82-
let (result_sender, result_receiver) = mpsc::channel(8);
90+
let collection_task_wrapper = CollectionTaskWrapper::new();
91+
8392
let log = log.new(o!(
8493
"component" => "oximeter-agent",
8594
"collector_id" => id.to_string(),
8695
"collector_ip" => address.ip().to_string(),
8796
));
8897
let insertion_log = log.new(o!("component" => "results-sink"));
98+
let instertion_log_cluster =
99+
log.new(o!("component" => "results-sink-cluster"));
89100

90101
// Determine the version of the database.
91102
//
@@ -126,14 +137,54 @@ impl OximeterAgent {
126137
collector_port: address.port(),
127138
};
128139

129-
// Spawn the task for aggregating and inserting all metrics
140+
// Spawn the task for aggregating and inserting all metrics to a
141+
// single node ClickHouse installation.
130142
tokio::spawn(async move {
131143
crate::results_sink::database_inserter(
132144
insertion_log,
133145
client,
134146
db_config.batch_size,
135147
Duration::from_secs(db_config.batch_interval),
136-
result_receiver,
148+
collection_task_wrapper.single_rx,
149+
)
150+
.await
151+
});
152+
153+
// Our internal testing rack will be running a ClickHouse cluster
154+
// alongside a single-node installation for a while. We want to handle
155+
// the case of these two installations running alongside each other, and
156+
// oximeter writing to both of them. On our production racks ClickHouse
157+
// will only run in single-node mode, so we'll ignore all cases where
158+
// the `ClickhouseClusterNative` service is not available.
159+
// This will be done by spawning a second task for DB inserts to a replicated
160+
// ClickHouse cluster. If oximeter cannot connect to the database, it will
161+
// simply log a warning and move on.
162+
163+
// Temporary additional client that writes to a replicated cluster
164+
// This will be removed once we phase out the single node installation.
165+
//
166+
// We don't need to check whether the DB is at the expected version since
167+
// this is already handled by reconfigurator via clickhouse-admin.
168+
//
169+
// We have a short claim timeout so oximeter can move on quickly if the cluster
170+
// does not exist.
171+
let claim_policy = Policy {
172+
claim_timeout: Duration::from_millis(100),
173+
..Default::default()
174+
};
175+
176+
let cluster_client =
177+
Client::new_with_pool_policy(cluster_resolver, claim_policy, &log);
178+
179+
// Spawn the task for aggregating and inserting all metrics to a
180+
// replicated cluster ClickHouse installation.
181+
tokio::spawn(async move {
182+
results_sink::database_inserter(
183+
instertion_log_cluster,
184+
cluster_client,
185+
db_config.batch_size,
186+
Duration::from_secs(db_config.batch_interval),
187+
collection_task_wrapper.cluster_rx,
137188
)
138189
.await
139190
});
@@ -142,7 +193,7 @@ impl OximeterAgent {
142193
id,
143194
log,
144195
collection_target,
145-
result_sender,
196+
result_sender: collection_task_wrapper.wrapper_tx,
146197
collection_tasks: Arc::new(Mutex::new(BTreeMap::new())),
147198
refresh_interval,
148199
refresh_task: Arc::new(StdMutex::new(None)),
@@ -183,13 +234,14 @@ impl OximeterAgent {
183234
db_config: Option<DbConfig>,
184235
log: &Logger,
185236
) -> Result<Self, Error> {
186-
let (result_sender, result_receiver) = mpsc::channel(8);
187237
let log = log.new(o!(
188238
"component" => "oximeter-standalone",
189239
"collector_id" => id.to_string(),
190240
"collector_ip" => address.ip().to_string(),
191241
));
192242

243+
let collection_task_wrapper = CollectionTaskWrapper::new();
244+
193245
// If we have configuration for ClickHouse, we'll spawn the results
194246
// sink task as usual. If not, we'll spawn a dummy task that simply
195247
// prints the results as they're received.
@@ -218,12 +270,15 @@ impl OximeterAgent {
218270
client,
219271
db_config.batch_size,
220272
Duration::from_secs(db_config.batch_interval),
221-
result_receiver,
273+
collection_task_wrapper.single_rx,
222274
)
223275
.await
224276
});
225277
} else {
226-
tokio::spawn(results_sink::logger(insertion_log, result_receiver));
278+
tokio::spawn(results_sink::logger(
279+
insertion_log,
280+
collection_task_wrapper.single_rx,
281+
));
227282
}
228283

229284
// Set up tracking of statistics about ourselves.
@@ -242,7 +297,7 @@ impl OximeterAgent {
242297
id,
243298
log,
244299
collection_target,
245-
result_sender,
300+
result_sender: collection_task_wrapper.wrapper_tx,
246301
collection_tasks: Arc::new(Mutex::new(BTreeMap::new())),
247302
refresh_interval,
248303
refresh_task: Arc::new(StdMutex::new(None)),
@@ -434,6 +489,60 @@ impl OximeterAgent {
434489
}
435490
}
436491

492+
#[derive(Debug, Clone)]
493+
pub struct CollectionTaskSenderWrapper {
494+
single_tx: mpsc::Sender<CollectionTaskOutput>,
495+
cluster_tx: mpsc::Sender<CollectionTaskOutput>,
496+
}
497+
498+
impl CollectionTaskSenderWrapper {
499+
pub async fn send(
500+
&self,
501+
msg: CollectionTaskOutput,
502+
log: &Logger,
503+
) -> anyhow::Result<()> {
504+
let (result_single, result_cluster) = futures::future::join(
505+
self.single_tx.send(msg.clone()),
506+
self.cluster_tx.send(msg),
507+
)
508+
.await;
509+
510+
if let Err(e) = result_single {
511+
error!(
512+
log,
513+
"failed to send value from the collection task to channel for single node: {e:?}"
514+
);
515+
};
516+
if let Err(e) = result_cluster {
517+
error!(
518+
log,
519+
"failed to send value from the collection task to channel for cluster: {e:?}"
520+
);
521+
};
522+
Ok(())
523+
}
524+
}
525+
526+
#[derive(Debug)]
527+
pub struct CollectionTaskWrapper {
528+
wrapper_tx: CollectionTaskSenderWrapper,
529+
single_rx: mpsc::Receiver<CollectionTaskOutput>,
530+
cluster_rx: mpsc::Receiver<CollectionTaskOutput>,
531+
}
532+
533+
impl CollectionTaskWrapper {
534+
pub fn new() -> Self {
535+
let (single_tx, single_rx) = mpsc::channel(8);
536+
let (cluster_tx, cluster_rx) = mpsc::channel(8);
537+
538+
Self {
539+
wrapper_tx: CollectionTaskSenderWrapper { single_tx, cluster_tx },
540+
single_rx,
541+
cluster_rx,
542+
}
543+
}
544+
}
545+
437546
// A task which periodically updates our list of producers from Nexus.
438547
async fn refresh_producer_list_task(
439548
agent: OximeterAgent,
@@ -543,7 +652,7 @@ async fn claim_nexus_with_backoff(
543652
"failed to lookup Nexus IP, will retry";
544653
"delay" => ?delay,
545654
// No `InlineErrorChain` here: `error` is a string
546-
"error" => error,
655+
"error" => %error,
547656
);
548657
};
549658
let do_lookup = || async {

oximeter/collector/src/collection_task.rs

+10-8
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
// Copyright 2024 Oxide Computer Company
88

99
use crate::Error;
10+
use crate::agent::CollectionTaskSenderWrapper;
1011
use crate::self_stats;
1112
use chrono::DateTime;
1213
use chrono::Utc;
@@ -307,6 +308,7 @@ async fn collection_loop(
307308
}
308309

309310
/// Type of each output sent from a collection task to the results sink.
311+
#[derive(Debug, Clone)]
310312
pub(crate) struct CollectionTaskOutput {
311313
pub(crate) was_forced_collection: bool,
312314
pub(crate) results: ProducerResults,
@@ -334,7 +336,7 @@ impl CollectionTaskHandle {
334336
log: &Logger,
335337
collector: self_stats::OximeterCollector,
336338
producer: ProducerEndpoint,
337-
outbox: mpsc::Sender<CollectionTaskOutput>,
339+
outbox: CollectionTaskSenderWrapper,
338340
) -> Self {
339341
let (task, task_tx) =
340342
CollectionTask::new(log, collector, producer, outbox).await;
@@ -480,7 +482,7 @@ struct CollectionTask {
480482
result_rx: mpsc::Receiver<CollectionResponse>,
481483

482484
// Outbox for forwarding the results to the sink.
483-
outbox: mpsc::Sender<CollectionTaskOutput>,
485+
outbox: CollectionTaskSenderWrapper,
484486

485487
// Timer for making collections periodically.
486488
collection_timer: Interval,
@@ -499,7 +501,7 @@ impl CollectionTask {
499501
log: &Logger,
500502
collector: self_stats::OximeterCollector,
501503
producer: ProducerEndpoint,
502-
outbox: mpsc::Sender<CollectionTaskOutput>,
504+
outbox: CollectionTaskSenderWrapper,
503505
) -> (Self, mpsc::Sender<CollectionMessage>) {
504506
// Create our own logger.
505507
let log = log.new(o!(
@@ -593,7 +595,7 @@ impl CollectionTask {
593595
self.outbox.send(CollectionTaskOutput {
594596
was_forced_collection: false,
595597
results: self.stats.sample(),
596-
}).await.unwrap();
598+
}, &self.log).await.unwrap();
597599
}
598600
_ = self.collection_timer.tick() => {
599601
self.handle_collection_timer_tick().await?;
@@ -786,10 +788,10 @@ impl CollectionTask {
786788
self.details.on_success(success);
787789
if self
788790
.outbox
789-
.send(CollectionTaskOutput {
790-
was_forced_collection,
791-
results,
792-
})
791+
.send(
792+
CollectionTaskOutput { was_forced_collection, results },
793+
&self.log,
794+
)
793795
.await
794796
.is_err()
795797
{

oximeter/collector/src/lib.rs

+9
Original file line numberDiff line numberDiff line change
@@ -263,13 +263,22 @@ impl Oximeter {
263263
debug!(log, "creating ClickHouse client");
264264
let resolver =
265265
make_resolver(config.db.address, ServiceName::ClickhouseNative);
266+
let cluster_resolver = Box::new(DnsResolver::new(
267+
service::Name(ServiceName::ClickhouseClusterNative.srv_name()),
268+
bootstrap_dns.clone(),
269+
DnsResolverConfig {
270+
hardcoded_ttl: Some(tokio::time::Duration::MAX),
271+
..Default::default()
272+
},
273+
));
266274
Ok(Arc::new(
267275
OximeterAgent::with_id(
268276
args.id,
269277
args.address,
270278
config.refresh_interval,
271279
config.db,
272280
resolver,
281+
cluster_resolver,
273282
&log,
274283
config.db.replicated,
275284
)

0 commit comments

Comments
 (0)