Skip to content

Commit b25a021

Browse files
More logging for drivers and coordinator
1 parent 64a6d8f commit b25a021

3 files changed

Lines changed: 176 additions & 43 deletions

File tree

tools/tpcc-runner/src/coordinator.rs

Lines changed: 86 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::collections::BTreeMap;
77
use std::fs;
88
use std::path::Path;
99
use std::sync::Arc;
10+
use std::time::Duration;
1011

1112
use crate::config::CoordinatorConfig;
1213
use crate::metrics_module_bindings::reset_reducer::reset;
@@ -15,7 +16,9 @@ use crate::protocol::{
1516
DriverAssignment, RegisterDriverRequest, RegisterDriverResponse, RunSchedule, ScheduleResponse,
1617
SubmitSummaryRequest,
1718
};
18-
use crate::summary::{aggregate_summaries, now_millis, write_json, AggregateSummary, DriverSummary};
19+
use crate::summary::{
20+
aggregate_summaries, log_aggregate_summary, now_millis, write_json, AggregateSummary, DriverSummary,
21+
};
1922

2023
#[derive(Clone)]
2124
struct AppState {
@@ -66,8 +69,8 @@ async fn register_driver(
6669
Json(request): Json<RegisterDriverRequest>,
6770
) -> Json<RegisterDriverResponse> {
6871
let mut inner = state.inner.lock();
69-
let assignment = match inner.registrations.get(&request.driver_id) {
70-
Some(existing) => existing.assignment.clone(),
72+
let (assignment, is_new_registration) = match inner.registrations.get(&request.driver_id) {
73+
Some(existing) => (existing.assignment.clone(), false),
7174
None => {
7275
if inner.registration_order.len() >= inner.config.expected_drivers {
7376
return Json(RegisterDriverResponse {
@@ -84,10 +87,30 @@ async fn register_driver(
8487
assignment: assignment.clone(),
8588
},
8689
);
87-
assignment
90+
(assignment, true)
8891
}
8992
};
9093
maybe_create_schedule(&mut inner);
94+
let registered = inner.registrations.len();
95+
let warehouse_end = assignment_end(&assignment);
96+
if is_new_registration {
97+
log::info!(
98+
"driver {} registered and ready ({}/{}): warehouses {}..={} ({} warehouse(s))",
99+
request.driver_id,
100+
registered,
101+
inner.config.expected_drivers,
102+
assignment.warehouse_start,
103+
warehouse_end,
104+
assignment.driver_warehouse_count
105+
);
106+
} else {
107+
log::info!(
108+
"driver {} re-registered and remains ready: warehouses {}..={}",
109+
request.driver_id,
110+
assignment.warehouse_start,
111+
warehouse_end
112+
);
113+
}
91114
Json(RegisterDriverResponse {
92115
accepted: true,
93116
assignment: Some(assignment),
@@ -108,16 +131,32 @@ async fn submit_summary(
108131
) -> Result<Json<AggregateSummary>, axum::http::StatusCode> {
109132
let aggregate = {
110133
let mut inner = state.inner.lock();
111-
inner
134+
let replaced = inner
112135
.summaries
113-
.insert(request.summary.driver_id.clone(), request.summary.clone());
114-
if inner.summaries.len() == inner.config.expected_drivers {
136+
.insert(request.summary.driver_id.clone(), request.summary.clone())
137+
.is_some();
138+
let received = inner.summaries.len();
139+
log::info!(
140+
"received summary from driver {} ({}/{}{})",
141+
request.summary.driver_id,
142+
received,
143+
inner.config.expected_drivers,
144+
if replaced { ", replaced existing summary" } else { "" }
145+
);
146+
if received == inner.config.expected_drivers {
115147
let summaries: Vec<_> = inner.summaries.values().cloned().collect();
116148
let aggregate = aggregate_summaries(inner.config.run_id.clone(), &summaries);
149+
let summary_path = aggregate_summary_path(&inner.config.output_dir, &aggregate);
117150
if let Err(err) = write_aggregate(&inner.config.output_dir, &aggregate) {
118151
log::error!("failed to write aggregate summary: {err:#}");
119152
return Err(axum::http::StatusCode::INTERNAL_SERVER_ERROR);
120153
}
154+
log::info!(
155+
"received all {} summary(s) for run {}",
156+
inner.config.expected_drivers,
157+
inner.config.run_id
158+
);
159+
log_aggregate_summary(&aggregate, &summary_path);
121160
aggregate
122161
} else {
123162
aggregate_summaries(
@@ -142,18 +181,23 @@ fn maybe_create_schedule(inner: &mut CoordinatorState) {
142181
.reducers
143182
.reset(inner.config.warmup_secs * 1000, measure_start_ms, measure_end_ms);
144183

145-
inner.schedule = Some(RunSchedule {
184+
let schedule = RunSchedule {
146185
run_id: inner.config.run_id.clone(),
147186
warmup_start_ms,
148187
measure_start_ms,
149188
measure_end_ms,
150189
stop_ms: measure_end_ms,
151-
});
190+
};
191+
inner.schedule = Some(schedule.clone());
152192
log::info!(
153-
"all {} driver(s) registered; schedule ready for run {}",
193+
"all {} driver(s) registered; schedule ready for run {} (warmup_start_ms={} measure_start_ms={} measure_end_ms={})",
154194
inner.config.expected_drivers,
155-
inner.config.run_id
195+
inner.config.run_id,
196+
warmup_start_ms,
197+
measure_start_ms,
198+
measure_end_ms
156199
);
200+
tokio::spawn(log_schedule_events(schedule));
157201
}
158202

159203
fn assignment_for_index(config: &CoordinatorConfig, index: usize) -> DriverAssignment {
@@ -173,7 +217,35 @@ fn assignment_for_index(config: &CoordinatorConfig, index: usize) -> DriverAssig
173217
}
174218

175219
fn write_aggregate(output_dir: &Path, aggregate: &AggregateSummary) -> Result<()> {
176-
let run_dir = output_dir.join(&aggregate.run_id);
177-
fs::create_dir_all(&run_dir).with_context(|| format!("failed to create {}", run_dir.display()))?;
178-
write_json(&run_dir.join("summary.json"), aggregate)
220+
let summary_path = aggregate_summary_path(output_dir, aggregate);
221+
if let Some(run_dir) = summary_path.parent() {
222+
fs::create_dir_all(run_dir).with_context(|| format!("failed to create {}", run_dir.display()))?;
223+
}
224+
write_json(&summary_path, aggregate)
225+
}
226+
227+
fn aggregate_summary_path(output_dir: &Path, aggregate: &AggregateSummary) -> std::path::PathBuf {
228+
output_dir.join(&aggregate.run_id).join("summary.json")
229+
}
230+
231+
fn assignment_end(assignment: &DriverAssignment) -> u32 {
232+
assignment
233+
.warehouse_start
234+
.saturating_add(assignment.driver_warehouse_count.saturating_sub(1))
235+
}
236+
237+
async fn log_schedule_events(schedule: RunSchedule) {
238+
sleep_until_ms(schedule.warmup_start_ms).await;
239+
log::info!("run {} warmup started", schedule.run_id);
240+
sleep_until_ms(schedule.measure_start_ms).await;
241+
log::info!("run {} measurement started", schedule.run_id);
242+
sleep_until_ms(schedule.measure_end_ms).await;
243+
log::info!("run {} measurement ended", schedule.run_id);
244+
}
245+
246+
async fn sleep_until_ms(target_ms: u64) {
247+
let now_ms = now_millis();
248+
if target_ms > now_ms {
249+
tokio::time::sleep(Duration::from_millis(target_ms - now_ms)).await;
250+
}
179251
}

tools/tpcc-runner/src/driver.rs

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use anyhow::{anyhow, bail, Context, Result};
22
use rand::{rngs::StdRng, Rng, SeedableRng};
33
use std::fs;
4-
use std::path::{Path, PathBuf};
4+
use std::path::PathBuf;
55
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
66
use std::sync::Arc;
77
use std::time::Duration;
@@ -15,14 +15,18 @@ use crate::module_bindings::*;
1515
use crate::protocol::{
1616
RegisterDriverRequest, RegisterDriverResponse, RunSchedule, ScheduleResponse, SubmitSummaryRequest,
1717
};
18-
use crate::summary::{write_json, DriverSummary, DriverSummaryMeta, SharedMetrics, TransactionKind, TransactionRecord};
18+
use crate::summary::{
19+
log_driver_summary, write_json, DriverSummary, DriverSummaryMeta, SharedMetrics, TransactionKind,
20+
TransactionRecord,
21+
};
1922
use crate::topology::DatabaseTopology;
2023
use crate::tpcc::*;
2124

2225
struct TerminalRuntime {
2326
config: DriverConfig,
2427
metrics: SharedMetrics,
2528
abort: Arc<AtomicBool>,
29+
start_logged: Arc<AtomicBool>,
2630
request_ids: Arc<AtomicU64>,
2731
schedule: RunSchedule,
2832
run_constants: RunConstants,
@@ -60,9 +64,22 @@ pub async fn run(config: DriverConfig) -> Result<()> {
6064
};
6165

6266
let abort = Arc::new(AtomicBool::new(false));
67+
let start_logged = Arc::new(AtomicBool::new(false));
6368
let request_ids = Arc::new(AtomicU64::new(1));
6469
let mut tasks = JoinSet::new();
6570

71+
log::info!(
72+
"driver {} ready for run {}: warehouses {}..={} terminals={} warmup_start_ms={} measure_start_ms={} measure_end_ms={}",
73+
config.driver_id,
74+
run_id,
75+
config.warehouse_start,
76+
config.warehouse_end(),
77+
config.terminals(),
78+
schedule.warmup_start_ms,
79+
schedule.measure_start_ms,
80+
schedule.measure_end_ms
81+
);
82+
6683
for warehouse_id in config.warehouse_start..=config.warehouse_end() {
6784
let database_identity = topology.identity_for_warehouse(warehouse_id)?;
6885
for district_id in 1..=DISTRICTS_PER_WAREHOUSE {
@@ -75,13 +92,15 @@ pub async fn run(config: DriverConfig) -> Result<()> {
7592
let terminal_config = config.clone();
7693
let terminal_metrics = metrics.clone();
7794
let terminal_abort = abort.clone();
95+
let terminal_start_logged = start_logged.clone();
7896
let terminal_constants = run_constants.clone();
7997
let terminal_schedule = schedule.clone();
8098
let terminal_request_ids = request_ids.clone();
8199
let runtime = TerminalRuntime {
82100
config: terminal_config,
83101
metrics: terminal_metrics,
84102
abort: terminal_abort,
103+
start_logged: terminal_start_logged,
85104
request_ids: terminal_request_ids,
86105
schedule: terminal_schedule,
87106
run_constants: terminal_constants,
@@ -131,7 +150,7 @@ pub async fn run(config: DriverConfig) -> Result<()> {
131150
measure_end_ms: schedule.measure_end_ms,
132151
})?;
133152
write_json(&summary_path, &summary)?;
134-
print_summary(&summary, &summary_path, &events_path);
153+
log_driver_summary(&summary, &summary_path, &events_path);
135154

136155
if let Some(coordinator_url) = &config.coordinator_url {
137156
submit_summary(coordinator_url, summary).await?;
@@ -145,6 +164,7 @@ async fn run_terminal(runtime: TerminalRuntime) -> Result<()> {
145164
config,
146165
metrics,
147166
abort,
167+
start_logged,
148168
request_ids,
149169
schedule,
150170
run_constants,
@@ -154,7 +174,23 @@ async fn run_terminal(runtime: TerminalRuntime) -> Result<()> {
154174
} = runtime;
155175
let client = ModuleClient::connect_async(&config.connection, database_identity).await?;
156176
let metrics_client = connect_metrics_module_async(&config.connection).await?;
177+
log::info!(
178+
"driver {} terminal {} connected to {} for warehouse {} district {}",
179+
config.driver_id,
180+
assignment.terminal_id,
181+
database_identity,
182+
assignment.warehouse_id,
183+
assignment.district_id
184+
);
157185
sleep_until_ms_async(schedule.warmup_start_ms).await;
186+
if !start_logged.swap(true, Ordering::Relaxed) {
187+
log::info!(
188+
"driver {} starting run {} with {} terminal(s)",
189+
config.driver_id,
190+
schedule.run_id,
191+
config.terminals()
192+
);
193+
}
158194

159195
let mut rng = StdRng::seed_from_u64(seed);
160196
while !abort.load(Ordering::Relaxed) {
@@ -683,32 +719,6 @@ fn resolve_output_dir(config: &DriverConfig, run_id: &str) -> PathBuf {
683719
}
684720
}
685721

686-
fn print_summary(summary: &DriverSummary, summary_path: &Path, events_path: &Path) {
687-
log::info!("run_id={}", summary.run_id);
688-
log::info!("driver_id={}", summary.driver_id);
689-
log::info!("tpmc_like={:.2}", summary.tpmc_like);
690-
log::info!("total_transactions={}", summary.total_transactions);
691-
for (name, txn) in &summary.transactions {
692-
log::info!(
693-
"{} count={} success={} failure={} p95_ms={} p99_ms={}",
694-
name,
695-
txn.count,
696-
txn.success,
697-
txn.failure,
698-
txn.p95_latency_ms,
699-
txn.p99_latency_ms
700-
);
701-
}
702-
log::info!(
703-
"delivery queued={} completed={} pending={}",
704-
summary.delivery.queued,
705-
summary.delivery.completed,
706-
summary.delivery.pending
707-
);
708-
log::info!("summary={}", summary_path.display());
709-
log::info!("events={}", events_path.display());
710-
}
711-
712722
async fn sleep_until_ms_async(target_ms: u64) {
713723
let now_ms = crate::summary::now_millis();
714724
if target_ms > now_ms {

tools/tpcc-runner/src/summary.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,57 @@ pub fn aggregate_summaries(run_id: String, summaries: &[DriverSummary]) -> Aggre
572572
}
573573
}
574574

575+
pub fn log_driver_summary(summary: &DriverSummary, summary_path: &Path, events_path: &Path) {
576+
log::info!("run_id={}", summary.run_id);
577+
log::info!("driver_id={}", summary.driver_id);
578+
log::info!("tpmc_like={:.2}", summary.tpmc_like);
579+
log::info!("total_transactions={}", summary.total_transactions);
580+
for (name, txn) in &summary.transactions {
581+
log::info!(
582+
"{} count={} success={} failure={} p95_ms={} p99_ms={}",
583+
name,
584+
txn.count,
585+
txn.success,
586+
txn.failure,
587+
txn.p95_latency_ms,
588+
txn.p99_latency_ms
589+
);
590+
}
591+
log::info!(
592+
"delivery queued={} completed={} pending={}",
593+
summary.delivery.queued,
594+
summary.delivery.completed,
595+
summary.delivery.pending
596+
);
597+
log::info!("summary={}", summary_path.display());
598+
log::info!("events={}", events_path.display());
599+
}
600+
601+
pub fn log_aggregate_summary(summary: &AggregateSummary, summary_path: &Path) {
602+
log::info!("run_id={}", summary.run_id);
603+
log::info!("driver_count={}", summary.driver_count);
604+
log::info!("tpmc_like={:.2}", summary.tpmc_like);
605+
log::info!("total_transactions={}", summary.total_transactions);
606+
for (name, txn) in &summary.transactions {
607+
log::info!(
608+
"{} count={} success={} failure={} p95_ms={} p99_ms={}",
609+
name,
610+
txn.count,
611+
txn.success,
612+
txn.failure,
613+
txn.p95_latency_ms,
614+
txn.p99_latency_ms
615+
);
616+
}
617+
log::info!(
618+
"delivery queued={} completed={} pending={}",
619+
summary.delivery.queued,
620+
summary.delivery.completed,
621+
summary.delivery.pending
622+
);
623+
log::info!("summary={}", summary_path.display());
624+
}
625+
575626
pub fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<()> {
576627
let file = File::create(path).with_context(|| format!("failed to create {}", path.display()))?;
577628
serde_json::to_writer_pretty(file, value).with_context(|| format!("failed to write {}", path.display()))

0 commit comments

Comments
 (0)