Skip to content

Commit d690333

Browse files
authored
fix(gax): record fractional latency metrics (#12979)
## Summary Record GAX operation and attempt latency metrics with fractional millisecond precision instead of truncating to whole milliseconds. Previously `MetricsTracer` used: ```java stopwatch.elapsed(TimeUnit.MILLISECONDS) ``` This floors sub-millisecond precision before the value is recorded into the histogram. For low-latency RPCs, this can make Cloud Monitoring percentiles look significantly lower than application-observed latency. For example, a Spanner customer investigation showed: - application/client observed P50: ~3.7ms - trace-measured GAX operation latency: ~3.9ms - exported operation_latencies P50: ~2.9ms The apparent ~800us P50 gap looked like extra client-side latency in Spanner, but raw metric prints showed values being recorded as whole numbers only (3.0, 4.0, 7.0). The sub-ms precision was lost before histogram bucketization, so this was a rounding/truncation artifact rather than actual Spanner client overhead. ## Fix Use nanosecond elapsed time and convert to fractional milliseconds: stopwatch.elapsed(TimeUnit.NANOSECONDS) / 1_000_000.0 Applied to: - operation latency - attempt latency
1 parent 0dec99e commit d690333

2 files changed

Lines changed: 56 additions & 10 deletions

File tree

sdk-platform-java/gax-java/gax/src/main/java/com/google/api/gax/tracing/MetricsTracer.java

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import com.google.api.gax.rpc.StatusCode;
3939
import com.google.common.annotations.VisibleForTesting;
4040
import com.google.common.base.Stopwatch;
41+
import com.google.common.base.Ticker;
4142
import java.util.HashMap;
4243
import java.util.Map;
4344
import java.util.concurrent.TimeUnit;
@@ -58,18 +59,27 @@ public class MetricsTracer implements ApiTracer {
5859
public static final String DEFAULT_LANGUAGE = "Java";
5960
private static final String OPERATION_FINISHED_STATUS_MESSAGE =
6061
"Operation has already been completed";
62+
private static final double NANOS_PER_MILLISECOND = 1_000_000.0;
6163
private Stopwatch attemptTimer;
62-
private final Stopwatch operationTimer = Stopwatch.createStarted();
64+
private final Ticker ticker;
65+
private final Stopwatch operationTimer;
6366
// These are RPC specific attributes and pertain to a specific API Trace
6467
private final Map<String, String> attributes = new HashMap<>();
6568
private final MetricsRecorder metricsRecorder;
6669
private final AtomicBoolean operationFinished;
6770

6871
public MetricsTracer(MethodName methodName, MetricsRecorder metricsRecorder) {
72+
this(methodName, metricsRecorder, Ticker.systemTicker());
73+
}
74+
75+
@VisibleForTesting
76+
MetricsTracer(MethodName methodName, MetricsRecorder metricsRecorder, Ticker ticker) {
6977
this.attributes.put(METHOD_ATTRIBUTE, methodName.toString());
7078
this.attributes.put(LANGUAGE_ATTRIBUTE, DEFAULT_LANGUAGE);
7179
this.metricsRecorder = metricsRecorder;
7280
this.operationFinished = new AtomicBoolean();
81+
this.ticker = ticker;
82+
this.operationTimer = Stopwatch.createStarted(ticker);
7383
}
7484

7585
/**
@@ -86,7 +96,7 @@ public void operationSucceeded() {
8696
}
8797
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.OK.toString());
8898
metricsRecorder.recordOperationLatency(
89-
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
99+
operationTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
90100
metricsRecorder.recordOperationCount(1, attributes);
91101
}
92102

@@ -104,7 +114,7 @@ public void operationCancelled() {
104114
}
105115
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.CANCELLED.toString());
106116
metricsRecorder.recordOperationLatency(
107-
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
117+
operationTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
108118
metricsRecorder.recordOperationCount(1, attributes);
109119
}
110120

@@ -123,7 +133,7 @@ public void operationFailed(Throwable error) {
123133
// Uses the GRPC status code representation.
124134
attributes.put(STATUS_ATTRIBUTE, ObservabilityUtils.extractStatus(error).toString());
125135
metricsRecorder.recordOperationLatency(
126-
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
136+
operationTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
127137
metricsRecorder.recordOperationCount(1, attributes);
128138
}
129139

@@ -138,7 +148,7 @@ public void operationFailed(Throwable error) {
138148
*/
139149
@Override
140150
public void attemptStarted(Object request, int attemptNumber) {
141-
attemptTimer = Stopwatch.createStarted();
151+
attemptTimer = Stopwatch.createStarted(ticker);
142152
}
143153

144154
/**
@@ -148,7 +158,8 @@ public void attemptStarted(Object request, int attemptNumber) {
148158
@Override
149159
public void attemptSucceeded() {
150160
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.OK.toString());
151-
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
161+
metricsRecorder.recordAttemptLatency(
162+
attemptTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
152163
metricsRecorder.recordAttemptCount(1, attributes);
153164
}
154165

@@ -159,7 +170,8 @@ public void attemptSucceeded() {
159170
@Override
160171
public void attemptCancelled() {
161172
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.CANCELLED.toString());
162-
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
173+
metricsRecorder.recordAttemptLatency(
174+
attemptTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
163175
metricsRecorder.recordAttemptCount(1, attributes);
164176
}
165177

@@ -174,7 +186,8 @@ public void attemptCancelled() {
174186
@Override
175187
public void attemptFailedDuration(Throwable error, java.time.Duration delay) {
176188
attributes.put(STATUS_ATTRIBUTE, ObservabilityUtils.extractStatus(error).toString());
177-
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
189+
metricsRecorder.recordAttemptLatency(
190+
attemptTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
178191
metricsRecorder.recordAttemptCount(1, attributes);
179192
}
180193

@@ -198,7 +211,8 @@ public void attemptFailed(Throwable error, org.threeten.bp.Duration delay) {
198211
@Override
199212
public void attemptFailedRetriesExhausted(Throwable error) {
200213
attributes.put(STATUS_ATTRIBUTE, ObservabilityUtils.extractStatus(error).toString());
201-
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
214+
metricsRecorder.recordAttemptLatency(
215+
attemptTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
202216
metricsRecorder.recordAttemptCount(1, attributes);
203217
}
204218

@@ -212,7 +226,8 @@ public void attemptFailedRetriesExhausted(Throwable error) {
212226
@Override
213227
public void attemptPermanentFailure(Throwable error) {
214228
attributes.put(STATUS_ATTRIBUTE, ObservabilityUtils.extractStatus(error).toString());
215-
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
229+
metricsRecorder.recordAttemptLatency(
230+
attemptTimer.elapsed(TimeUnit.NANOSECONDS) / NANOS_PER_MILLISECOND, attributes);
216231
metricsRecorder.recordAttemptCount(1, attributes);
217232
}
218233

sdk-platform-java/gax-java/gax/src/test/java/com/google/api/gax/tracing/MetricsTracerTest.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@
4343
import com.google.api.gax.rpc.StatusCode.Code;
4444
import com.google.api.gax.rpc.testing.FakeStatusCode;
4545
import com.google.common.collect.ImmutableMap;
46+
import com.google.common.testing.FakeTicker;
4647
import java.util.HashMap;
4748
import java.util.Map;
49+
import java.util.concurrent.TimeUnit;
4850
import org.junit.jupiter.api.BeforeEach;
4951
import org.junit.jupiter.api.Test;
5052
import org.junit.jupiter.api.extension.ExtendWith;
@@ -86,6 +88,35 @@ void testOperationSucceeded_recordsAttributes() {
8688
verifyNoMoreInteractions(metricsRecorder);
8789
}
8890

91+
@Test
92+
void testOperationSucceeded_recordsFractionalMilliseconds() {
93+
FakeTicker ticker = new FakeTicker();
94+
metricsTracer =
95+
new MetricsTracer(MethodName.of("fake_service", "fake_method"), metricsRecorder, ticker);
96+
ticker.advance(3_900_000, TimeUnit.NANOSECONDS);
97+
98+
metricsTracer.operationSucceeded();
99+
100+
verify(metricsRecorder).recordOperationLatency(3.9, getAttributes(Code.OK));
101+
verify(metricsRecorder).recordOperationCount(1, getAttributes(Code.OK));
102+
verifyNoMoreInteractions(metricsRecorder);
103+
}
104+
105+
@Test
106+
void testAttemptSucceeded_recordsFractionalMilliseconds() {
107+
FakeTicker ticker = new FakeTicker();
108+
metricsTracer =
109+
new MetricsTracer(MethodName.of("fake_service", "fake_method"), metricsRecorder, ticker);
110+
metricsTracer.attemptStarted(new Object(), 0);
111+
ticker.advance(4_200_000, TimeUnit.NANOSECONDS);
112+
113+
metricsTracer.attemptSucceeded();
114+
115+
verify(metricsRecorder).recordAttemptLatency(4.2, getAttributes(Code.OK));
116+
verify(metricsRecorder).recordAttemptCount(1, getAttributes(Code.OK));
117+
verifyNoMoreInteractions(metricsRecorder);
118+
}
119+
89120
@Test
90121
void testOperationFailed_recordsAttributes() {
91122
ApiException error0 =

0 commit comments

Comments
 (0)