Skip to content

Commit 6299c2e

Browse files
committed
wip: improve logging and metrics
1 parent 148adab commit 6299c2e

3 files changed

Lines changed: 60 additions & 15 deletions

File tree

internal/controller/controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,12 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
471471
)
472472

473473
if err := c.apiClient.PostOne(ctx, record); err != nil {
474+
// Return if no artifact is found
475+
var noArtifactErr *deploymentrecord.NoArtifactError
476+
if errors.As(err, &noArtifactErr) {
477+
return nil
478+
}
479+
474480
// Make sure to not retry on client error messages
475481
var clientErr *deploymentrecord.ClientError
476482
if errors.As(err, &clientErr) {

pkg/deploymentrecord/client.go

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,19 @@ func (c *ClientError) Unwrap() error {
160160
return c.err
161161
}
162162

163+
// NoArtifactError represents a 404 client response where no artifact was found.
164+
type NoArtifactError struct {
165+
err error
166+
}
167+
168+
func (n *NoArtifactError) Error() string {
169+
return fmt.Sprintf("client_error: %s", n.err.Error())
170+
}
171+
172+
func (n *NoArtifactError) Unwrap() error {
173+
return n.err
174+
}
175+
163176
// PostOne posts a single deployment record to the GitHub deployment
164177
// records API.
165178
func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
@@ -249,29 +262,47 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
249262
}
250263

251264
// Drain and close response body to enable connection reuse by reading body for error logging
252-
body, _ := io.ReadAll(resp.Body)
265+
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
266+
_, _ = io.Copy(io.Discard, resp.Body)
253267
_ = resp.Body.Close()
254268

255-
lastErr = fmt.Errorf("unexpected status code: %d", resp.StatusCode)
256-
257-
// Don't retry on client errors (4xx) except for 429
258-
// (rate limit)
259-
if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 {
269+
switch {
270+
case resp.StatusCode == 429:
271+
// Retry with backoff
272+
dtmetrics.PostDeploymentRecordSoftFail.Inc()
273+
slog.Debug("rate limited, retrying",
274+
"attempt", attempt,
275+
"status_code", resp.StatusCode,
276+
)
277+
lastErr = fmt.Errorf("rate limited, attempt %d", attempt)
278+
continue
279+
case resp.StatusCode == 404:
280+
// No artifact found
281+
dtmetrics.PostDeploymentRecordNotSaved.Inc()
282+
slog.Debug("no artifact attestation found, no record created",
283+
"attempt", attempt,
284+
"status_code", resp.StatusCode,
285+
)
286+
return &NoArtifactError{err: fmt.Errorf("no artifact found")}
287+
case resp.StatusCode >= 400 && resp.StatusCode < 500:
288+
// Don't retry non rate limiting client errors
260289
dtmetrics.PostDeploymentRecordClientError.Inc()
261290
slog.Warn("client error, aborting",
262291
"attempt", attempt,
263-
"error", lastErr,
264292
"status_code", resp.StatusCode,
265-
"msg", string(body),
293+
"resp_msg", string(body),
294+
)
295+
return &ClientError{err: fmt.Errorf("unexpected client err with status code %d", resp.StatusCode)}
296+
default:
297+
// Retry with backoff
298+
dtmetrics.PostDeploymentRecordSoftFail.Inc()
299+
slog.Debug("retriable error",
300+
"attempt", attempt,
301+
"status_code", resp.StatusCode,
302+
"resp_msg", string(body),
266303
)
267-
return &ClientError{err: lastErr}
304+
lastErr = fmt.Errorf("server error, attempt %d", attempt)
268305
}
269-
dtmetrics.PostDeploymentRecordSoftFail.Inc()
270-
slog.Debug("retriable server error",
271-
"attempt", attempt,
272-
"status_code", resp.StatusCode,
273-
"msg", string(body),
274-
)
275306
}
276307

277308
dtmetrics.PostDeploymentRecordHardFail.Inc()

pkg/dtmetrics/prom.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ var (
4949
},
5050
)
5151

52+
//nolint: revive
53+
PostDeploymentRecordNotSaved = promauto.NewCounter(
54+
prometheus.CounterOpts{
55+
Name: "deptracker_post_record_not_saved",
56+
Help: "The total number of successful posts with no attestation that result in no record creation",
57+
},
58+
)
59+
5260
//nolint: revive
5361
PostDeploymentRecordSoftFail = promauto.NewCounter(
5462
prometheus.CounterOpts{

0 commit comments

Comments
 (0)