Skip to content

Commit 6756ba5

Browse files
authored
Merge pull request #88 from Miyamura80/feat/optimize-log-scrubbing
🔨 ✅ optimize log scrubbing with scrubadub and single-pass regex
2 parents 97eb790 + d36a02c commit 6756ba5

7 files changed

Lines changed: 362 additions & 35 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33

44
old_scripts/
55

6+
# Agent session files
7+
session-*.md
8+
69
# Overwrite global config
710
.global_config.yaml
811

common/config_models.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
type validation and structure for the configuration data.
77
"""
88

9-
from pydantic import BaseModel
9+
from pydantic import BaseModel, Field
1010

1111

1212
class ExampleParent(BaseModel):
@@ -70,12 +70,29 @@ class LoggingLevelsConfig(BaseModel):
7070
critical: bool
7171

7272

73+
class RedactionPattern(BaseModel):
    """One named redaction rule: a regex plus the text that replaces its matches."""

    # Human-readable label for the rule.
    name: str
    # Regular-expression source, stored as a plain string.
    regex: str
    # Text substituted for whatever the regex matches.
    placeholder: str
79+
80+
81+
class RedactionConfig(BaseModel):
    """Settings for log redaction/scrubbing."""

    # Master switch for redaction; on by default.
    enabled: bool = True
    # Whether default PII detection is used alongside `patterns`; on by default.
    use_default_pii: bool = True
    # Custom regex rules; empty unless configured.
    # NOTE(review): relies on pydantic copying mutable defaults per instance - confirm.
    patterns: list[RedactionPattern] = []
87+
88+
7389
class LoggingConfig(BaseModel):
    """Top-level logging settings: verbosity, format, levels and redaction."""

    verbose: bool
    format: LoggingFormatConfig
    levels: LoggingLevelsConfig
    # Optional section: a RedactionConfig built from its own defaults is used
    # when the configuration does not provide one.
    redaction: RedactionConfig = Field(default_factory=RedactionConfig)
7996

8097

8198
class FeaturesConfig(BaseModel):

common/global_config.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,24 @@ logging:
5050
warning: true # Show warning logs
5151
error: true # Show error logs
5252
critical: true # Show critical logs
53+
redaction:
54+
enabled: true
55+
use_default_pii: true
56+
patterns:
57+
- name: "ANTHROPIC_API_KEY"
58+
regex: "sk-ant-[a-zA-Z0-9-]{20,}"
59+
placeholder: "[REDACTED_API_KEY]"
60+
- name: "OPENAI_API_KEY"
61+
regex: "sk-[a-zA-Z0-9]{20,}"
62+
placeholder: "[REDACTED_API_KEY]"
63+
- name: "STRIPE_API_KEY"
64+
regex: "[spr]k_(live|test)_[a-zA-Z0-9]{20,}"
65+
placeholder: "[REDACTED_API_KEY]"
66+
- name: "BEARER_TOKEN"
67+
regex: "Bearer\\s+[a-zA-Z0-9._\\-]{20,}"
68+
placeholder: "[REDACTED_BEARER_TOKEN]"
69+
- name: "GENERIC_KEY"
70+
regex: "(?i:(?:api[_-]?key|project[_-]?key|secret[_-]?key)[=:\\s]+['\"]?[a-zA-Z0-9_\\-]{16,}['\"]?)"
71+
placeholder: "[REDACTED_KEY]"
72+
73+

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ dependencies = [
2929
"pylint>=3.3.0",
3030
"deptry>=0.24.0",
3131
"openfeature-sdk>=0.8.4",
32+
"scrubadub>=2.0.1",
33+
"pydantic>=2.0.0",
34+
"numpy>=1.26.0",
3235
]
3336
readme = "README.md"
3437
requires-python = ">= 3.12"
@@ -65,6 +68,9 @@ error-on-warning = true
6568
[tool.ty.environment]
6669
python-version = "3.12"
6770

71+
[tool.deptry]
72+
ignore = ["DEP002"]
73+
6874
[tool.vulture]
6975
exclude = [
7076
".venv/",

src/utils/logging_config.py

Lines changed: 72 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import sys
55
import threading
66

7+
import scrubadub
78
from human_id import generate_id
89
from loguru import logger
910

@@ -13,57 +14,90 @@
1314
_logging_initialized = False
1415
_logging_lock = threading.Lock()
1516

16-
# PII Patterns for redaction (pre-compiled for performance)
17-
# Note: More specific patterns must come before general ones (e.g., sk-ant- before sk-)
18-
_COMPILED_PII_PATTERNS = [
19-
# Email addresses
20-
(
21-
re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
22-
"[REDACTED_EMAIL]",
23-
),
24-
# Anthropic API keys (sk-ant-...) - must be before OpenAI pattern
25-
(re.compile(r"sk-ant-[a-zA-Z0-9-]{20,}"), "[REDACTED_API_KEY]"),
26-
# OpenAI API keys (sk-...)
27-
(re.compile(r"sk-[a-zA-Z0-9]{20,}"), "[REDACTED_API_KEY]"),
28-
# Stripe API keys (sk_live_*, sk_test_*, pk_live_*, pk_test_*, rk_live_*, rk_test_*)
29-
(re.compile(r"[spr]k_(live|test)_[a-zA-Z0-9]{20,}"), "[REDACTED_API_KEY]"),
30-
# Authorization Bearer tokens
31-
(re.compile(r"Bearer\s+[a-zA-Z0-9._\-]{20,}"), "[REDACTED_BEARER_TOKEN]"),
32-
# Generic project/API keys (common formats: xxx_key_*, api_key=*, apikey=*)
33-
(re.compile(r"(?i)(api[_-]?key|project[_-]?key|secret[_-]?key)[=:\s]+['\"]?[a-zA-Z0-9_\-]{16,}['\"]?"), "[REDACTED_KEY]"),
34-
]
17+
18+
class _LogScrubber:
    """
    Log scrubber for general PII and configured secret patterns.

    Scrubbing runs in up to two stages:

    1. scrubadub's default detectors (only when ``use_default_pii`` is on).
    2. The regex patterns from ``global_config.logging.redaction.patterns``,
       merged into one alternation so the text is scanned once for all of them.

    If the configured patterns are individually valid but cannot be merged
    (for example a pattern begins with a global inline flag such as ``(?i)``,
    or two patterns reuse the same named group), they are applied one after
    another in configuration order instead of raising at construction time.

    NOTE: numeric backreferences (``\\1``) inside a configured pattern are not
    supported in merged mode, because wrapping each pattern in its own named
    group shifts the group numbering.
    """

    def __init__(self) -> None:
        config = global_config.logging.redaction
        self.enabled = config.enabled
        self.use_default_pii = config.use_default_pii
        self.patterns = config.patterns

        # General-purpose PII detection with scrubadub's default detector set.
        # Detectors can be customised here if the defaults prove too aggressive.
        self.scrubber = None
        if self.enabled and self.use_default_pii:
            self.scrubber = scrubadub.Scrubber()

        # Custom secret patterns; populated by _compile_patterns().
        self.combined_regex = None
        self.placeholder_map = {}
        self.fallback_patterns = []

        if self.enabled and self.patterns:
            self._compile_patterns(self.patterns)

    def _compile_patterns(self, patterns) -> None:
        """
        Compile the configured patterns for use by :meth:`scrub`.

        Args:
            patterns: Iterable of objects exposing ``regex`` and ``placeholder``
                attributes.

        Raises:
            re.error: If an individual pattern is not a valid regex.
        """
        self.combined_regex = None
        self.placeholder_map = {}
        self.fallback_patterns = []

        patterns = list(patterns)
        if not patterns:
            return

        # Wrap every pattern in its own named group so the callback can tell
        # which rule matched and pick the corresponding placeholder.
        regex_parts = []
        placeholder_map = {}
        for i, p in enumerate(patterns):
            group_name = f"p{i}"
            regex_parts.append(f"(?P<{group_name}>{p.regex})")
            placeholder_map[group_name] = p.placeholder

        try:
            self.combined_regex = re.compile("|".join(regex_parts))
        except re.error:
            # The merge failed. Compile each pattern on its own: a genuinely
            # invalid pattern still raises re.error here, while patterns that
            # are merely incompatible with merging are kept and applied
            # sequentially.
            self.fallback_patterns = [
                (re.compile(p.regex), p.placeholder) for p in patterns
            ]
        else:
            self.placeholder_map = placeholder_map

    def _redact_callback(self, match: "re.Match[str]") -> str:
        """Callback for re.sub: return the placeholder of the rule that matched."""
        # lastgroup is the name of the last group to close, i.e. the outer
        # per-rule wrapper group, even when the rule has its own inner groups.
        group_name = match.lastgroup
        return self.placeholder_map.get(group_name, "[REDACTED]")

    def scrub(self, text: str) -> str:
        """
        Return ``text`` with sensitive data replaced by placeholders.

        Disabled scrubbers and empty/None input return ``text`` unchanged.
        """
        if not self.enabled or not text:
            return text

        # 1. Scrub general PII using scrubadub
        if self.scrubber:
            text = self.scrubber.clean(text)

        # 2. Scrub custom secrets (one regex pass when the patterns merged)
        if self.combined_regex:
            text = self.combined_regex.sub(self._redact_callback, text)

        # 3. Sequential fallback for patterns that could not be merged. A
        #    callable replacement keeps the placeholder literal, matching the
        #    merged path (no backslash/group-reference expansion).
        for pattern, placeholder in self.fallback_patterns:
            text = pattern.sub(lambda _m, ph=placeholder: ph, text)

        return text
69+
70+
71+
# Initialize the singleton scrubber
72+
_SCRUBBER = _LogScrubber()
3573

3674

3775
def scrub_sensitive_data(record):
3876
"""
3977
Patch function to scrub sensitive data from the log record.
4078
Modifies record["message"] and record["exception"] in place.
4179
"""
80+
if not _SCRUBBER.enabled:
81+
return
82+
4283
# Scrub main message
43-
message = record["message"]
44-
for pattern, placeholder in _COMPILED_PII_PATTERNS:
45-
message = pattern.sub(placeholder, message)
46-
record["message"] = message
84+
record["message"] = _SCRUBBER.scrub(record["message"])
4785

4886
# Scrub exception if present
4987
exception = record.get("exception")
5088
if exception:
5189
type_, value, tb = exception
5290
value_str = str(value)
53-
redacted = False
54-
for pattern, placeholder in _COMPILED_PII_PATTERNS:
55-
if pattern.search(value_str):
56-
value_str = pattern.sub(placeholder, value_str)
57-
redacted = True
91+
scrubbed_value_str = _SCRUBBER.scrub(value_str)
5892

59-
if redacted:
93+
if scrubbed_value_str != value_str:
6094
# Re-instantiate the exception with the redacted message to preserve loguru formatting
6195
try:
6296
# Most standard exceptions accept a single string argument
63-
new_value = type_(value_str)
97+
new_value = type_(scrubbed_value_str)
6498
except Exception:
6599
# Fallback to a generic Exception if type instantiation fails
66-
new_value = Exception(value_str)
100+
new_value = Exception(scrubbed_value_str)
67101

68102
# Preserve traceback and context metadata
69103
new_value.__traceback__ = tb
@@ -72,6 +106,13 @@ def scrub_sensitive_data(record):
72106

73107
record["exception"] = (type_, new_value, tb)
74108

109+
# Scrub extra context if present
110+
extra = record.get("extra")
111+
if extra:
112+
for key, val in extra.items():
113+
if isinstance(val, str):
114+
extra[key] = _SCRUBBER.scrub(val)
115+
75116

76117
def _should_show_location(level: str) -> bool:
77118
"""Determine if location should be shown for given log level"""

tests/test_logging_security.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,14 @@ def test_email_redaction(self):
1515
record = {"message": "User email is test@example.com", "exception": None}
1616
scrub_sensitive_data(record)
1717
assert "test@example.com" not in record["message"]
18-
assert "[REDACTED_EMAIL]" in record["message"]
18+
assert "{{EMAIL}}" in record["message"]
19+
20+
def test_phone_redaction(self):
    """Phone numbers are redacted from the message (new capability via scrubadub)."""
    log_record = {"message": "Call me at 1-800-555-0199", "exception": None}

    scrub_sensitive_data(log_record)

    scrubbed_message = log_record["message"]
    # The raw number must be gone and the phone placeholder present.
    assert "1-800-555-0199" not in scrubbed_message
    assert "{{PHONE}}" in scrubbed_message
1926

2027
def test_api_key_redaction(self):
2128
"""Test that OpenAI API keys are redacted from log messages."""
@@ -32,7 +39,7 @@ def test_multiple_redactions(self):
3239
"exception": None,
3340
}
3441
scrub_sensitive_data(record)
35-
assert "[REDACTED_EMAIL]" in record["message"]
42+
assert "{{EMAIL}}" in record["message"]
3643
assert "[REDACTED_API_KEY]" in record["message"]
3744
assert "test@example.com" not in record["message"]
3845
assert "sk-123456789012345678901234" not in record["message"]
@@ -54,7 +61,7 @@ def test_exception_message_redaction(self):
5461
# Verify exception redaction
5562
_, value, _ = record["exception"]
5663
assert "test@example.com" not in str(value)
57-
assert "[REDACTED_EMAIL]" in str(value)
64+
assert "{{EMAIL}}" in str(value)
5865

5966
def test_exception_api_key_redaction(self):
6067
"""Test redacting API keys from exception values."""

0 commit comments

Comments
 (0)