99# This script distinguishes between:
1010# - Real secrets (high-entropy strings, known token patterns)
1111# - False positives (variable names, example values, API call patterns)
12+ #
13+ # Performance optimizations:
14+ # - Combined regex patterns to reduce grep calls (O(n) instead of O(n*m))
15+ # - Single-pass file scanning with grep
16+ # - Optimized entropy calculation (pure bash, no subprocesses)
17+ # - Early exit for files with no matches
1218# ##############################################################################
1319
1420set -euo pipefail
@@ -19,122 +25,27 @@ YELLOW='\033[1;33m'
1925GREEN=' \033[0;32m'
2026NC=' \033[0m' # No Color
2127
22- # Patterns that indicate secrets (high confidence)
23- declare -a SECRET_PATTERNS=(
24- # API Keys (various formats)
25- ' sk_live_[a-zA-Z0-9]{24,}'
26- ' sk_test_[a-zA-Z0-9]{24,}'
27- ' pk_live_[a-zA-Z0-9]{24,}'
28- ' pk_test_[a-zA-Z0-9]{24,}'
29- ' AIza[0-9A-Za-z_-]{35}'
30- ' AKIA[0-9A-Z]{16}'
31- ' sk-[a-zA-Z0-9]{32,}'
32- ' xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24,}'
33-
34- # GitHub tokens
35- ' ghp_[a-zA-Z0-9]{36}'
36- ' gho_[a-zA-Z0-9]{36}'
37- ' ghu_[a-zA-Z0-9]{36}'
38- ' ghs_[a-zA-Z0-9]{36}'
39- ' ghr_[a-zA-Z0-9]{36}'
40-
41- # AWS tokens
42- ' AKIA[0-9A-Z]{16}'
43- ' ASIA[0-9A-Z]{16}'
44-
45- # Generic high-entropy strings (32+ chars, mixed case, numbers)
46- ' [a-zA-Z0-9+/=]{40,}'
47-
48- # JWT tokens
49- ' eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}'
50-
51- # Private keys (PEM format)
52- ' -----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----'
53-
54- # OAuth tokens
55- ' ya29\.[a-zA-Z0-9_-]+'
56- ' 1//[a-zA-Z0-9_-]+'
57- )
58-
59- # Patterns that are likely false positives (allowlist)
60- declare -a ALLOWLIST_PATTERNS=(
61- # Example/placeholder values
62- ' YOUR_API_KEY_HERE'
63- ' your-api-key-here'
64- ' example\.com'
65- ' test_key'
66- ' demo_key'
67- ' placeholder'
68- ' CHANGE_ME'
69- ' REPLACE_ME'
70-
71- # Variable names (not values)
72- ' api_key\s*='
73- ' API_KEY\s*='
74- ' access_token\s*='
75- ' secret\s*='
76-
77- # Common API call patterns (URLs, endpoints)
78- ' https?://[a-zA-Z0-9.-]+'
79- ' api/v[0-9]+'
80- ' /api/'
81-
82- # Documentation/comments
83- ' ^\s*#.*(api|key|token|secret)'
84- ' ^\s*//.*(api|key|token|secret)'
85- ' ^\s*\*.*(api|key|token|secret)'
86-
87- # Test files
88- ' test.*\.(py|js|sh)$'
89- ' .*test\.(py|js|sh)$'
90- ' mock.*\.(py|js|sh)$'
91-
92- # Example files
93- ' \.example$'
94- ' \.sample$'
95- ' example\.'
96- )
97-
98- # Files to exclude from scanning
99- declare -a EXCLUDE_PATTERNS=(
100- ' \.git/'
101- ' \.env\.example$'
102- ' \.gitignore$'
103- ' artifacts/'
104- ' \.pre-commit-cache/'
105- ' node_modules/'
106- ' \.venv/'
107- ' venv/'
108- ' __pycache__/'
109- ' \.pytest_cache/'
110- ' \.mypy_cache/'
111- ' dist/'
112- ' build/'
113- )
114-
115- # Function to check if file should be excluded
# Combined secret pattern: every known token shape OR'd into one extended
# regex, so each file needs a single `grep -E` pass instead of one pass per
# pattern. Built piecewise to keep the categories readable; the result is one
# parenthesised alternation with no leading or trailing whitespace.
SECRET_PATTERN='('
# API keys (various formats)
SECRET_PATTERN+='sk_live_[a-zA-Z0-9]{24,}|sk_test_[a-zA-Z0-9]{24,}'
SECRET_PATTERN+='|pk_live_[a-zA-Z0-9]{24,}|pk_test_[a-zA-Z0-9]{24,}'
SECRET_PATTERN+='|AIza[0-9A-Za-z_-]{35}'
SECRET_PATTERN+='|sk-[a-zA-Z0-9]{32,}'
SECRET_PATTERN+='|xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24,}'
# GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_)
SECRET_PATTERN+='|gh[pousr]_[a-zA-Z0-9]{36}'
# AWS tokens
SECRET_PATTERN+='|AKIA[0-9A-Z]{16}|ASIA[0-9A-Z]{16}'
# Generic long strings (40+ chars); these are entropy-checked by the caller
SECRET_PATTERN+='|[a-zA-Z0-9+/=]{40,}'
# JWT tokens
SECRET_PATTERN+='|eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}'
# Private keys (PEM format)
SECRET_PATTERN+='|-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----'
# OAuth tokens
SECRET_PATTERN+='|ya29\.[a-zA-Z0-9_-]+|1//[a-zA-Z0-9_-]+'
SECRET_PATTERN+=')'

# Markers that always indicate a secret (no entropy check needed)
HIGH_CONFIDENCE_PATTERN='(BEGIN|PRIVATE|KEY|ghp_|sk_|AIza|AKIA)'

# Combined allowlist pattern (for fast filtering of likely false positives).
# [[:space:]] is used instead of \s because \s is not part of POSIX ERE.
ALLOWLIST_PATTERN='('
# Example/placeholder values
ALLOWLIST_PATTERN+='YOUR_API_KEY_HERE|your-api-key-here|example\.com'
ALLOWLIST_PATTERN+='|test_key|demo_key|placeholder|CHANGE_ME|REPLACE_ME'
# Variable names (not values)
ALLOWLIST_PATTERN+='|api_key[[:space:]]*=|API_KEY[[:space:]]*='
ALLOWLIST_PATTERN+='|access_token[[:space:]]*=|secret[[:space:]]*='
# Common API call patterns (URLs, endpoints)
ALLOWLIST_PATTERN+='|https?://[a-zA-Z0-9.-]+|api/v[0-9]+|/api/'
# Documentation/comment lines mentioning api/key/token/secret
ALLOWLIST_PATTERN+='|^[[:space:]]*#.*(api|key|token|secret)'
ALLOWLIST_PATTERN+='|^[[:space:]]*//.*(api|key|token|secret)'
ALLOWLIST_PATTERN+='|^[[:space:]]*\*.*(api|key|token|secret)'
ALLOWLIST_PATTERN+=')'

# Files to exclude from scanning (compiled into single pattern for efficiency)
EXCLUDE_PATTERN='(\.git/|\.env\.example$|\.gitignore$|artifacts/|\.pre-commit-cache/|node_modules/|\.venv/|venv/|__pycache__/|\.pytest_cache/|\.mypy_cache/|dist/|build/)'
40+
# Function to check if file should be excluded (single regex match).
# Globals:   EXCLUDE_PATTERN (read) - extended regex of excluded path fragments
# Arguments: $1 - path of the file to test
# Returns:   0 when the path matches EXCLUDE_PATTERN, non-zero otherwise
should_exclude_file() {
  local file="$1"
  # The right-hand side stays unquoted so bash treats it as a regex rather
  # than a literal string; the test's status is the function's return value.
  [[ "${file}" =~ ${EXCLUDE_PATTERN} ]]
}
12546
126- # Function to check if pattern matches allowlist
127- is_allowlisted () {
128- local line=" $1 "
129- for pattern in " ${ALLOWLIST_PATTERNS[@]} " ; do
130- if echo " ${line} " | grep -qiE " ${pattern} " ; then
131- return 0
132- fi
133- done
134- return 1
135- }
136-
137- # Function to calculate entropy (simple approximation)
47+ # Entropy approximation (pure bash, no subprocesses)
48+ # Uses an associative array to count the distinct characters in the string
13849calculate_entropy () {
13950 local str=" $1 "
14051 local len=${# str}
@@ -143,14 +54,17 @@ calculate_entropy() {
14354 return
14455 fi
14556
146- # Count unique characters
147- local unique_chars
148- unique_chars=$( echo " ${str} " | fold -w1 | sort -u | wc -l)
149- # Simple entropy approximation
150- echo " ${unique_chars} "
57+ # Count unique characters using associative array (bash 4+)
58+ # This is much faster than fold | sort | wc
59+ declare -A chars
60+ local i
61+ for (( i = 0 ; i < len; i++ )) ; do
62+ chars[${str: i: 1} ]=1
63+ done
64+ echo " ${# chars[@]} "
15165}
15266
153- # Main detection function
67+ # Main detection function (optimized)
15468detect_secrets () {
15569 local found_secrets=0
15670 local files_checked=0
@@ -164,50 +78,74 @@ detect_secrets() {
16478 return 0
16579 fi
16680
81+ # Process files efficiently
16782 while IFS= read -r file; do
16883 [[ -z " ${file} " ]] && continue
16984
170- # Skip excluded files
85+ # Skip excluded files (fast check)
17186 if should_exclude_file " ${file} " ; then
17287 continue
17388 fi
17489
17590 # Skip if file doesn't exist (might be deleted)
17691 [[ ! -f " ${file} " ]] && continue
17792
93+ # Skip binary and empty files (grep -I treats binary files as non-matching, so -q returns non-zero)
94+ if ! grep -qI . " ${file} " 2> /dev/null; then
95+ continue
96+ fi
97+
17898 files_checked=$(( files_checked + 1 ))
17999
180- # Check each line for secret patterns
181- local line_num=0
182- while IFS= read -r line || [[ -n " ${line} " ]]; do
183- line_num=$(( line_num + 1 ))
100+ # Use grep to find all lines with potential secrets in one pass
101+ # This is much faster than reading the entire file line by line
102+ local matches
103+ matches=$( grep -nE " ${SECRET_PATTERN} " " ${file} " 2> /dev/null || true)
104+
105+ # Early exit: if no matches, skip this file entirely
106+ if [[ -z " ${matches} " ]]; then
107+ continue
108+ fi
109+
110+ # Process only the matching lines (much smaller set than entire file)
111+ while IFS= read -r match_line; do
112+ [[ -z " ${match_line} " ]] && continue
184113
185- # Skip allowlisted patterns
186- if is_allowlisted " ${line} " ; then
114+ # Extract line number and content
115+ local line_num=" ${match_line%%:* } "
116+ local line=" ${match_line#*: } "
117+
118+ # Fast allowlist check (single grep call)
119+ if echo " ${line} " | grep -qiE " ${ALLOWLIST_PATTERN} " ; then
120+ continue
121+ fi
122+
123+ # Extract matched secret part
124+ local matched_part
125+ matched_part=$( echo " ${line} " | grep -oE " ${SECRET_PATTERN} " | head -1)
126+
127+ # Check if it's a high-confidence pattern (skip entropy check for speed)
128+ if echo " ${matched_part} " | grep -qE " ${HIGH_CONFIDENCE_PATTERN} " ; then
129+ echo -e " ${RED} ✗ Potential secret found in ${file} :${line_num}${NC} "
130+ echo -e " ${YELLOW} Pattern:${NC} ${matched_part: 0: 50} ..."
131+ echo -e " ${YELLOW} Context:${NC} ${line: 0: 100} ..."
132+ echo " "
133+ found_secrets=$(( found_secrets + 1 ))
187134 continue
188135 fi
189136
190- # Check against secret patterns
191- for pattern in " ${SECRET_PATTERNS[@]} " ; do
192- if echo " ${line} " | grep -qE " ${pattern} " ; then
193- # Additional check: high entropy
194- local matched_part
195- matched_part=$( echo " ${line} " | grep -oE " ${pattern} " | head -1)
196- local entropy
197- entropy=$( calculate_entropy " ${matched_part} " )
198-
199- # If it's a high-entropy match, flag it
200- if [[ ${entropy} -gt 8 ]] || echo " ${pattern} " | grep -qE " (BEGIN|PRIVATE|KEY|ghp_|sk_|AIza|AKIA)" ; then
201- echo -e " ${RED} ✗ Potential secret found in ${file} :${line_num}${NC} "
202- echo -e " ${YELLOW} Pattern:${NC} ${pattern} "
203- echo -e " ${YELLOW} Context:${NC} ${line: 0: 100} ..."
204- echo " "
205- found_secrets=$(( found_secrets + 1 ))
206- break
207- fi
208- fi
209- done
210- done < " ${file} "
137+ # For generic patterns, check entropy (only when needed)
138+ local entropy
139+ entropy=$( calculate_entropy " ${matched_part} " )
140+
141+ if [[ ${entropy} -gt 8 ]]; then
142+ echo -e " ${RED} ✗ Potential secret found in ${file} :${line_num}${NC} "
143+ echo -e " ${YELLOW} Pattern:${NC} ${matched_part: 0: 50} ..."
144+ echo -e " ${YELLOW} Context:${NC} ${line: 0: 100} ..."
145+ echo " "
146+ found_secrets=$(( found_secrets + 1 ))
147+ fi
148+ done <<< " ${matches}"
211149 done <<< " ${staged_files}"
212150
213151 if [[ ${found_secrets} -gt 0 ]]; then
0 commit comments