"""
RoadSense AI — Duplicate Detection (dedup.py)
Prevents the same signal from being processed twice.
Strategy:
- Content hash: SHA-256 of (source + translated_content + location_geohash + 7-day time bucket)
- Exact match: same Reddit post ID or YouTube video+comment ID
- Near-duplicate: same content hash regardless of minor whitespace/punctuation differences
Critical for RSS feeds which re-serve old articles on every fetch.
Also prevents Reddit/YouTube scrapers from double-counting the same post.
"""
import hashlib
import logging
import re
from datetime import datetime, timezone
from typing import Optional

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# ── Config ────────────────────────────────────────────────────────────────────
# In-memory seen set for within-Lambda dedup
# In production, Srikar stores these hashes in DynamoDB
_seen_hashes: set[str] = set()
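
# A minimal sketch of the DynamoDB side mentioned in the comment above: how the
# seen set could be loaded at Lambda start. The table name and the "dedup_hash"
# attribute are illustrative assumptions, not this project's real schema.
def load_seen_hashes_from_dynamodb(table_name: str = "roadsense-dedup-hashes") -> set[str]:
    """Hypothetical loader: scan the dedup table and collect the stored hashes."""
    import boto3  # local import so this module still runs without boto3 installed

    table = boto3.resource("dynamodb").Table(table_name)
    hashes: set[str] = set()
    resp = table.scan(ProjectionExpression="dedup_hash")
    while True:
        hashes.update(item["dedup_hash"] for item in resp.get("Items", []))
        if "LastEvaluatedKey" not in resp:
            break
        resp = table.scan(
            ProjectionExpression="dedup_hash",
            ExclusiveStartKey=resp["LastEvaluatedKey"],
        )
    return hashes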
# ── Geohash (simplified) ──────────────────────────────────────────────────────

def location_bucket(signal: dict) -> str:
    """
    Create a coarse location bucket for hashing.
    Rounds coordinates to a ~1 km grid. Falls back to the city name.
    """
    coords = signal.get("location", {}).get("coordinates")
    # Explicit None checks so valid 0.0 coordinates are not rejected as falsy
    if coords and coords.get("lat") is not None and coords.get("lon") is not None:
        # Round to 2 decimal places ≈ ~1 km precision
        lat = round(float(coords["lat"]), 2)
        lon = round(float(coords["lon"]), 2)
        return f"{lat},{lon}"
    city = signal.get("city", "")
    return city.lower().strip() if city else "unknown"
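
# For illustration: (12.9716, 77.5946) and (12.9689, 77.5901) both bucket to
# "12.97,77.59", so nearby reports of the same incident share a location key.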

def time_bucket(signal: dict) -> str:
    """
    Create a 7-day time bucket from the signal timestamp.
    All signals within the same 7-day window share the same bucket.
    """
    try:
        ts = datetime.fromisoformat(signal["timestamp"])
        # Floor to 7-day buckets counted from the Unix epoch
        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
        days_since = (ts - epoch).days
        bucket = days_since // 7
        return str(bucket)
    except (KeyError, ValueError, TypeError):
        # TypeError covers naive timestamps, which cannot be subtracted
        # from the timezone-aware epoch
        return "unknown"

def normalise_content(text: str) -> str:
    """
    Normalise text for dedup — removes punctuation variation and extra whitespace
    so "Pothole on MG Road!!" and "pothole on MG Road" hash to the same value.
    """
    if not text:
        return ""
    text = text.lower().strip()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)     # collapse whitespace
    return text
# ── Hash Functions ────────────────────────────────────────────────────────────

def compute_content_hash(signal: dict) -> str:
    """
    Compute the dedup hash from:
    - source name
    - normalised translated content (or original if no translation)
    - location bucket (~1 km grid or city)
    - 7-day time bucket
    Same signal re-ingested in a different run = same hash = duplicate.
    """
    source = signal.get("source_name") or signal.get("source", "unknown")
    content = signal.get("translated_content") or signal.get("original_content", "")
    content = normalise_content(content)
    loc = location_bucket(signal)
    tbucket = time_bucket(signal)
    raw = f"{source}:{content}:{loc}:{tbucket}"
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
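
# For illustration: the s001 test signal below hashes the raw string
# "reddit:big pothole on mg road:12.97,77.59:<bucket>" (the bucket value depends
# on the current week), so any re-post with the same normalised text in the same
# week and grid cell collides with it.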

def compute_source_id_hash(signal: dict) -> Optional[str]:
    """
    For Reddit/YouTube signals, hash the native source ID.
    Reddit: reddit_post_id
    YouTube: video_id + first 100 chars of content (to distinguish comments)
    Returns None for sources without native IDs.
    """
    source = signal.get("source_name", "")
    if source == "reddit":
        post_id = signal.get("reddit_post_id")
        if post_id:
            return hashlib.sha256(f"reddit:{post_id}".encode()).hexdigest()
    if source == "youtube":
        video_id = signal.get("video_id")
        content = normalise_content(
            signal.get("translated_content") or signal.get("original_content", "")
        )[:100]
        if video_id:
            return hashlib.sha256(f"youtube:{video_id}:{content}".encode()).hexdigest()
    return None
# ── Core Dedup ────────────────────────────────────────────────────────────────

def is_duplicate(signal: dict, seen_hashes: Optional[set] = None) -> bool:
    """
    Check if a signal is a duplicate.
    Checks both content hash and source ID hash.
    Args:
        signal: Signal dict to check
        seen_hashes: External hash set (e.g. loaded from DynamoDB).
                     Falls back to the in-memory _seen_hashes if None.
    Returns:
        True if duplicate, False if new
    """
    store = seen_hashes if seen_hashes is not None else _seen_hashes
    # Check source ID hash first (exact match)
    source_id_hash = compute_source_id_hash(signal)
    if source_id_hash and source_id_hash in store:
        logger.debug(f"[Dedup] Duplicate by source ID: {signal.get('signal_id', '?')[:8]}")
        return True
    # Then check the content hash (near-duplicate match)
    content_hash = compute_content_hash(signal)
    if content_hash in store:
        logger.debug(f"[Dedup] Duplicate by content hash: {signal.get('signal_id', '?')[:8]}")
        return True
    return False

def mark_seen(signal: dict, seen_hashes: Optional[set] = None) -> None:
    """Add a signal's hashes to the seen set."""
    store = seen_hashes if seen_hashes is not None else _seen_hashes
    source_id_hash = compute_source_id_hash(signal)
    if source_id_hash:
        store.add(source_id_hash)
    content_hash = compute_content_hash(signal)
    store.add(content_hash)

def deduplicate_signals(signals: list[dict],
                        seen_hashes: Optional[set] = None) -> list[dict]:
    """
    Remove duplicate signals from a list.
    Also deduplicates within the batch itself.
    Args:
        signals: List of signal dicts
        seen_hashes: Optional external seen set from DynamoDB.
                     Pass an empty set() to use a fresh in-memory store.
    Returns:
        Deduplicated list of signals
    """
    store = seen_hashes if seen_hashes is not None else _seen_hashes
    unique = []
    duplicate_count = 0
    for signal in signals:
        if is_duplicate(signal, store):
            duplicate_count += 1
            continue
        mark_seen(signal, store)
        unique.append(signal)
    logger.info(
        f"[Dedup] {len(signals)} signals → {len(unique)} unique "
        f"({duplicate_count} duplicates removed)"
    )
    return unique
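
# Typical Lambda-side flow, as a sketch. persist_new_hashes is hypothetical and
# not implemented in this module; only the DynamoDB loader above is sketched:
#   seen = load_seen_hashes_from_dynamodb()
#   fresh = deduplicate_signals(batch, seen_hashes=seen)
#   persist_new_hashes(seen)  # write the updated set back to DynamoDB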
# ── Local Test ────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    now = datetime.now(timezone.utc).isoformat()

    test_signals = [
        # Original signal
        {
            "signal_id": "s001",
            "source": "social_media",
            "source_name": "reddit",
            "reddit_post_id": "abc123",
            "original_content": "Big pothole on MG Road!!",
            "translated_content": "Big pothole on MG Road!!",
            "city": "Bangalore",
            "timestamp": now,
            "location": {"coordinates": {"lat": 12.9716, "lon": 77.5946}},
        },
        # Exact duplicate — same reddit_post_id
        {
            "signal_id": "s002",
            "source": "social_media",
            "source_name": "reddit",
            "reddit_post_id": "abc123",
            "original_content": "Big pothole on MG Road!!",
            "translated_content": "Big pothole on MG Road!!",
            "city": "Bangalore",
            "timestamp": now,
            "location": {"coordinates": {"lat": 12.9716, "lon": 77.5946}},
        },
        # Near-duplicate — same content, minor punctuation difference
        {
            "signal_id": "s003",
            "source": "social_media",
            "source_name": "reddit",
            "reddit_post_id": "xyz999",
            "original_content": "big pothole on mg road",
            "translated_content": "big pothole on mg road",
            "city": "Bangalore",
            "timestamp": now,
            "location": {"coordinates": {"lat": 12.9716, "lon": 77.5946}},
        },
        # Different signal — new content
        {
            "signal_id": "s004",
            "source": "news",
            "source_name": "times_of_india",
            "original_content": "Waterlogging reported on NH-65 in Hyderabad",
            "translated_content": "Waterlogging reported on NH-65 in Hyderabad",
            "city": "Hyderabad",
            "timestamp": now,
            "location": {"coordinates": {"lat": 17.3850, "lon": 78.4867}},
        },
        # RSS re-serve — same news article fetched again
        {
            "signal_id": "s005",
            "source": "news",
            "source_name": "times_of_india",
            "original_content": "Waterlogging reported on NH-65 in Hyderabad",
            "translated_content": "Waterlogging reported on NH-65 in Hyderabad",
            "city": "Hyderabad",
            "timestamp": now,
            "location": {"coordinates": {"lat": 17.3850, "lon": 78.4867}},
        },
    ]

    print("Testing Duplicate Detection...\n")
    seen = set()
    unique = deduplicate_signals(test_signals, seen_hashes=seen)
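    # Tracing the logic above: s002 is caught by its reddit_post_id, s003 and
    # s005 by their content hashes, so 5 signals in should yield 2 unique out
    # (s001 and s004).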
print(f"\nInput: {len(test_signals)} signals")
print(f"Output: {len(unique)} unique signals")
print("\nUnique signals:")
for s in unique:
print(f" {s['signal_id']}: {s['translated_content']}")