Skip to content

Commit 9a385ac

Browse files
jeremymanning and claude committed
Add reconciliation tool for three-way people sync (Phase 3)
- Created scripts/reconcile_people.py: compares people.xlsx (source of truth), JRM_CV.tex, and lab_manual.tex using exact, nickname, and fuzzy matching (0.85 threshold)
- Categorizes discrepancies as auto-resolved, flagged-for-review, or conflicts
- Excludes non-person sheets (collaborators, director) from comparison
- Created tests/test_reconcile_people.py with 22 tests, including a corpus of 20+ name variations for fuzzy-matching validation
- 192 total tests passing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 840e2ed commit 9a385ac

3 files changed

Lines changed: 646 additions & 9 deletions

File tree

scripts/reconcile_people.py

Lines changed: 399 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,399 @@
1+
#!/usr/bin/env python3
2+
"""Reconcile member/alumni data across people.xlsx, JRM_CV.tex, and lab_manual.tex.
3+
4+
people.xlsx is the source of truth. Discrepancies are categorized as:
5+
- Auto-resolved: people in people.xlsx missing from other sources (auto-added)
6+
- Flagged for review: people in other sources missing from people.xlsx
7+
- Conflicts: data mismatches requiring manual resolution
8+
"""
9+
import argparse
10+
import sys
11+
from difflib import SequenceMatcher
12+
from pathlib import Path
13+
from typing import List, Dict, Set, Optional, Tuple
14+
15+
from utils import load_spreadsheet_all_sheets
16+
from parse_cv_trainees import parse_cv_trainees, get_active_trainees, get_alumni_trainees
17+
from parse_lab_manual import parse_members_chapter
18+
from sync_cv_people import normalize_name, NICKNAME_MAP, expand_nicknames, names_match
19+
20+
# Repository root: two directory levels above this script file.
PROJECT_ROOT = Path(__file__).parent.parent

# The three sources compared by the reconciliation.
PEOPLE_XLSX = PROJECT_ROOT.joinpath('data', 'people.xlsx')
CV_TEX = PROJECT_ROOT.joinpath('documents', 'JRM_CV.tex')
LAB_MANUAL_TEX = PROJECT_ROOT.joinpath('lab-manual', 'lab_manual.tex')

# Minimum SequenceMatcher ratio for two names to count as a fuzzy match.
FUZZY_THRESHOLD = 0.85
26+
27+
28+
def load_people_xlsx() -> Dict[str, List[Dict]]:
    """Read every sheet of people.xlsx.

    Thin wrapper: passes ``PEOPLE_XLSX`` to ``load_spreadsheet_all_sheets``
    and returns whatever that helper produces, unchanged.
    """
    return load_spreadsheet_all_sheets(PEOPLE_XLSX)
32+
33+
34+
# Names of the people.xlsx sheets that list actual lab members or alumni.
# Any sheet not named here is skipped when collecting people.
PERSON_SHEETS = {
    'members',
    'alumni_postdocs',
    'alumni_grads',
    'alumni_managers',
    'alumni_undergrads',
}
39+
40+
41+
def get_all_people_names(sheets: Dict[str, List[Dict]]) -> Dict[str, Dict]:
    """Index every person listed in the person sheets of people.xlsx.

    Only sheets whose name is in ``PERSON_SHEETS`` are read. Rows whose
    ``name`` value is missing or blank after stripping are skipped. When two
    rows normalize to the same name, the one encountered later wins.

    Args:
        sheets: Mapping of sheet name -> list of row dicts.

    Returns:
        Dict mapping normalized name -> {sheet, name_original, data}
    """
    # Flatten to (sheet, row) pairs, restricted to the person sheets.
    person_rows = (
        (sheet_name, row)
        for sheet_name, rows in sheets.items()
        if sheet_name in PERSON_SHEETS
        for row in rows
    )

    indexed = {}
    for sheet_name, row in person_rows:
        original = row.get('name', '').strip()
        if original:
            key = normalize_name(original)
            indexed[key] = {
                'sheet': sheet_name,
                'name_original': original,
                'data': row,
            }
    return indexed
64+
65+
66+
def get_cv_names() -> Dict[str, Dict]:
    """Index every trainee parsed from JRM_CV.tex.

    If ``CV_TEX`` does not exist, an empty dict is returned. When two
    trainees normalize to the same name, the one parsed later wins.

    Returns:
        Dict mapping normalized name ->
        {category, is_active, name_original, trainee}
    """
    by_name = {}
    if CV_TEX.exists():
        # Only the trainee lists are needed; each trainee carries its own
        # category, so the grouping key is ignored.
        for group in parse_cv_trainees(CV_TEX).values():
            for trainee in group:
                key = normalize_name(trainee.name)
                by_name[key] = {
                    'category': trainee.category,
                    'is_active': trainee.is_active,
                    'name_original': trainee.name,
                    'trainee': trainee,
                }
    return by_name
86+
87+
88+
def get_lab_manual_names() -> Dict[str, Dict]:
    """Index every member parsed from lab_manual.tex.

    If ``LAB_MANUAL_TEX`` does not exist, an empty dict is returned.

    The same person may appear more than once (multi-role). The first record
    seen is stored; a later record replaces it only when the later one is
    active and the stored one is not.

    Returns:
        Dict mapping normalized name ->
        {role_category, is_active, name_original, record}
    """
    if not LAB_MANUAL_TEX.exists():
        return {}

    members = {}
    for rec in parse_members_chapter(LAB_MANUAL_TEX):
        key = normalize_name(rec['name'])
        stored = members.get(key)
        # Keep the stored entry unless this record upgrades an inactive
        # entry to an active one.
        if stored is not None and (not rec['is_active'] or stored['is_active']):
            continue
        members[key] = {
            'role_category': rec['role_category'],
            'is_active': rec['is_active'],
            'name_original': rec['name'],
            'record': rec,
        }
    return members
118+
119+
120+
def fuzzy_find(name: str, name_set: Set[str],
               threshold: Optional[float] = None) -> Optional[Tuple[str, float]]:
    """Find the best fuzzy match for a name in a set.

    Candidates are scored with ``difflib.SequenceMatcher(...).ratio()``.
    They are scanned in sorted order and only a strictly higher score
    replaces the current best, so when several candidates tie, the
    lexicographically smallest one is returned on every run (plain set
    iteration order for strings can differ between interpreter runs).

    Args:
        name: Normalized name to search for.
        name_set: Set of normalized names to search in.
        threshold: Minimum acceptable ratio. Defaults to the module-level
            ``FUZZY_THRESHOLD`` when omitted.

    Returns:
        Tuple of (matched_name, score) if score >= threshold, else None.
    """
    if threshold is None:
        threshold = FUZZY_THRESHOLD

    best_match = None
    best_score = 0.0
    for candidate in sorted(name_set):
        score = SequenceMatcher(None, name, candidate).ratio()
        if score > best_score:
            best_score = score
            best_match = candidate

    if best_score >= threshold and best_match:
        return (best_match, best_score)
    return None
140+
141+
142+
def find_match(name: str, target_names: Set[str]) -> Optional[Tuple[str, str]]:
    """Try to find a name in a set using exact, nickname, and fuzzy matching.

    The strategies are tried in that order and the first hit is returned.
    Nickname candidates are scanned in sorted order so that, when more than
    one target is nickname-compatible, the same one is chosen on every run.

    Args:
        name: Normalized name to look up.
        target_names: Set of normalized names to search in.

    Returns:
        Tuple of (matched_name, match_type) or None.
        match_type is 'exact', 'nickname', or 'fuzzy'.
    """
    # Exact match
    if name in target_names:
        return (name, 'exact')

    # Nickname match (names_match compares two names)
    for target in sorted(target_names):
        if names_match(name, target):
            return (target, 'nickname')

    # Fuzzy match
    fuzzy = fuzzy_find(name, target_names)
    if fuzzy:
        return (fuzzy[0], 'fuzzy')

    return None
166+
167+
168+
class Discrepancy:
    """A discrepancy found during reconciliation.

    Attributes:
        name: Display name of the person concerned.
        type: 'missing', 'conflict', or 'near_match' (set from ``disc_type``).
        present_in: List of source names in which the person was found.
        missing_from: List of source names in which the person was not found.
        details: Human-readable description of the discrepancy.
        resolution: 'auto_add', 'flag_for_review', or 'conflict'.
        confidence: Match confidence; defaults to 1.0.
    """

    def __init__(self, name: str, disc_type: str, present_in: List[str],
                 missing_from: List[str], details: str, resolution: str,
                 confidence: float = 1.0) -> None:
        self.name = name
        self.type = disc_type  # 'missing', 'conflict', 'near_match'
        self.present_in = present_in  # list of source names
        self.missing_from = missing_from  # list of source names
        self.details = details
        self.resolution = resolution  # 'auto_add', 'flag_for_review', 'conflict'
        self.confidence = confidence

    def __repr__(self) -> str:
        # Readable form for debugging and for failed-assertion output.
        return (
            f"{type(self).__name__}(name={self.name!r}, type={self.type!r}, "
            f"present_in={self.present_in!r}, missing_from={self.missing_from!r}, "
            f"details={self.details!r}, resolution={self.resolution!r}, "
            f"confidence={self.confidence!r})"
        )
180+
181+
182+
def _xlsx_entries_absent_from(xlsx_people: Dict[str, Dict],
                              target_people: Dict[str, Dict],
                              target_names: Set[str],
                              target_label: str) -> List[Discrepancy]:
    """Report people.xlsx entries with no exact counterpart in one target source."""
    found = []
    # Sorted so the report order is the same on every run.
    for name in sorted(xlsx_people):
        if name in target_names:
            continue
        original = xlsx_people[name]['name_original']
        match = find_match(name, target_names)
        if match is None:
            found.append(Discrepancy(
                name=original,
                disc_type='missing',
                present_in=['people.xlsx'],
                missing_from=[target_label],
                details=f"'{original}' is in people.xlsx ({xlsx_people[name]['sheet']}) but not in {target_label}",
                resolution='auto_add',
            ))
            continue
        matched, match_type = match
        if match_type != 'fuzzy':
            continue  # nickname match: treated as the same person, nothing to report
        target_original = target_people[matched]['name_original']
        found.append(Discrepancy(
            name=original,
            disc_type='near_match',
            present_in=['people.xlsx', f"{target_label} (as {target_original})"],
            missing_from=[],
            details=f"Fuzzy match: '{original}' ≈ '{target_original}'",
            resolution='flag_for_review',
            confidence=SequenceMatcher(None, name, matched).ratio(),
        ))
    return found


def _source_entries_absent_from_xlsx(source_people: Dict[str, Dict],
                                     source_names: Set[str],
                                     xlsx_people: Dict[str, Dict],
                                     source_label: str,
                                     category_key: str) -> List[Discrepancy]:
    """Flag entries of one secondary source with no exact counterpart in people.xlsx."""
    xlsx_names = set(xlsx_people)
    found = []
    # Sorted so the report order is the same on every run.
    for name in sorted(source_names):
        if name in xlsx_names:
            continue
        entry = source_people[name]
        original = entry['name_original']
        match = find_match(name, xlsx_names)
        if match is None:
            found.append(Discrepancy(
                name=original,
                disc_type='missing',
                present_in=[source_label],
                missing_from=['people.xlsx'],
                details=f"'{original}' is in {source_label} ({entry[category_key]}) but not in people.xlsx",
                resolution='flag_for_review',
            ))
            continue
        matched, match_type = match
        if match_type in ('exact', 'nickname'):
            continue  # Already matched
        found.append(Discrepancy(
            name=original,
            disc_type='near_match',
            present_in=[source_label],
            missing_from=['people.xlsx'],
            details=f"Fuzzy match: '{original}' ≈ '{xlsx_people[matched]['name_original']}'",
            resolution='flag_for_review',
            confidence=SequenceMatcher(None, name, matched).ratio(),
        ))
    return found


def reconcile(dry_run=False) -> List[Discrepancy]:
    """Run three-way reconciliation.

    Compares people.xlsx against the CV and the lab manual in four passes,
    reported in this order:

    1. people.xlsx entries not in the CV
    2. people.xlsx entries not in the lab manual
    3. lab-manual entries not in people.xlsx
    4. CV entries not in people.xlsx

    In passes 1 and 2 an unmatched name is reported as 'missing' with
    resolution 'auto_add'. In passes 3 and 4 it is 'missing' with resolution
    'flag_for_review'. In every pass a fuzzy-only match is reported as
    'near_match' / 'flag_for_review', and nickname matches are not reported.

    Args:
        dry_run: Accepted for interface compatibility. This function only
            reads the sources and does not currently consult this flag.

    Returns:
        List of Discrepancy objects.
    """
    xlsx_people = get_all_people_names(load_people_xlsx())
    cv_people = get_cv_names()
    lm_people = get_lab_manual_names()

    cv_names = set(cv_people)

    # Exclude PI from comparisons (PI is not in people.xlsx).
    lm_names_no_pi = {
        norm for norm, entry in lm_people.items()
        if entry['role_category'] != 'PI'
    }

    discrepancies = []
    # 1. People in people.xlsx but not in CV
    discrepancies += _xlsx_entries_absent_from(
        xlsx_people, cv_people, cv_names, 'CV')
    # 2. People in people.xlsx but not in lab-manual
    discrepancies += _xlsx_entries_absent_from(
        xlsx_people, lm_people, lm_names_no_pi, 'lab-manual')
    # 3. People in lab-manual but not in people.xlsx (FLAG)
    discrepancies += _source_entries_absent_from_xlsx(
        lm_people, lm_names_no_pi, xlsx_people, 'lab-manual', 'role_category')
    # 4. People in CV but not in people.xlsx (FLAG)
    discrepancies += _source_entries_absent_from_xlsx(
        cv_people, cv_names, xlsx_people, 'CV', 'category')
    return discrepancies
313+
314+
315+
def print_report(discrepancies: List[Discrepancy]) -> None:
    """Print a categorized reconciliation report to stdout.

    Discrepancies are grouped by their ``resolution`` ('auto_add',
    'flag_for_review', 'conflict'). A section is printed only for groups
    that are non-empty, and a short "in sync" line is printed when there
    are no discrepancies at all.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    # Bucket in one pass; entries with any other resolution still count
    # toward the total but are not listed in a section.
    buckets = {'auto_add': [], 'flag_for_review': [], 'conflict': []}
    for item in discrepancies:
        bucket = buckets.get(item.resolution)
        if bucket is not None:
            bucket.append(item)
    auto_resolved = buckets['auto_add']
    flagged = buckets['flag_for_review']
    conflicts = buckets['conflict']

    def section_header(title):
        print("\n" + light_rule)
        print(title)
        print(light_rule)

    print(heavy_rule)
    print("RECONCILIATION REPORT")
    print(heavy_rule)
    print(f"\nTotal discrepancies: {len(discrepancies)}")
    print(f" Auto-resolved: {len(auto_resolved)}")
    print(f" Flagged for review: {len(flagged)}")
    print(f" Conflicts: {len(conflicts)}")

    if auto_resolved:
        section_header("AUTO-RESOLVED (people.xlsx → other sources)")
        for item in auto_resolved:
            targets = ', '.join(item.missing_from)
            print(f" + {item.name}")
            print(f" Present in: {', '.join(item.present_in)}")
            print(f" Missing from: {targets}")
            print(f" Action: Auto-add to {targets}")

    if flagged:
        section_header("FLAGGED FOR REVIEW")
        for item in flagged:
            is_near = item.type == 'near_match'
            marker = "~" if is_near else "?"
            print(f" {marker} {item.name}")
            print(f" {item.details}")
            if is_near:
                print(f" Confidence: {item.confidence:.0%}")

    if conflicts:
        section_header("CONFLICTS REQUIRING MANUAL RESOLUTION")
        for item in conflicts:
            print(f" ! {item.name}")
            print(f" {item.details}")

    if not discrepancies:
        print("\nAll sources are in sync!")

    print("\n" + heavy_rule)
362+
363+
364+
def main():
    """Command-line entry point: run the reconciliation and print the report.

    Exits with status 1 when people.xlsx is missing, or when any discrepancy
    is flagged for review. A missing lab manual only produces a warning on
    stderr.
    """
    cli = argparse.ArgumentParser(
        description='Reconcile member/alumni data across people.xlsx, CV, and lab-manual.'
    )
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help='Report discrepancies without making changes.',
    )
    options = cli.parse_args()

    # Verify sources exist: the spreadsheet is mandatory, the lab manual is not.
    if not PEOPLE_XLSX.exists():
        print(f"ERROR: {PEOPLE_XLSX} not found", file=sys.stderr)
        sys.exit(1)

    if not LAB_MANUAL_TEX.exists():
        for line in (
            f"WARNING: {LAB_MANUAL_TEX} not found (submodule not initialized?)",
            "Run: git submodule update --init",
        ):
            print(line, file=sys.stderr)

    discrepancies = reconcile(dry_run=options.dry_run)
    print_report(discrepancies)

    # TODO: Apply auto-fixes in Phase 3 implementation
    if options.dry_run:
        footer = "\n(Dry run — no changes made)"
    else:
        footer = "\n(Report only — auto-fix not yet implemented)"
    print(footer)

    # Exit with non-zero if there are flagged items
    if any(d.resolution == 'flag_for_review' for d in discrepancies):
        sys.exit(1)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)