Skip to content

Commit b52e1fd

Browse files
committed
feat: add filenameTokenizer file
1 parent 3f2b10f commit b52e1fd

5 files changed

Lines changed: 788 additions & 10 deletions

File tree

src/components/User/Dashboard/DatasetOrganizer/LLMPanel.tsx

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -670,7 +670,8 @@ const LLMPanel: React.FC<LLMPanelProps> = ({
670670
userContext,
671671
subjectInfo,
672672
evidenceBundle?.counts_by_ext || {},
673-
sampleFiles
673+
sampleFiles,
674+
evidenceBundle
674675
);
675676

676677
try {
@@ -809,6 +810,10 @@ const LLMPanel: React.FC<LLMPanelProps> = ({
809810
"_staging/evidence_bundle.json",
810811
JSON.stringify(evidenceBundle, null, 2)
811812
);
813+
zip.file(
814+
"_staging/subject_analysis.json",
815+
JSON.stringify(evidenceBundle.subject_analysis, null, 2)
816+
);
812817
// trio files (get content from the AI-generated FileItems)
813818
const dd = files.find(
814819
(f) => f.source === "ai" && f.name === "dataset_description.json"

src/components/User/Dashboard/DatasetOrganizer/utils/fileAnalyzers.ts

Lines changed: 195 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -149,3 +149,198 @@ export const analyzeFilenamePatterns = (
149149
hasTaskNames,
150150
};
151151
};
152+
153+
// add to fileAnalyzers.ts
154+
155+
// export interface SubjectRecord {
156+
// original_id: string;
157+
// numeric_id: string;
158+
// site: string | null;
159+
// pattern_name: string;
160+
// file_count: number;
161+
// }
162+
163+
// export interface SubjectAnalysis {
164+
// success: boolean;
165+
// method: string;
166+
// subject_records: SubjectRecord[];
167+
// subject_count: number;
168+
// has_site_info: boolean;
169+
// variants_by_subject: Record<string, any>;
170+
// python_generated_filename_rules: any[];
171+
// id_mapping: {
172+
// id_mapping: Record<string, string>;
173+
// reverse_mapping: Record<string, string>;
174+
// strategy_used: string;
175+
// metadata_columns: string[];
176+
// };
177+
// }
178+
179+
// // mirrors _extract_subjects_from_directory_structure
180+
// const extractFromDirectoryStructure = (
181+
// allFiles: string[]
182+
// ): Omit<SubjectAnalysis, "id_mapping"> | null => {
183+
// const patterns: Array<[RegExp, boolean, number, number | null, string]> = [
184+
// [/^([A-Za-z]+)_sub(\d+)$/i, true, 2, 1, "site_prefixed"],
185+
// [/^sub-(\d+)$/i, false, 1, null, "standard_bids"],
186+
// [/^subject[_-]?(\d+)$/i, false, 1, null, "simple"],
187+
// [/^(\d{3,})$/, false, 1, null, "numeric_only"],
188+
// ];
189+
190+
// const subjectRecords: SubjectRecord[] = [];
191+
// const seenIds = new Set<string>();
192+
193+
// for (const filepath of allFiles) {
194+
// const parts = filepath.split("/");
195+
// for (const part of parts.slice(0, 2)) {
196+
// for (const [
197+
// regex,
198+
// hasSite,
199+
// idGroup,
200+
// siteGroup,
201+
// patternName,
202+
// ] of patterns) {
203+
// const match = part.match(regex);
204+
// if (match) {
205+
// const originalId = match[0];
206+
// if (seenIds.has(originalId)) break;
207+
// seenIds.add(originalId);
208+
// subjectRecords.push({
209+
// original_id: originalId,
210+
// numeric_id: match[idGroup],
211+
// site: hasSite && siteGroup ? match[siteGroup] : null,
212+
// pattern_name: patternName,
213+
// file_count: 0,
214+
// });
215+
// break;
216+
// }
217+
// }
218+
// }
219+
// }
220+
221+
// if (subjectRecords.length === 0) return null;
222+
223+
// subjectRecords.sort((a, b) => {
224+
// const na = parseInt(a.numeric_id) || 0;
225+
// const nb = parseInt(b.numeric_id) || 0;
226+
// return na - nb;
227+
// });
228+
229+
// return {
230+
// success: true,
231+
// method: "directory_structure",
232+
// subject_records: subjectRecords,
233+
// subject_count: subjectRecords.length,
234+
// has_site_info: subjectRecords.some((r) => r.site !== null),
235+
// variants_by_subject: {},
236+
// python_generated_filename_rules: [],
237+
// };
238+
// };
239+
240+
// // mirrors _extract_subjects_from_flat_filenames
241+
// const extractFromFlatFilenames = (
242+
// allFiles: string[]
243+
// ): Omit<SubjectAnalysis, "id_mapping"> | null => {
244+
// const identifierToFiles: Record<string, string[]> = {};
245+
246+
// for (const filepath of allFiles) {
247+
// const filename = filepath.split("/").pop() || "";
248+
// const nameNoExt = filename
249+
// .replace(/\.[^/.]+$/, "")
250+
// .replace(/\.nii\.gz$/, "");
251+
// const match = nameNoExt.match(/^([A-Za-z0-9\-]+)/);
252+
// if (match) {
253+
// const identifier = match[1];
254+
// if (!identifierToFiles[identifier]) identifierToFiles[identifier] = [];
255+
// identifierToFiles[identifier].push(filepath);
256+
// }
257+
// }
258+
259+
// if (Object.keys(identifierToFiles).length === 0) return null;
260+
261+
// const extractNumeric = (id: string): number => {
262+
// const nums = id.match(/\d+/g);
263+
// return nums ? parseInt(nums[nums.length - 1]) : 999999;
264+
// };
265+
266+
// const sortedIdentifiers = Object.keys(identifierToFiles).sort(
267+
// (a, b) => extractNumeric(a) - extractNumeric(b)
268+
// );
269+
270+
// const subjectRecords: SubjectRecord[] = sortedIdentifiers.map((id, i) => ({
271+
// original_id: id,
272+
// numeric_id: String(i + 1),
273+
// site: null,
274+
// pattern_name: "dominant_prefix",
275+
// file_count: identifierToFiles[id].length,
276+
// }));
277+
278+
// return {
279+
// success: true,
280+
// method: "dominant_prefix_fallback",
281+
// subject_records: subjectRecords,
282+
// subject_count: subjectRecords.length,
283+
// has_site_info: false,
284+
// variants_by_subject: {},
285+
// python_generated_filename_rules: [],
286+
// };
287+
// };
288+
289+
// // mirrors _generate_subject_id_mapping
290+
// const generateIdMapping = (
291+
// subjectInfo: Omit<SubjectAnalysis, "id_mapping">
292+
// ): SubjectAnalysis["id_mapping"] => {
293+
// const records = subjectInfo.subject_records;
294+
// const idMapping: Record<string, string> = {};
295+
// const reverseMapping: Record<string, string> = {};
296+
297+
// // detect already-BIDS format (sub-01, sub-02...)
298+
// const allAlreadyBids = records.every((r) => /^sub-\w+$/i.test(r.original_id));
299+
300+
// if (allAlreadyBids) {
301+
// for (const rec of records) {
302+
// const bidsId = rec.original_id.replace(/^sub-/i, "");
303+
// idMapping[rec.original_id] = bidsId;
304+
// reverseMapping[bidsId] = rec.original_id;
305+
// }
306+
// return {
307+
// id_mapping: idMapping,
308+
// reverse_mapping: reverseMapping,
309+
// strategy_used: "already_bids",
310+
// metadata_columns: [],
311+
// };
312+
// }
313+
314+
// // numeric strategy
315+
// for (let i = 0; i < records.length; i++) {
316+
// const orig = records[i].original_id;
317+
// const bidsId = String(i + 1);
318+
// idMapping[orig] = bidsId;
319+
// reverseMapping[bidsId] = orig;
320+
// }
321+
322+
// return {
323+
// id_mapping: idMapping,
324+
// reverse_mapping: reverseMapping,
325+
// strategy_used: "numeric",
326+
// metadata_columns: ["original_id"],
327+
// };
328+
// };
329+
330+
// // main export — call this from llmHelpers
331+
// export const extractSubjectAnalysis = (allFiles: string[]): SubjectAnalysis => {
332+
// const fromDir = extractFromDirectoryStructure(allFiles);
333+
// const base = fromDir ??
334+
// extractFromFlatFilenames(allFiles) ?? {
335+
// success: false,
336+
// method: "none",
337+
// subject_records: [],
338+
// subject_count: 0,
339+
// has_site_info: false,
340+
// variants_by_subject: {},
341+
// python_generated_filename_rules: [],
342+
// };
343+
344+
// const idMapping = generateIdMapping(base);
345+
// return { ...base, id_mapping: idMapping };
346+
// };

0 commit comments

Comments
 (0)