|
1 | 1 | from pathlib import Path |
2 | | -from typing import Any, Dict, List |
| 2 | +from typing import Any, Dict, List, Optional |
3 | 3 |
|
4 | 4 | from graphgen.models import ( |
5 | 5 | CSVReader, |
@@ -34,26 +34,49 @@ def _build_reader(suffix: str, cache_dir: str | None): |
34 | 34 | return _MAPPING[suffix]() |
35 | 35 |
|
36 | 36 |
|
37 | | -def read_files(file_path: str, cache_dir: str | None = None) -> list[dict]: |
38 | | - path = Path(file_path).expanduser() |
| 37 | +def read_files( |
| 38 | + input_file: str, |
| 39 | + allowed_suffix: Optional[List[str]] = None, |
| 40 | + cache_dir: Optional[str] = None, |
| 41 | +) -> list[dict]: |
| 42 | + path = Path(input_file).expanduser() |
39 | 43 | if not path.exists(): |
40 | | - raise FileNotFoundError(f"input_path not found: {file_path}") |
| 44 | + raise FileNotFoundError(f"input_path not found: {input_file}") |
41 | 45 |
|
| 46 | + if allowed_suffix is None: |
| 47 | + support_suffix = set(_MAPPING.keys()) |
| 48 | + else: |
| 49 | + support_suffix = {s.lower().lstrip(".") for s in allowed_suffix} |
| 50 | + |
| 51 | + # single file |
42 | 52 | if path.is_file(): |
43 | | - suffix = path.suffix.lstrip(".") |
| 53 | + suffix = path.suffix.lstrip(".").lower() |
| 54 | + if suffix not in support_suffix: |
| 55 | + logger.warning( |
| 56 | + "Skip file %s (suffix '%s' not in allowed_suffix %s)", |
| 57 | + path, |
| 58 | + suffix, |
| 59 | + support_suffix, |
| 60 | + ) |
| 61 | + return [] |
44 | 62 | reader = _build_reader(suffix, cache_dir) |
45 | 63 | return reader.read(str(path)) |
46 | 64 |
|
47 | | - support_suffix = set(_MAPPING.keys()) |
| 65 | + # folder |
48 | 66 | files_to_read = [ |
49 | 67 | p for p in path.rglob("*") if p.suffix.lstrip(".").lower() in support_suffix |
50 | 68 | ] |
51 | | - logger.info("Found %d file(s) under folder %s", len(files_to_read), file_path) |
| 69 | + logger.info( |
| 70 | + "Found %d eligible file(s) under folder %s (allowed_suffix=%s)", |
| 71 | + len(files_to_read), |
| 72 | + input_file, |
| 73 | + support_suffix, |
| 74 | + ) |
52 | 75 |
|
53 | 76 | all_docs: List[Dict[str, Any]] = [] |
54 | 77 | for p in files_to_read: |
55 | 78 | try: |
56 | | - suffix = p.suffix.lstrip(".") |
| 79 | + suffix = p.suffix.lstrip(".").lower() |
57 | 80 | reader = _build_reader(suffix, cache_dir) |
58 | 81 | all_docs.extend(reader.read(str(p))) |
59 | 82 | except Exception as e: # pylint: disable=broad-except |
|
0 commit comments