Skip to content

Commit 14a0ccd

Browse files
authored
Merge pull request #103 from pickwicksoft/bugfix/#95/loading-big-data-files-not-safe
🐛 Refactor data loaders to be lazy and use generators to prevent memory problems
2 parents 15083ad + 8575eeb commit 14a0ccd

13 files changed

Lines changed: 1384 additions & 894 deletions

File tree

.github/workflows/build.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@ jobs:
2121
run: pip install tox
2222
- name: Run tox
2323
run: tox -e py
24-
- name: SonarCloud Scan
25-
uses: SonarSource/sonarcloud-github-action@master
24+
- name: SonarQube Scan
25+
uses: SonarSource/sonarqube-scan-action@299e4b793aaa83bf2aba7c9c14bedbb485688ec4
2626
env:
27-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
2827
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}

.github/workflows/pylint.yml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@ jobs:
77
runs-on: ubuntu-latest
88
strategy:
99
matrix:
10-
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
10+
python-version: [ "3.10", "3.11", "3.12" ]
1111
steps:
1212
- uses: actions/checkout@v4
13-
- name: Set up Python ${{ matrix.python-version }}
14-
uses: actions/setup-python@v4
15-
with:
16-
python-version: ${{ matrix.python-version }}
17-
- name: Install dependencies
18-
run: |
19-
python -m pip install --upgrade pip
20-
pip install pylint
21-
- name: Analysing the code with pylint
22-
run: |
23-
pylint $(git ls-files '*.py')
13+
- name: Set up Python ${{ matrix.python-version }}
14+
uses: actions/setup-python@v4
15+
with:
16+
python-version: ${{ matrix.python-version }}
17+
- name: Install dependencies
18+
run: |
19+
python -m pip install --upgrade pip
20+
pip install pylint
21+
- name: Analysing the code with pylint
22+
run: |
23+
pylint $(git ls-files '*.py')

.github/workflows/unittests.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@ jobs:
1717
test:
1818
runs-on: ubuntu-latest
1919
steps:
20-
- uses: actions/checkout@v4
20+
- uses: actions/checkout@v6
2121

2222
# If you wanted to use multiple Python versions, you'd have to specify a matrix in the job and
2323
# reference the matrix Python version here.
24-
- uses: actions/setup-python@v5
24+
- uses: actions/setup-python@v6
2525
with:
26-
python-version: 3.9
26+
python-version: '3.10'
2727

2828
# Cache the installation of Poetry itself, e.g. the next step. This prevents the workflow
2929
# from installing Poetry every time, which can be slow. Note the use of the Poetry version
@@ -45,7 +45,7 @@ jobs:
4545
# The key configuration value here is `virtualenvs-in-project: true`: this creates the
4646
# venv as a `.venv` in your testing directory, which allows the next step to easily
4747
# cache it.
48-
- uses: snok/install-poetry@v1
48+
- uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a
4949
with:
5050
version: 2.1.0
5151
virtualenvs-create: true

poetry.lock

Lines changed: 927 additions & 609 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pystreamapi/loaders/__csv/__csv_loader.py

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,62 @@
11
from collections import namedtuple
22
from csv import reader
3+
from io import StringIO
4+
from typing import Any, Iterator
35

46
from pystreamapi.loaders.__loader_utils import LoaderUtils
5-
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
67

78

8-
def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> LazyFileIterable:
9+
def csv(
10+
src: str, read_from_src=False, cast_types=True, delimiter=',', encoding="utf-8"
11+
) -> Iterator[Any]:
912
"""
10-
Loads a CSV file and converts it into a list of namedtuples.
11-
12-
Returns:
13-
list: A list of namedtuples, where each namedtuple represents a row in the CSV.
14-
:param cast_types: Set as False to disable casting of values to int, bool or float.
15-
:param encoding: The encoding of the CSV file.
16-
:param file_path: The path to the CSV file.
17-
:param delimiter: The delimiter used in the CSV file.
13+
Lazily loads CSV data from either a path or a string and yields namedtuples.
14+
15+
Args:
16+
src (str): Either the path to a CSV file or a CSV string.
17+
read_from_src (bool): If True, src is treated as a CSV string.
18+
If False, src is treated as a path to a CSV file.
19+
cast_types (bool): Set as False to disable casting of values to int, bool or float.
20+
delimiter (str): The delimiter used in the CSV data.
21+
encoding (str): The encoding of the CSV file (only used when reading from file).
22+
23+
Yields:
24+
namedtuple: Each row in the CSV as a namedtuple.
1825
"""
19-
file_path = LoaderUtils.validate_path(file_path)
20-
return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))
26+
if not read_from_src:
27+
src = LoaderUtils.validate_path(src)
28+
return __load_csv_from_file(src, cast_types, delimiter, encoding)
29+
return __load_csv_from_string(src, cast_types, delimiter)
2130

2231

23-
def __load_csv(file_path, cast, delimiter, encoding):
24-
"""Load a CSV file and convert it into a list of namedtuples"""
32+
def __load_csv_from_file(file_path, cast, delimiter, encoding):
33+
"""Load a CSV file and convert it into a generator of namedtuples"""
2534
# skipcq: PTC-W6004
2635
with open(file_path, mode='r', newline='', encoding=encoding) as csvfile:
27-
csvreader = reader(csvfile, delimiter=delimiter)
36+
yield from __process_csv(csvfile, cast, delimiter)
37+
38+
39+
def __load_csv_from_string(csv_string, cast, delimiter):
40+
"""Load a CSV from string and convert it into a generator of namedtuples"""
41+
with StringIO(csv_string) as csvfile:
42+
yield from __process_csv(csvfile, cast, delimiter)
43+
2844

29-
# Create a namedtuple type, casting the header values to int or float if possible
30-
header = __get_csv_header(csvreader)
45+
def __process_csv(csvfile, cast, delimiter):
46+
"""Process CSV data and yield namedtuples"""
47+
csvreader = reader(csvfile, delimiter=delimiter)
3148

32-
Row = namedtuple('Row', list(header))
49+
# Create a namedtuple type, casting the header values to int or float if possible
50+
header = __get_csv_header(csvreader)
51+
if not header:
52+
return
3353

34-
mapper = LoaderUtils.try_cast if cast else lambda x: x
54+
Row = namedtuple('Row', list(header))
55+
mapper = LoaderUtils.try_cast if cast else lambda x: x
3556

36-
# Process the data, casting values to int or float if possible
37-
data = [Row(*[mapper(value) for value in row]) for row in csvreader]
38-
return data
57+
# Yield the data row by row, casting values to int or float if possible
58+
for row in csvreader:
59+
yield Row(*[mapper(value) for value in row])
3960

4061

4162
def __get_csv_header(csvreader):

pystreamapi/loaders/__json/__json_loader.py

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,61 @@
11
import json as jsonlib
22
from collections import namedtuple
3+
from typing import Any, Iterator
34

4-
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
55
from pystreamapi.loaders.__loader_utils import LoaderUtils
66

77

8-
def json(src: str, read_from_src=False) -> LazyFileIterable:
8+
def json(src: str, read_from_src=False) -> Iterator[Any]:
99
"""
10-
Loads JSON data from either a path or a string and converts it into a list of namedtuples.
10+
Lazily loads JSON data from either a path or a string and yields namedtuples.
1111
12-
Returns:
13-
list: A list of namedtuples, where each namedtuple represents an object in the JSON.
14-
:param src: Either the path to a JSON file or a JSON string.
15-
:param read_from_src: If True, src is treated as a JSON string. If False, src is treated as
16-
a path to a JSON file.
12+
Args:
13+
src (str): Either the path to a JSON file or a JSON string.
14+
read_from_src (bool): If True, src is treated as a JSON string.
15+
If False, src is treated as a path to a JSON file.
16+
17+
Yields:
18+
namedtuple: Each object in the JSON as a namedtuple.
1719
"""
1820
if read_from_src:
19-
return LazyFileIterable(lambda: __load_json_string(src))
21+
return __lazy_load_json_string(src)
2022
path = LoaderUtils.validate_path(src)
21-
return LazyFileIterable(lambda: __load_json_file(path))
23+
return __lazy_load_json_file(path)
24+
25+
26+
def __lazy_load_json_file(file_path: str) -> Iterator[Any]:
27+
"""Lazily read and parse a JSON file, yielding namedtuples."""
28+
29+
def generator():
30+
"""Generate namedtuples from the JSON file contents."""
31+
# skipcq: PTC-W6004
32+
with open(file_path, mode='r', encoding='utf-8') as jsonfile:
33+
src = jsonfile.read()
34+
if not src.strip():
35+
return
36+
result = jsonlib.loads(src, object_hook=__dict_to_namedtuple)
37+
if isinstance(result, list):
38+
yield from result
39+
else:
40+
yield result
41+
42+
return generator()
2243

2344

24-
def __load_json_file(file_path):
25-
"""Load a JSON file and convert it into a list of namedtuples"""
26-
# skipcq: PTC-W6004
27-
with open(file_path, mode='r', encoding='utf-8') as jsonfile:
28-
src = jsonfile.read()
29-
if src == '':
30-
return []
31-
data = jsonlib.loads(src, object_hook=__dict_to_namedtuple)
32-
return data
45+
def __lazy_load_json_string(json_string: str) -> Iterator[Any]:
46+
"""Lazily parse a JSON string, yielding namedtuples."""
3347

48+
def generator():
49+
"""Internal generator that yields namedtuples by parsing the JSON string on demand."""
50+
if not json_string.strip():
51+
return
52+
result = jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
53+
if isinstance(result, list):
54+
yield from result
55+
else:
56+
yield result
3457

35-
def __load_json_string(json_string):
36-
"""Load JSON data from a string and convert it into a list of namedtuples"""
37-
return jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
58+
return generator()
3859

3960

4061
def __dict_to_namedtuple(d, name='Item'):
Lines changed: 47 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,24 @@
1+
from typing import Iterator, Any
2+
13
try:
24
from defusedxml import ElementTree
35
except ImportError as exc:
46
raise ImportError(
57
"Please install the xml_loader extra dependency to use the xml loader."
68
) from exc
79
from collections import namedtuple
8-
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
910
from pystreamapi.loaders.__loader_utils import LoaderUtils
1011

1112

12-
class __XmlLoaderUtil:
13-
"""Utility class for the XML loader."""
14-
15-
def __init__(self):
16-
self.cast_types = True
17-
self.retrieve_children = True
18-
19-
20-
config = __XmlLoaderUtil()
21-
22-
2313
def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
24-
encoding="utf-8") -> LazyFileIterable:
14+
encoding="utf-8") -> Iterator[Any]:
2515
"""
2616
Lazily loads XML data from either a path or a string and yields namedtuples.
2717
Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
2818
you trust.
2919
3020
Returns:
31-
LazyFileIterable: A list of namedtuples, where each namedtuple represents an XML element.
21+
An iterator with namedtuples, where each namedtuple represents an XML element.
3222
:param retrieve_children: If true, the children of the root element are used as stream
3323
elements.
3424
:param encoding: The encoding of the XML file.
@@ -37,65 +27,76 @@ def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
3727
a path to an XML file.
3828
:param cast_types: Set as False to disable casting of values to int, bool or float.
3929
"""
40-
config.cast_types = cast_types
41-
config.retrieve_children = retrieve_children
4230
if read_from_src:
43-
return LazyFileIterable(lambda: __load_xml_string(src))
31+
return _lazy_parse_xml_string(src, retrieve_children, cast_types)
32+
4433
path = LoaderUtils.validate_path(src)
45-
return LazyFileIterable(lambda: __load_xml_file(path, encoding))
34+
return _lazy_parse_xml_file(path, encoding, retrieve_children, cast_types)
35+
36+
37+
def _lazy_parse_xml_file(file_path: str, encoding: str,
38+
retrieve_children: bool, cast_types: bool) -> Iterator[Any]:
39+
"""Lazily parse an XML file by reading its content and yielding parsed namedtuples."""
40+
def generator():
41+
"""Generator that reads the XML file and yields parsed namedtuples lazily."""
42+
# skipcq: PTC-W6004
43+
with open(file_path, mode='r', encoding=encoding) as xmlfile:
44+
xml_string = xmlfile.read()
45+
yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types)
4646

47+
return generator()
4748

48-
def __load_xml_file(file_path, encoding):
49-
"""Load an XML file and convert it into a list of namedtuples."""
50-
# skipcq: PTC-W6004
51-
with open(file_path, mode='r', encoding=encoding) as xmlfile:
52-
src = xmlfile.read()
53-
if src:
54-
return __parse_xml_string(src)
55-
return []
5649

50+
def _lazy_parse_xml_string(xml_string: str, retrieve_children: bool,
51+
cast_types: bool) -> Iterator[Any]:
52+
"""Lazily parse an XML string by yielding parsed namedtuples for each element."""
53+
def generator():
54+
"""Generator that yields parsed namedtuples from the XML string lazily."""
55+
yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types)
5756

58-
def __load_xml_string(xml_string):
59-
"""Load XML data from a string and convert it into a list of namedtuples."""
60-
return __parse_xml_string(xml_string)
57+
return generator()
6158

6259

63-
def __parse_xml_string(xml_string):
64-
"""Parse XML string and convert it into a list of namedtuples."""
60+
def _parse_xml_string_lazy(xml_string: str, retrieve_children: bool,
61+
cast_types: bool) -> Iterator[Any]:
62+
"""Parse an XML string into namedtuples, optionally yielding child elements lazily."""
6563
root = ElementTree.fromstring(xml_string)
66-
parsed_xml = __parse_xml(root)
67-
return __flatten(parsed_xml) if config.retrieve_children else [parsed_xml]
64+
parsed = __parse_xml(root, cast_types)
65+
if retrieve_children:
66+
yield from __flatten(parsed)
67+
else:
68+
yield parsed
6869

6970

70-
def __parse_xml(element):
71+
def __parse_xml(element, cast_types: bool):
7172
"""Parse XML element and convert it into a namedtuple."""
7273
if len(element) == 0:
73-
return __parse_empty_element(element)
74+
return __parse_empty_element(element, cast_types)
7475
if len(element) == 1:
75-
return __parse_single_element(element)
76-
return __parse_multiple_elements(element)
76+
return __parse_single_element(element, cast_types)
77+
return __parse_multiple_elements(element, cast_types)
7778

7879

79-
def __parse_empty_element(element):
80+
def __parse_empty_element(element, cast_types: bool):
8081
"""Parse XML element without children and convert it into a namedtuple."""
81-
return LoaderUtils.try_cast(element.text) if config.cast_types else element.text
82+
return LoaderUtils.try_cast(element.text) if cast_types else element.text
8283

8384

84-
def __parse_single_element(element):
85+
def __parse_single_element(element, cast_types: bool):
8586
"""Parse XML element with a single child and convert it into a namedtuple."""
8687
sub_element = element[0]
87-
sub_item = __parse_xml(sub_element)
88+
sub_item = __parse_xml(sub_element, cast_types)
8889
Item = namedtuple(element.tag, [sub_element.tag])
8990
return Item(sub_item)
9091

9192

92-
def __parse_multiple_elements(element):
93+
def __parse_multiple_elements(element, cast_types: bool):
9394
"""Parse XML element with multiple children and convert it into a namedtuple."""
9495
tag_dict = {}
9596
for e in element:
9697
if e.tag not in tag_dict:
9798
tag_dict[e.tag] = []
98-
tag_dict[e.tag].append(__parse_xml(e))
99+
tag_dict[e.tag].append(__parse_xml(e, cast_types))
99100
filtered_dict = __filter_single_items(tag_dict)
100101
Item = namedtuple(element.tag, filtered_dict.keys())
101102
return Item(*filtered_dict.values())
@@ -107,11 +108,9 @@ def __filter_single_items(tag_dict):
107108

108109

109110
def __flatten(data):
110-
"""Flatten a list of lists."""
111-
res = []
111+
"""Yield flattened elements from a possibly nested structure."""
112112
for item in data:
113113
if isinstance(item, list):
114-
res.extend(item)
114+
yield from item
115115
else:
116-
res.append(item)
117-
return res
116+
yield item

0 commit comments

Comments
 (0)