1+ from typing import Iterator , Any
2+
13try :
24 from defusedxml import ElementTree
35except ImportError as exc :
46 raise ImportError (
57 "Please install the xml_loader extra dependency to use the xml loader."
68 ) from exc
79from collections import namedtuple
8- from pystreamapi .loaders .__lazy_file_iterable import LazyFileIterable
910from pystreamapi .loaders .__loader_utils import LoaderUtils
1011
1112
12- class __XmlLoaderUtil :
13- """Utility class for the XML loader."""
14-
15- def __init__ (self ):
16- self .cast_types = True
17- self .retrieve_children = True
18-
19-
20- config = __XmlLoaderUtil ()
21-
22-
2313def xml (src : str , read_from_src = False , retrieve_children = True , cast_types = True ,
24- encoding = "utf-8" ) -> LazyFileIterable :
14+ encoding = "utf-8" ) -> Iterator [ Any ] :
2515 """
2616 Loads XML data from either a path or a string and converts it into a list of namedtuples.
2717 Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
2818 you trust.
2919
3020 Returns:
31- LazyFileIterable: A list of namedtuples, where each namedtuple represents an XML element.
21+ An iterator with namedtuples, where each namedtuple represents an XML element.
3222 :param retrieve_children: If true, the children of the root element are used as stream
3323 elements.
3424 :param encoding: The encoding of the XML file.
@@ -37,65 +27,76 @@ def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
3727 a path to an XML file.
3828 :param cast_types: Set as False to disable casting of values to int, bool or float.
3929 """
40- config .cast_types = cast_types
41- config .retrieve_children = retrieve_children
4230 if read_from_src :
43- return LazyFileIterable (lambda : __load_xml_string (src ))
31+ return _lazy_parse_xml_string (src , retrieve_children , cast_types )
32+
4433 path = LoaderUtils .validate_path (src )
45- return LazyFileIterable (lambda : __load_xml_file (path , encoding ))
34+ return _lazy_parse_xml_file (path , encoding , retrieve_children , cast_types )
35+
36+
def _lazy_parse_xml_file(file_path: str, encoding: str,
                         retrieve_children: bool, cast_types: bool) -> Iterator[Any]:
    """Lazily parse an XML file and yield the parsed namedtuples.

    This is a generator function: the file is not opened until the returned
    iterator is advanced for the first time.

    :param file_path: Path of the XML file to read.
    :param encoding: Text encoding used to open the file.
    :param retrieve_children: Forwarded to ``_parse_xml_string_lazy``.
    :param cast_types: Forwarded to ``_parse_xml_string_lazy``.
    :return: An iterator over the parsed elements; empty when the file is empty.
    """
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding=encoding) as xmlfile:
        xml_string = xmlfile.read()
    # The whole content is already in memory, so the handle is released here
    # instead of staying open for as long as the consumer keeps iterating.
    if not xml_string:
        # An empty file has no root element; handing "" to the parser would raise.
        return
    yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types)
4748
48- def __load_xml_file (file_path , encoding ):
49- """Load an XML file and convert it into a list of namedtuples."""
50- # skipcq: PTC-W6004
51- with open (file_path , mode = 'r' , encoding = encoding ) as xmlfile :
52- src = xmlfile .read ()
53- if src :
54- return __parse_xml_string (src )
55- return []
5649
def _lazy_parse_xml_string(xml_string: str, retrieve_children: bool,
                           cast_types: bool) -> Iterator[Any]:
    """Lazily parse an XML string and yield the parsed namedtuples.

    This is a generator function, so the string is not parsed until the
    returned iterator is advanced for the first time.

    :param xml_string: The XML document as text.
    :param retrieve_children: Forwarded to ``_parse_xml_string_lazy``.
    :param cast_types: Forwarded to ``_parse_xml_string_lazy``.
    :return: An iterator over the parsed elements.
    """
    yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types)
6158
6259
def _parse_xml_string_lazy(xml_string: str, retrieve_children: bool,
                           cast_types: bool) -> Iterator[Any]:
    """Parse an XML string and yield the resulting stream elements.

    :param xml_string: The XML document as text.
    :param retrieve_children: If true, the parsed children of the root element
        are yielded one by one; otherwise the parsed root is yielded as a
        single item.
    :param cast_types: Forwarded to ``__parse_xml``.
    :return: An iterator over the parsed elements.
    """
    root = ElementTree.fromstring(xml_string)
    if retrieve_children and len(root) == 0:
        # A childless root parses to a bare value (its text), not a namedtuple.
        # Flattening that value would raise for non-iterables or split a string
        # into characters, so a root without children yields no elements.
        return
    parsed = __parse_xml(root, cast_types)
    if retrieve_children:
        yield from __flatten(parsed)
    else:
        yield parsed
6869
6970
def __parse_xml(element, cast_types: bool):
    """Parse an XML element, dispatching on its number of direct children.

    :param element: The element to parse; ``len(element)`` is its child count.
    :param cast_types: Forwarded to the selected parsing helper.
    :return: Whatever the selected helper returns for this element.
    """
    # Compare the length explicitly rather than relying on element truthiness.
    child_count = len(element)
    if child_count == 0:
        return __parse_empty_element(element, cast_types)
    if child_count == 1:
        return __parse_single_element(element, cast_types)
    return __parse_multiple_elements(element, cast_types)
7778
7879
def __parse_empty_element(element, cast_types: bool):
    """Return the text of an XML element that has no children.

    :param element: The childless element whose ``text`` is read.
    :param cast_types: If true, the text is passed through
        ``LoaderUtils.try_cast``; otherwise it is returned unchanged.
    :return: The (optionally cast) element text.
    """
    if cast_types:
        return LoaderUtils.try_cast(element.text)
    return element.text
8283
8384
def __parse_single_element(element, cast_types: bool):
    """Parse an XML element with exactly one child into a one-field namedtuple.

    The namedtuple type is named after the element's tag and its single field
    after the child's tag; the field holds the recursively parsed child.

    :param element: The element to parse; only its first child is read.
    :param cast_types: Forwarded to ``__parse_xml`` for the child.
    :return: A namedtuple instance wrapping the parsed child.
    """
    sub_element = element[0]
    sub_item = __parse_xml(sub_element, cast_types)
    Item = namedtuple(element.tag, [sub_element.tag])
    return Item(sub_item)
9091
9192
def __parse_multiple_elements(element, cast_types: bool):
    """Parse an XML element with several children into a namedtuple.

    Children are parsed recursively and grouped by tag, preserving the order
    in which each tag first appears. The grouped mapping is passed through
    ``__filter_single_items`` before its keys become the namedtuple fields and
    its values the field contents.

    :param element: The element whose direct children are parsed.
    :param cast_types: Forwarded to ``__parse_xml`` for every child.
    :return: A namedtuple instance named after the element's tag.
    """
    tag_dict = {}
    for child in element:
        # Group repeated tags into one list per tag.
        tag_dict.setdefault(child.tag, []).append(__parse_xml(child, cast_types))
    # NOTE(review): __filter_single_items is not visible here; presumably it
    # unwraps one-element lists - confirm against its definition.
    filtered_dict = __filter_single_items(tag_dict)
    Item = namedtuple(element.tag, filtered_dict.keys())
    return Item(*filtered_dict.values())
@@ -107,11 +108,9 @@ def __filter_single_items(tag_dict):
107108
108109
def __flatten(data):
    """Yield the items of ``data``, expanding nested lists by one level.

    Only ``list`` items are expanded; every other item, including tuples, is
    yielded as is. Lists nested deeper than one level are not expanded further.

    :param data: An iterable of items, some of which may be lists.
    :return: A generator over the flattened items.
    """
    for item in data:
        if isinstance(item, list):
            yield from item
        else:
            yield item
0 commit comments