Skip to content

Commit eb8b418

Browse files
committed
feat: Update TEI2LossyJSON conversion script and add new TEI and JSON test resources.
1 parent 4ebff29 commit eb8b418

3 files changed

Lines changed: 1109 additions & 0 deletions

File tree

grobid_client/format/TEI2LossyJSON.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -669,9 +669,27 @@ def _iter_passages_from_soup_for_text(self, text_node: Tag, passage_level: str)
669669

670670
div_type = div.get("type")
671671

672+
# Check if this is a header-only div (no content, no nested divs)
673+
# If so, capture its header as context for subsequent divs
674+
head = div.find("head")
675+
direct_p_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "p"]
676+
direct_formula_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "formula"]
677+
nested_divs = [c for c in div.children if hasattr(c, 'name') and (c.name == "div" or (c.name and c.name.endswith(":div")))]
678+
has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
679+
680+
if head and not has_direct_content and len(nested_divs) == 0:
681+
# This is a header-only div with no nested content
682+
# Capture the header for the next div
683+
head_paragraph = self._clean_text(head.get_text())
684+
continue # Skip to next div, the header will be used by subsequent sibling
685+
672686
# Process this div and potentially nested divs
673687
for passage in self._process_div_with_nested_content(div, passage_level, head_paragraph):
674688
yield passage
689+
690+
# Reset head_paragraph after it's been used by a content-bearing div
691+
head_paragraph = None
692+
675693

676694
def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_paragraph: str = None) -> Iterator[Dict[str, Union[str, Dict[str, str]]]]:
677695
"""

0 commit comments

Comments
 (0)