Skip to content

Commit 0278c1c

Browse files
authored
Merge pull request #107 from kermitt2/feature/handle-empty-documents
Allow empty JSON or Markdown
2 parents 5f4e2ce + 8cb7a61 commit 0278c1c

3 files changed

Lines changed: 228 additions & 1 deletion

File tree

grobid_client/grobid_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,7 @@ def process_batch(
608608
converter = TEI2MarkdownConverter()
609609
markdown_data = converter.convert_tei_file(filename)
610610

611-
if markdown_data:
611+
if markdown_data is not None:
612612
markdown_filename = filename.replace('.grobid.tei.xml', '.md')
613613
# Always write Markdown file when TEI is written (respects --force behavior)
614614
markdown_filename_expanded = os.path.expanduser(markdown_filename)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4+
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
5+
xmlns:xlink="http://www.w3.org/1999/xlink">
6+
<teiHeader>
7+
<fileDesc>
8+
<titleStmt>
9+
<title level="a" type="main"></title>
10+
</titleStmt>
11+
<publicationStmt>
12+
<publisher/>
13+
<availability status="unknown"><licence/></availability>
14+
</publicationStmt>
15+
<sourceDesc>
16+
<biblStruct>
17+
<analytic>
18+
</analytic>
19+
<monogr>
20+
<imprint>
21+
<date/>
22+
</imprint>
23+
</monogr>
24+
<idno type="MD5">BF4185EE81F98E32729702A8C45B889D</idno>
25+
</biblStruct>
26+
</sourceDesc>
27+
</fileDesc>
28+
<encodingDesc>
29+
<appInfo>
30+
<application version="0.8.2" ident="GROBID" when="2026-01-04T19:23+0000">
31+
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
32+
<label type="revision">a91ee48</label>
33+
<label type="parameters">startPage=-1, endPage=-1, consolidateCitations=0, consolidateHeader=0, consolidateFunders=0, includeRawAffiliations=false, includeRawCitations=false, includeRawCopyrights=false, generateTeiIds=false, generateTeiCoordinates=[], sentenceSegmentation=false, flavor=null</label>
34+
<ref target="https://github.com/kermitt2/grobid"/>
35+
</application>
36+
</appInfo>
37+
</encodingDesc>
38+
<profileDesc>
39+
<abstract/>
40+
</profileDesc>
41+
</teiHeader>
42+
<text>
43+
<body/>
44+
<back>
45+
<div type="references">
46+
47+
<listBibl/>
48+
</div>
49+
</back>
50+
</text>
51+
</TEI>

tests/test_conversions.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,4 +772,180 @@ def test_header_extraction_from_mjb_file(self):
772772
for section in expected_sections:
773773
assert section in sections_found, f"'{section}' should be in extracted sections"
774774

775+
def test_withdrawn_article_json_conversion(self):
    """Check JSON conversion of the withdrawn/empty article TEI fixture.

    The converter is expected to return a dictionary carrying the
    top-level ``biblio`` and ``body_text`` keys rather than ``None``.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Fixture shipped with the test resources; fail early if it is missing.
    tei_file = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    result = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    # A valid-but-empty TEI must not be reported as a failed conversion.
    assert result is not None, "Withdrawn/empty TEI should return non-None JSON result"
    assert isinstance(result, dict), "Result should be a dictionary"

    # The basic sections must be present in the converted document.
    for section, message in (
        ('biblio', "Should have biblio section"),
        ('body_text', "Should have body_text section"),
    ):
        assert section in result, message
795+
796+
def test_withdrawn_article_markdown_conversion(self):
    """Check Markdown conversion of the withdrawn/empty article TEI fixture.

    The converter may produce an empty string for such a document, but it
    must return a ``str`` and never ``None``.
    """
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    # Fixture shipped with the test resources; fail early if it is missing.
    tei_file = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    rendered = TEI2MarkdownConverter().convert_tei_file(tei_file)

    # Empty output is acceptable; a missing result is not.
    assert rendered is not None, "Withdrawn/empty TEI should return non-None Markdown result"
    assert isinstance(rendered, str), "Result should be a string"
813+
814+
def test_json_conversion_stream_mode_with_real_file(self):
    """Check streaming JSON conversion against a real TEI file.

    With ``stream=True`` the converter should hand back an iterable that
    yields at least one passage, each a dict with ``id`` and ``text`` keys.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Real TEI document from the test resources.
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    stream = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=True)
    assert stream is not None, "Streaming mode should return a generator"

    # Drain the stream so every passage can be inspected.
    collected = list(stream)
    assert len(collected) > 0, "Should extract at least one passage in streaming mode"

    # Structural check on every yielded passage.
    for entry in collected:
        assert isinstance(entry, dict), "Each passage should be a dictionary"
        assert 'id' in entry, "Passage should have an id"
        assert 'text' in entry, "Passage should have text"
841+
842+
def test_json_conversion_stream_mode_with_empty_tei(self):
    """Test JSON conversion in streaming mode with empty TEI content.

    A TEI document consisting only of an empty root element must still
    give back an iterator from the streaming API (not ``None``), and that
    iterator must yield no passages.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Minimal TEI: XML declaration plus an empty root element.
    empty_tei = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
        '</TEI>'
    )

    # delete=False keeps the file on disk after the handle is closed so the
    # converter can open it by path; it is removed in the finally block.
    # The encoding is explicit so the bytes match the XML declaration.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.tei.xml', delete=False, encoding='utf-8'
    ) as tei_file:
        tei_file.write(empty_tei)
        tei_path = tei_file.name

    try:
        converter = TEI2LossyJSONConverter()
        passages_generator = converter.convert_tei_file(tei_path, stream=True)

        # Should return an empty iterator for empty TEI, not None
        assert passages_generator is not None, "Streaming mode should return an iterator even for empty TEI"

        passages = list(passages_generator)

        # Checking isinstance(list(...), list) would always pass; assert the
        # actual expectation instead: nothing is yielded for an empty TEI.
        assert passages == [], f"Empty TEI should produce no passages, got {len(passages)}"
    finally:
        # Clean up temporary file
        os.unlink(tei_path)
872+
873+
def test_json_conversion_stream_mode_with_withdrawn_article(self):
    """Check streaming JSON conversion of the withdrawn/empty article fixture.

    The streaming API must return an iterable (never ``None``) that can be
    drained without error; the resulting list may legitimately be empty.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Fixture shipped with the test resources; fail early if it is missing.
    tei_file = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    stream = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=True)
    assert stream is not None, "Streaming mode should return an iterator for withdrawn article"

    # Draining the stream is the real check here: it must not raise.
    drained = list(stream)
    assert isinstance(drained, list), "Result should be convertible to list"
894+
895+
def test_json_conversion_stream_mode_validates_refs(self):
    """Check that reference offsets produced in streaming mode are valid.

    The converter is built with ``validate_refs=True``; every reference
    attached to a streamed passage must have in-bounds offsets, and the
    slice of the passage text at those offsets must equal the reference
    text.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # TEI document from the test resources that is expected to carry references.
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    ref_checking_converter = TEI2LossyJSONConverter(validate_refs=True)

    # Draining the stream should not raise assertion errors if refs are valid.
    streamed = list(ref_checking_converter.convert_tei_file(tei_file, stream=True))

    for passage in streamed:
        # Only passages with a non-empty 'refs' entry are inspected.
        if not ('refs' in passage and passage['refs']):
            continue

        passage_text = passage.get('text', '')
        for reference in passage['refs']:
            offset_start = reference.get('offset_start', -1)
            offset_end = reference.get('offset_end', -1)
            ref_text = reference.get('text', '')

            # Offsets must describe a non-empty span inside the passage text.
            assert 0 <= offset_start < offset_end <= len(passage_text), \
                f"Invalid ref offsets: {offset_start}-{offset_end} for text length {len(passage_text)}"

            # The span must reproduce the reference text exactly.
            actual_text = passage_text[offset_start:offset_end]
            assert actual_text == ref_text, \
                f"Ref text mismatch: expected '{ref_text}', got '{actual_text}'"
925+
926+
def test_json_conversion_stream_vs_non_stream_consistency(self):
    """Check that streaming and non-streaming conversions agree.

    The same converter instance processes the same TEI file in both
    modes; the passage count and each passage's text must match.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Real TEI document from the test resources.
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    converter = TEI2LossyJSONConverter()

    # Non-streaming pass first: passages live under the 'body_text' key.
    body_text_non_stream = converter.convert_tei_file(tei_file, stream=False).get('body_text', [])

    # Streaming pass second: materialize everything the iterator yields.
    body_text_stream = list(converter.convert_tei_file(tei_file, stream=True))

    # Both modes must report the same number of passages.
    assert len(body_text_non_stream) == len(body_text_stream), \
        f"Stream and non-stream should have same number of passages: {len(body_text_stream)} vs {len(body_text_non_stream)}"

    # Pairwise comparison of passage texts, in order.
    paired = zip(body_text_stream, body_text_non_stream)
    for i, (from_stream, from_document) in enumerate(paired):
        assert from_stream.get('text') == from_document.get('text'), \
            f"Passage {i} text mismatch between stream and non-stream modes"
775951

0 commit comments

Comments
 (0)