Merge pull request #107 from kermitt2/feature/handle-empty-documents

lfoppiano · web-flow · commit 0278c1c8b3c6 · 2026-01-04T20:08:03.000Z
Allow empty JSON or Markdown
diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py
@@ -608,7 +608,7 @@ def process_batch(
                             converter = TEI2MarkdownConverter()
                             markdown_data = converter.convert_tei_file(filename)
 
-                            if markdown_data:
+                            if markdown_data is not None:
                                 markdown_filename = filename.replace('.grobid.tei.xml', '.md')
                                 # Always write Markdown file when TEI is written (respects --force behavior)
                                 markdown_filename_expanded = os.path.expanduser(markdown_filename)
diff --git a/tests/resources/article_withdrawn.grobid.tei.xml b/tests/resources/article_withdrawn.grobid.tei.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+	<teiHeader>
+		<fileDesc>
+			<titleStmt>
+				<title level="a" type="main"></title>
+			</titleStmt>
+			<publicationStmt>
+				<publisher/>
+				<availability status="unknown"><licence/></availability>
+			</publicationStmt>
+			<sourceDesc>
+				<biblStruct>
+					<analytic>
+					</analytic>
+					<monogr>
+						<imprint>
+							<date/>
+						</imprint>
+					</monogr>
+					<idno type="MD5">BF4185EE81F98E32729702A8C45B889D</idno>
+				</biblStruct>
+			</sourceDesc>
+		</fileDesc>
+		<encodingDesc>
+			<appInfo>
+				<application version="0.8.2" ident="GROBID" when="2026-01-04T19:23+0000">
+					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+					<label type="revision">a91ee48</label>
+					<label type="parameters">startPage=-1, endPage=-1, consolidateCitations=0, consolidateHeader=0, consolidateFunders=0, includeRawAffiliations=false, includeRawCitations=false, includeRawCopyrights=false, generateTeiIds=false, generateTeiCoordinates=[], sentenceSegmentation=false, flavor=null</label>
+					<ref target="https://github.com/kermitt2/grobid"/>
+				</application>
+			</appInfo>
+		</encodingDesc>
+		<profileDesc>
+			<abstract/>
+		</profileDesc>
+	</teiHeader>
+	<text>
+		<body/>
+		<back>
+			<div type="references">
+
+				<listBibl/>
+			</div>
+		</back>
+	</text>
+</TEI>
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -772,4 +772,180 @@ def test_header_extraction_from_mjb_file(self):
         for section in expected_sections:
             assert section in sections_found, f"'{section}' should be in extracted sections"
 
+    def test_withdrawn_article_json_conversion(self):
+        """Test JSON conversion for a withdrawn/empty article TEI file."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Use the withdrawn article TEI file from test resources
+        tei_file = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
+
+        # Verify the test TEI file exists
+        assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"
+
+        converter = TEI2LossyJSONConverter()
+        json_data = converter.convert_tei_file(tei_file, stream=False)
+
+        # The converter should return a non-None result (not fail) for valid but empty TEI
+        assert json_data is not None, "Withdrawn/empty TEI should return non-None JSON result"
+        assert isinstance(json_data, dict), "Result should be a dictionary"
+
+        # Check basic structure is present
+        assert 'biblio' in json_data, "Should have biblio section"
+        assert 'body_text' in json_data, "Should have body_text section"
+
+    def test_withdrawn_article_markdown_conversion(self):
+        """Test Markdown conversion for a withdrawn/empty article TEI file."""
+        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
+
+        # Use the withdrawn article TEI file from test resources
+        tei_file = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
+
+        # Verify the test TEI file exists
+        assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"
+
+        converter = TEI2MarkdownConverter()
+        markdown_data = converter.convert_tei_file(tei_file)
+
+        # The converter should return a non-None result (not fail) for valid but empty TEI
+        # It may return an empty string, but should not return None
+        assert markdown_data is not None, "Withdrawn/empty TEI should return non-None Markdown result"
+        assert isinstance(markdown_data, str), "Result should be a string"
+
+    def test_json_conversion_stream_mode_with_real_file(self):
+        """Test JSON conversion in streaming mode with a real TEI file."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Use the actual TEI file from test resources
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        # Verify the test TEI file exists
+        assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"
+
+        converter = TEI2LossyJSONConverter()
+        passages_generator = converter.convert_tei_file(tei_file, stream=True)
+
+        # Should return a generator/iterator, not None
+        assert passages_generator is not None, "Streaming mode should return a generator"
+
+        # Collect all passages from the generator
+        passages = list(passages_generator)
+
+        # Should have extracted some passages
+        assert len(passages) > 0, "Should extract at least one passage in streaming mode"
+
+        # Each passage should be a dict with expected structure
+        for passage in passages:
+            assert isinstance(passage, dict), "Each passage should be a dictionary"
+            assert 'id' in passage, "Passage should have an id"
+            assert 'text' in passage, "Passage should have text"
+
+    def test_json_conversion_stream_mode_with_empty_tei(self):
+        """Test JSON conversion in streaming mode with empty TEI content."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Test with empty TEI content
+        empty_tei = """<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
+</TEI>"""
+
+        # Create a temporary TEI file containing only an empty <TEI> root element
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as tei_file:
+            tei_file.write(empty_tei)
+            tei_path = tei_file.name
+
+        try:
+            converter = TEI2LossyJSONConverter()
+            passages_generator = converter.convert_tei_file(tei_path, stream=True)
+
+            # Should return an empty iterator for empty TEI, not None
+            assert passages_generator is not None, "Streaming mode should return an iterator even for empty TEI"
+
+            # Collect all passages - should be empty for empty TEI
+            passages = list(passages_generator)
+
+            # NOTE: emptiness is not asserted; this only confirms the generator could be consumed
+            assert isinstance(passages, list), "Result should be convertible to list"
+
+        finally:
+            # Clean up temporary file
+            os.unlink(tei_path)
+
+    def test_json_conversion_stream_mode_with_withdrawn_article(self):
+        """Test JSON conversion in streaming mode with withdrawn/empty article."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Use the withdrawn article TEI file from test resources
+        tei_file = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
+
+        # Verify the test TEI file exists
+        assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"
+
+        converter = TEI2LossyJSONConverter()
+        passages_generator = converter.convert_tei_file(tei_file, stream=True)
+
+        # Should return a generator/iterator, not None
+        assert passages_generator is not None, "Streaming mode should return an iterator for withdrawn article"
+
+        # Collect all passages - may be empty for withdrawn article
+        passages = list(passages_generator)
+
+        # Only confirms the generator could be consumed; the passage count is not checked
+        assert isinstance(passages, list), "Result should be convertible to list"
+
+    def test_json_conversion_stream_mode_validates_refs(self):
+        """Test that streaming mode validates reference offsets correctly."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Use file with references
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        converter = TEI2LossyJSONConverter(validate_refs=True)
+        passages_generator = converter.convert_tei_file(tei_file, stream=True)
+
+        # Collect all passages - this should not raise assertion errors if refs are valid
+        passages = list(passages_generator)
+
+        # Check passages with refs have valid offsets
+        for passage in passages:
+            if 'refs' in passage and passage['refs']:
+                for ref in passage['refs']:
+                    offset_start = ref.get('offset_start', -1)
+                    offset_end = ref.get('offset_end', -1)
+                    ref_text = ref.get('text', '')
+                    passage_text = passage.get('text', '')
+
+                    # Validate offset bounds
+                    assert 0 <= offset_start < offset_end <= len(passage_text), \
+                        f"Invalid ref offsets: {offset_start}-{offset_end} for text length {len(passage_text)}"
+
+                    # Validate text matches
+                    actual_text = passage_text[offset_start:offset_end]
+                    assert actual_text == ref_text, \
+                        f"Ref text mismatch: expected '{ref_text}', got '{actual_text}'"
+
+    def test_json_conversion_stream_vs_non_stream_consistency(self):
+        """Test that streaming and non-streaming modes produce consistent results."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Use the actual TEI file from test resources
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        converter = TEI2LossyJSONConverter()
+
+        # Get non-streaming result
+        non_stream_result = converter.convert_tei_file(tei_file, stream=False)
+        body_text_non_stream = non_stream_result.get('body_text', [])
+
+        # Get streaming result
+        stream_result = converter.convert_tei_file(tei_file, stream=True)
+        body_text_stream = list(stream_result)
+
+        # Both should have the same number of passages
+        assert len(body_text_non_stream) == len(body_text_stream), \
+            f"Stream and non-stream should have same number of passages: {len(body_text_stream)} vs {len(body_text_non_stream)}"
+
+        # Compare passage texts
+        for i, (stream_p, non_stream_p) in enumerate(zip(body_text_stream, body_text_non_stream)):
+            assert stream_p.get('text') == non_stream_p.get('text'), \
+                f"Passage {i} text mismatch between stream and non-stream modes"