@@ -772,4 +772,180 @@ def test_header_extraction_from_mjb_file(self):
772772 for section in expected_sections :
773773 assert section in sections_found , f"'{ section } ' should be in extracted sections"
774774
def test_withdrawn_article_json_conversion(self):
    """Convert the withdrawn-article TEI fixture to JSON without streaming.

    Checks that the converter hands back a dictionary (not None) that
    contains both the 'biblio' and 'body_text' keys.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Fixture shipped with the test resources.
    fixture_path = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
    assert os.path.exists(fixture_path), f"Test TEI file should exist at {fixture_path} "

    result = TEI2LossyJSONConverter().convert_tei_file(fixture_path, stream=False)

    # A valid-but-empty document must still produce a dictionary.
    assert result is not None, "Withdrawn/empty TEI should return non-None JSON result"
    assert isinstance(result, dict), "Result should be a dictionary"

    # The top-level sections must be present.
    required_keys = (
        ('biblio', "Should have biblio section"),
        ('body_text', "Should have body_text section"),
    )
    for key, failure_message in required_keys:
        assert key in result, failure_message
795+
def test_withdrawn_article_markdown_conversion(self):
    """Convert the withdrawn-article TEI fixture to Markdown.

    The output may be an empty string, but it must be a string and
    must not be None.
    """
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    # Fixture shipped with the test resources.
    fixture_path = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
    assert os.path.exists(fixture_path), f"Test TEI file should exist at {fixture_path} "

    rendered = TEI2MarkdownConverter().convert_tei_file(fixture_path)

    # Conversion of valid-but-empty TEI must not fail by returning None.
    assert rendered is not None, "Withdrawn/empty TEI should return non-None Markdown result"
    assert isinstance(rendered, str), "Result should be a string"
813+
def test_json_conversion_stream_mode_with_real_file(self):
    """Stream passages from a real TEI fixture and check their shape.

    The streamed result must be non-None, yield at least one passage,
    and every passage must be a dict carrying 'id' and 'text' keys.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Real TEI document from the test resources.
    fixture_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    assert os.path.exists(fixture_path), f"Test TEI file should exist at {fixture_path} "

    stream = TEI2LossyJSONConverter().convert_tei_file(fixture_path, stream=True)
    assert stream is not None, "Streaming mode should return a generator"

    # Drain the iterator so the individual passages can be inspected.
    collected = list(stream)
    assert len(collected) > 0, "Should extract at least one passage in streaming mode"

    for entry in collected:
        assert isinstance(entry, dict), "Each passage should be a dictionary"
        assert 'id' in entry, "Passage should have an id"
        assert 'text' in entry, "Passage should have text"
841+
def test_json_conversion_stream_mode_with_empty_tei(self):
    """Test JSON conversion in streaming mode with empty TEI content.

    Writes a TEI document consisting only of an empty root element to a
    temporary file, converts it with ``stream=True`` and checks that the
    returned iterator is not None and yields no passages.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Minimal TEI document: a root element with no header and no body.
    empty_tei = """<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
</TEI>"""

    # delete=False so the file survives the `with` block and can be
    # reopened by path; it is removed explicitly in the `finally` below.
    # The encoding matches the one declared in the XML prolog.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.tei.xml', delete=False, encoding='utf-8'
    ) as tei_file:
        tei_file.write(empty_tei)
        tei_path = tei_file.name

    try:
        converter = TEI2LossyJSONConverter()
        passages_generator = converter.convert_tei_file(tei_path, stream=True)

        # Should return an empty iterator for empty TEI, not None
        assert passages_generator is not None, "Streaming mode should return an iterator even for empty TEI"

        # Draining the iterator also surfaces any error raised lazily.
        passages = list(passages_generator)

        # The previous isinstance(passages, list) check was always true
        # after list(); assert the actual expectation instead.
        assert passages == [], \
            f"Empty TEI should produce no passages, got {len(passages)}"

    finally:
        # Clean up temporary file
        os.unlink(tei_path)
872+
def test_json_conversion_stream_mode_with_withdrawn_article(self):
    """Stream the withdrawn-article TEI fixture and make sure it can be drained.

    The streamed result must be non-None and consumable into a list;
    the list is allowed to be empty.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Fixture shipped with the test resources.
    fixture_path = os.path.join(TEST_DATA_PATH, 'article_withdrawn.grobid.tei.xml')
    assert os.path.exists(fixture_path), f"Test TEI file should exist at {fixture_path} "

    stream = TEI2LossyJSONConverter().convert_tei_file(fixture_path, stream=True)
    assert stream is not None, "Streaming mode should return an iterator for withdrawn article"

    # Consuming the iterator must not raise; zero passages is acceptable.
    drained = list(stream)
    assert isinstance(drained, list), "Result should be convertible to list"
894+
def test_json_conversion_stream_mode_validates_refs(self):
    """Check reference offsets of streamed passages against their text.

    With ``validate_refs=True`` the fixture is streamed, and for every
    reference on every passage the offsets must lie within the passage
    text and the slice they delimit must equal the reference's text.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Fixture expected to contain references.
    fixture_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    validating_converter = TEI2LossyJSONConverter(validate_refs=True)

    # Draining the stream should not raise assertion errors if refs are valid.
    collected = list(validating_converter.convert_tei_file(fixture_path, stream=True))

    for entry in collected:
        # Missing or empty 'refs' means there is nothing to verify.
        for ref in entry.get('refs') or []:
            # -1 defaults make a missing offset fail the bounds check below.
            start = ref.get('offset_start', -1)
            end = ref.get('offset_end', -1)
            expected = ref.get('text', '')
            full_text = entry.get('text', '')

            # Offsets must describe a non-empty span inside the passage.
            assert 0 <= start < end <= len(full_text), \
                f"Invalid ref offsets: {start} -{end} for text length {len(full_text)} "

            # The span must reproduce the reference text exactly.
            found = full_text[start:end]
            assert found == expected, \
                f"Ref text mismatch: expected '{expected} ', got '{found} '"
925+
def test_json_conversion_stream_vs_non_stream_consistency(self):
    """Compare streamed passages with the 'body_text' of a non-streamed run.

    Both modes are run on the same fixture with the same converter; the
    passage counts must match and the passage texts must agree pairwise.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Real TEI document from the test resources.
    fixture_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    shared_converter = TEI2LossyJSONConverter()

    # Non-streaming run first: passages are read from the 'body_text' key.
    full_document = shared_converter.convert_tei_file(fixture_path, stream=False)
    batch_passages = full_document.get('body_text', [])

    # Streaming run second: drain the iterator into a list.
    streamed_passages = list(shared_converter.convert_tei_file(fixture_path, stream=True))

    # Counts are compared first so the pairwise zip below drops nothing.
    assert len(batch_passages) == len(streamed_passages), \
        f"Stream and non-stream should have same number of passages: {len(streamed_passages)} vs {len(batch_passages)} "

    paired = zip(streamed_passages, batch_passages)
    for idx, (from_stream, from_batch) in enumerate(paired):
        assert from_stream.get('text') == from_batch.get('text'), \
            f"Passage {idx} text mismatch between stream and non-stream modes"
775951
0 commit comments