Skip to content

Commit 7206d29

Browse files
committed
report errors
1 parent 234c661 commit 7206d29

1 file changed

Lines changed: 10 additions & 4 deletions

File tree

grobid_client/grobid_client.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -347,6 +347,7 @@ def process(
347347

348348
# Counter for actually processed files
349349
processed_files_count = 0
350+
errors_files_count = 0
350351
input_files = []
351352

352353
for input_file in all_input_files:
@@ -363,7 +364,7 @@ def process(
363364
input_files.append(input_file)
364365

365366
if len(input_files) == batch_size_pdf:
366-
batch_processed = self.process_batch(
367+
batch_processed, batch_errors = self.process_batch(
367368
service,
368369
input_files,
369370
input_path,
@@ -381,11 +382,12 @@ def process(
381382
flavor
382383
)
383384
processed_files_count += batch_processed
385+
errors_files_count += batch_errors
384386
input_files = []
385387

386388
# last batch
387389
if len(input_files) > 0:
388-
batch_processed = self.process_batch(
390+
batch_processed, batch_errors = self.process_batch(
389391
service,
390392
input_files,
391393
input_path,
@@ -402,9 +404,11 @@ def process(
402404
verbose,
403405
)
404406
processed_files_count += batch_processed
407+
errors_files_count += batch_errors
405408

406409
# Log final statistics
407410
self.logger.info(f"Processing completed: {processed_files_count} out of {total_files} files processed")
411+
self.logger.info(f"Errors: {errors_files_count} out of {total_files} files processed")
408412

409413
def process_batch(
410414
self,
@@ -428,6 +432,7 @@ def process_batch(
428432
self.logger.info(f"{len(input_files)} files to process in current batch")
429433

430434
processed_count = 0
435+
error_count = 0
431436

432437
# we use ThreadPoolExecutor and not ProcessPoolExecutor because it is an I/O intensive process
433438
with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
@@ -468,10 +473,10 @@ def process_batch(
468473
for r in concurrent.futures.as_completed(results):
469474
input_file, status, text = r.result()
470475
filename = self._output_file_name(input_file, input_path, output)
471-
processed_count += 1
472476

473477
if status != 200 or text is None:
474478
self.logger.error(f"Processing of {input_file} failed with error {status}: {text}")
479+
error_count += 1
475480
# writing error file with suffixed error code
476481
try:
477482
pathlib.Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True)
@@ -485,6 +490,7 @@ def process_batch(
485490
except OSError as e:
486491
self.logger.error(f"Failed to write error file {filename}: {str(e)}")
487492
else:
493+
processed_count += 1
488494
# writing TEI file
489495
try:
490496
pathlib.Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True)
@@ -494,7 +500,7 @@ def process_batch(
494500
except OSError as e:
495501
self.logger.error(f"Failed to write TEI XML file {filename}: {str(e)}")
496502

497-
return processed_count
503+
return processed_count, error_count
498504

499505
def process_pdf(
500506
self,

0 commit comments

Comments (0)