-
Notifications
You must be signed in to change notification settings - Fork 83
Expand file tree
/
Copy pathtest-cache.py
More file actions
27 lines (21 loc) · 824 Bytes
/
test-cache.py
File metadata and controls
27 lines (21 loc) · 824 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
'''
Recursively apply GROBID to the PDF files present in a file tree via the grobid client and keep the output XMLs in a cache instead of saving them locally.
'''
import os
import re
import json
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess
import xml.etree.ElementTree as ET
grobid = __import__('grobid-client')
if __name__ == '__main__':
    # Script entry point: build a grobid client from the local JSON config,
    # run it over every directory below `input_path`, then dump its cache.
    client = grobid.grobid_client(config_path="./config.json")
    input_path = "/mnt/data/covid/data/"

    # os.walk yields one (dirpath, dirnames, filenames) triple per directory;
    # only the directory path is needed here, so the other two are discarded.
    for root, _, _ in os.walk(input_path):
        # `root` is passed as both of the first two arguments (presumably the
        # input and output locations -- TODO confirm against
        # grobid_client.process). The remaining positional arguments are kept
        # exactly as they were; their meaning is defined by that method's
        # signature, which is not visible from this script.
        # NOTE(review): if client.process already recurses into
        # sub-directories, walking the tree here as well would submit nested
        # files more than once -- verify.
        client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False, False)
        # Progress trace: echo each directory once it has been handed over.
        print(root)

    # client.cache contains a list of tuples containing the file name, path,
    # and the XML output in string form.
    # NOTE(review): printed once after the whole walk; the nesting was
    # reconstructed from a copy of the script whose indentation was lost.
    print(client.cache)