diff --git a/eu_fact_force/ingestion/data_collection/download_ground_truth.py b/eu_fact_force/ingestion/data_collection/download_ground_truth.py
new file mode 100644
index 0000000..38dc2f2
--- /dev/null
+++ b/eu_fact_force/ingestion/data_collection/download_ground_truth.py
@@ -0,0 +1,239 @@
+"""
+Download PDFs and extract ground truth text for articles in verified_ground_truth.csv.
+
+For each arXiv article:
+  - Downloads the PDF to {output_dir}/pdf/{article_id}.pdf
+  - Downloads the LaTeX source tar, extracts all .tex files, and writes
+    cleaned text to {output_dir}/text/{article_id}.txt
+
+The extracted text is used as the reference ("ground truth") when measuring
+how well the PDF parser reproduces the original content.
+
+Usage:
+    python -m eu_fact_force.ingestion.data_collection.download_ground_truth \\
+        --csv verified_ground_truth.csv \\
+        --output-dir ./verified_ground_truth_data \\
+        --workers 4
+"""
+
+import argparse
+import csv
+import json
+import logging
+import re
+import sys
+import tarfile
+import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+_TIMEOUT = 60  # seconds — source tarballs can be large
+
+
+def download_article(article: dict, pdf_dir: Path, text_dir: Path) -> dict:
+    """
+    Download PDF and extract LaTeX text for one article row from the CSV.
+
+    Returns a result dict with keys: article_id and status, plus
+    pdf_path/text_path when available, reason for skips, and error on failure.
+    """
+    article_id = article["article_id"]
+    source = article["source"]
+
+    if source != "arxiv":
+        logger.warning("download.unsupported_source id=%s source=%s", article_id, source)
+        return {"article_id": article_id, "status": "skipped", "reason": f"unsupported source: {source}"}
+
+    safe_id = article_id.replace(":", "_").replace("/", "_")
+    pdf_path = pdf_dir / f"{safe_id}.pdf"
+    text_path = text_dir / f"{safe_id}.txt"
+
+    # Skip if both already present
+    if pdf_path.exists() and text_path.exists():
+        logger.info("download.skip id=%s reason=already_exists", article_id)
+        return {"article_id": article_id, "status": "skipped", "reason": "already_exists",
+                "pdf_path": str(pdf_path), "text_path": str(text_path)}
+
+    pdf_ok = _download_pdf(article["pdf_url"], pdf_path)
+    text_ok = _download_arxiv_latex(article["text_url"], text_path)
+
+    status = "success" if pdf_ok and text_ok else ("partial" if pdf_ok or text_ok else "failed")
+    result = {
+        "article_id": article_id,
+        "status": status,
+        "pdf_path": str(pdf_path) if pdf_ok else None,
+        "text_path": str(text_path) if text_ok else None,
+    }
+    if status != "success":
+        # Record which half failed so the summary can report it.
+        failed_parts = [name for name, ok in (("pdf", pdf_ok), ("latex", text_ok)) if not ok]
+        result["error"] = f"download failed: {', '.join(failed_parts)}"
+    logger.info("download.done id=%s status=%s", article_id, status)
+    return result
+
+
+def download_all(
+    csv_path: str,
+    output_dir: str,
+    workers: int = 4,
+) -> list[dict]:
+    """
+    Download all articles from the ground truth CSV in parallel.
+
+    Writes a download_manifest.json into output_dir summarising results.
+    """
+    pdf_dir = Path(output_dir) / "pdf"
+    text_dir = Path(output_dir) / "text"
+    pdf_dir.mkdir(parents=True, exist_ok=True)
+    text_dir.mkdir(parents=True, exist_ok=True)
+
+    with open(csv_path, encoding="utf-8") as f:
+        articles = list(csv.DictReader(f))
+
+    logger.info("download.start total=%d workers=%d", len(articles), workers)
+
+    results = []
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = {
+            pool.submit(download_article, a, pdf_dir, text_dir): a["article_id"]
+            for a in articles
+        }
+        for future in as_completed(futures):
+            results.append(future.result())
+
+    successful = [r for r in results if r["status"] in ("success", "partial")]
+    failed = [r for r in results if r["status"] == "failed"]
+
+    manifest = {
+        "csv": csv_path,
+        "total": len(articles),
+        "successful": len(successful),
+        "failed": len(failed),
+        "results": results,
+    }
+    manifest_path = Path(output_dir) / "download_manifest.json"
+    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
+
+    print(f"\nDownload complete: {len(successful)}/{len(articles)} succeeded (fully or partially)")
+    if failed:
+        print(f"Failed ({len(failed)}):")
+        for r in failed:
+            print(f"  {r['article_id']}: {r.get('error', 'unknown')}")
+    print(f"Manifest: {manifest_path}")
+
+    return results
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+def _download_pdf(url: str, dest: Path) -> bool:
+    """Download a PDF file. Returns True on success."""
+    if dest.exists():
+        return True
+    try:
+        resp = requests.get(url, timeout=_TIMEOUT)
+        resp.raise_for_status()
+        if not resp.content.startswith(b"%PDF"):
+            logger.warning("download.not_a_pdf url=%s", url)
+            return False
+        dest.write_bytes(resp.content)
+        logger.info("download.pdf_ok url=%s size=%d", url, len(resp.content))
+        return True
+    except Exception as e:
+        logger.warning("download.pdf_failed url=%s error=%s", url, e)
+        return False
+
+
+def _download_arxiv_latex(source_url: str, dest: Path) -> bool:
+    """
+    Download an arXiv source tarball, extract all .tex files, clean and
+    concatenate them, then write plain text to dest. Returns True on success.
+    """
+    if dest.exists():
+        return True
+    try:
+        resp = requests.get(source_url, timeout=_TIMEOUT)
+        resp.raise_for_status()
+    except Exception as e:
+        logger.warning("download.latex_fetch_failed url=%s error=%s", source_url, e)
+        return False
+
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tar_path = Path(tmpdir) / "source.tar.gz"
+            tar_path.write_bytes(resp.content)
+
+            try:
+                # "r:*" auto-detects compression; arXiv source archives are
+                # not always gzip'd.
+                with tarfile.open(tar_path, "r:*") as tar:
+                    tar.extractall(tmpdir)
+            except tarfile.TarError:
+                # Some arXiv sources are a bare .tex file, not tar'd
+                text = _clean_latex(resp.content)
+                dest.write_text(text, encoding="utf-8")
+                return bool(text.strip())
+
+            tex_files = sorted(Path(tmpdir).rglob("*.tex"))
+            if not tex_files:
+                logger.warning("download.no_tex_found url=%s", source_url)
+                return False
+
+            # Concatenate all .tex files, sorted by path for a stable order
+            parts = []
+            for tex in tex_files:
+                try:
+                    parts.append(_clean_latex(tex.read_bytes()))
+                except Exception:
+                    pass
+
+            text = "\n\n".join(p for p in parts if p.strip())
+            dest.write_text(text, encoding="utf-8")
+            logger.info("download.latex_ok url=%s chars=%d", source_url, len(text))
+            return bool(text.strip())
+
+    except Exception as e:
+        logger.warning("download.latex_extract_failed url=%s error=%s", source_url, e)
+        return False
+
+
+def _clean_latex(raw: bytes) -> str:
+    """Strip LaTeX markup and return readable plain text."""
+    text = raw.decode("utf-8", errors="ignore")
+    # Remove comments
+    text = re.sub(r"%[^\n]*", "", text)
+    # Unwrap common commands that enclose text: \cmd{content} → content
+    text = re.sub(r"\\(?:textbf|textit|emph|textrm|texttt|text|mbox)\{([^}]*)\}", r"\1", text)
+    # Remove remaining LaTeX commands (with or without braces)
+    text = re.sub(r"\\[a-zA-Z]+\*?\{[^}]*\}", "", text)
+    text = re.sub(r"\\[a-zA-Z]+\*?", " ", text)
+    # Remove leftover braces and math delimiters
+    text = re.sub(r"[{}$]", " ", text)
+    # Collapse whitespace
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = re.sub(r"[ \t]+", " ", text)
+    return text.strip()
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
+    )
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--csv", required=True, help="Path to verified_ground_truth.csv")
+    parser.add_argument("--output-dir", default="./verified_ground_truth_data")
+    parser.add_argument("--workers", type=int, default=4)
+    args = parser.parse_args()
+
+    results = download_all(args.csv, args.output_dir, args.workers)
+    failed = sum(1 for r in results if r["status"] == "failed")
+    if failed:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eu_fact_force/ingestion/data_collection/ground_truth.py b/eu_fact_force/ingestion/data_collection/ground_truth.py
new file mode 100644
index 0000000..86ed7a2
--- /dev/null
+++ b/eu_fact_force/ingestion/data_collection/ground_truth.py
@@ -0,0 +1,204 @@
+"""
+Collect verified ground truth articles from arXiv.
+
+"Verified" means the ground truth text is the official LaTeX source that the
+authors submitted — not extracted from a PDF. This gives clean, artifact-free
+reference text for parser quality evaluation.
+
+Usage:
+    python -m eu_fact_force.ingestion.data_collection.ground_truth \\
+        --output verified_ground_truth.csv \\
+        --vaccine-limit 30 \\
+        --other-limit 30
+"""
+
+import argparse
+import csv
+import logging
+import sys
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from typing import Optional
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+_ARXIV_API = "https://export.arxiv.org/api/query"
+_ATOM_NS = "http://www.w3.org/2005/Atom"
+
+CSV_FIELDS = [
+    "category",
+    "article_id",
+    "title",
+    "source",
+    "ground_truth_format",
+    "pdf_url",
+    "text_url",
+    "verification",
+]
+
+
+@dataclass
+class ArxivGroundTruth:
+    arxiv_id: str
+    title: str
+    category: str  # "vaccine_autism" | "other"
+
+    @property
+    def article_id(self) -> str:
+        return f"arxiv:{self.arxiv_id}"
+
+    @property
+    def pdf_url(self) -> str:
+        return f"https://arxiv.org/pdf/{self.arxiv_id}.pdf"
+
+    @property
+    def text_url(self) -> str:
+        # Official LaTeX source archive
+        return f"https://arxiv.org/src/{self.arxiv_id}"
+
+    def to_row(self) -> dict:
+        return {
+            "category": self.category,
+            "article_id": self.article_id,
+            "title": self.title,
+            "source": "arxiv",
+            "ground_truth_format": "arxiv_latex_source",
+            "pdf_url": self.pdf_url,
+            "text_url": self.text_url,
+            "verification": "Official arXiv LaTeX source (authors' original)",
+        }
+
+
+class ArxivGroundTruthCollector:
+    """Query the arXiv Atom API and return ArxivGroundTruth records."""
+
+    def search(
+        self,
+        query: str,
+        category: str,
+        limit: int,
+    ) -> list[ArxivGroundTruth]:
+        params = {
+            "search_query": query,
+            "max_results": limit * 2,  # fetch extra to account for parse failures
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+        }
+        try:
+            resp = requests.get(_ARXIV_API, params=params, timeout=30)
+            resp.raise_for_status()
+        except Exception:
+            logger.exception("arxiv.request_failed query=%s", query[:60])
+            return []
+
+        try:
+            root = ET.fromstring(resp.content)
+        except ET.ParseError:
+            logger.exception("arxiv.response_unparseable query=%s", query[:60])
+            return []
+        articles: list[ArxivGroundTruth] = []
+
+        for entry in root.findall(f"{{{_ATOM_NS}}}entry"):
+            try:
+                arxiv_id = _parse_arxiv_id(entry)
+                title = _parse_title(entry)
+                if arxiv_id and title:
+                    articles.append(ArxivGroundTruth(arxiv_id, title, category))
+                    logger.debug("arxiv.found id=%s title=%s", arxiv_id, title[:60])
+            except Exception as e:
+                logger.debug("arxiv.parse_error error=%s", e)
+
+        result = articles[:limit]
+        logger.info("arxiv.search_done query=%s found=%d", query[:60], len(result))
+        return result
+
+
+def collect(
+    output_csv: str,
+    vaccine_limit: int = 30,
+    other_limit: int = 30,
+) -> list[ArxivGroundTruth]:
+    """
+    Collect verified ground truth articles and write to a CSV.
+
+    Searches for two categories:
+      - vaccine_autism: papers on vaccine safety / autism link
+      - other: general biomedical / ML-in-health papers (as contrast set)
+
+    Returns the list of collected articles.
+    """
+    collector = ArxivGroundTruthCollector()
+
+    # arXiv API queries: field prefixes apply to single terms, so categories
+    # are OR'd individually; the negation operator is ANDNOT.
+    vaccine_query = (
+        '(cat:q-bio.QM OR cat:q-bio.CB OR cat:stat.AP OR cat:cs.CY) AND '
+        '((abs:"vaccine" AND abs:"autism") OR '
+        '(abs:"vaccination" AND abs:"autism spectrum") OR '
+        '(abs:"vaccine safety" AND abs:"autism"))'
+    )
+    other_query = (
+        '(cat:q-bio.QM OR cat:stat.AP) AND '
+        '(abs:"clinical trial" OR abs:"efficacy" OR abs:"treatment") '
+        'ANDNOT abs:vaccine'
+    )
+
+    vaccine_articles = collector.search(vaccine_query, "vaccine_autism", vaccine_limit)
+    other_articles = collector.search(other_query, "other", other_limit)
+
+    all_articles = vaccine_articles + other_articles
+
+    with open(output_csv, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
+        writer.writeheader()
+        for article in all_articles:
+            writer.writerow(article.to_row())
+
+    logger.info(
+        "ground_truth.saved csv=%s vaccine=%d other=%d total=%d",
+        output_csv, len(vaccine_articles), len(other_articles), len(all_articles),
+    )
+    print(f"Saved {len(all_articles)} articles to {output_csv}")
+    print(f"  vaccine_autism : {len(vaccine_articles)}")
+    print(f"  other          : {len(other_articles)}")
+    return all_articles
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _parse_arxiv_id(entry: ET.Element) -> Optional[str]:
+    id_elem = entry.find(f"{{{_ATOM_NS}}}id")
+    if id_elem is None or not id_elem.text:
+        return None
+    # URL form: https://arxiv.org/abs/2301.00001v1
+    return id_elem.text.strip().split("/abs/")[-1]
+
+
+def _parse_title(entry: ET.Element) -> Optional[str]:
+    title_elem = entry.find(f"{{{_ATOM_NS}}}title")
+    if title_elem is None or not title_elem.text:
+        return None
+    return " ".join(title_elem.text.split())  # normalise whitespace
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
+    )
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--output", default="verified_ground_truth.csv")
+    parser.add_argument("--vaccine-limit", type=int, default=30)
+    parser.add_argument("--other-limit", type=int, default=30)
+    args = parser.parse_args()
+
+    articles = collect(args.output, args.vaccine_limit, args.other_limit)
+    if not articles:
+        print("No articles collected.", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/verified_ground_truth.csv b/verified_ground_truth.csv
new file mode 100644
index 0000000..1c917b7
--- /dev/null
+++ b/verified_ground_truth.csv
@@ -0,0 +1,33 @@
+category,article_id,title,source,ground_truth_format,pdf_url,text_url,verification
+vaccine_autism,arxiv:1905.12616v3,Defending Against Neural Fake News,arxiv,arxiv_latex_source,https://arxiv.org/pdf/1905.12616v3.pdf,https://arxiv.org/src/1905.12616v3,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:1409.2651v1,Social determinants of content selection in the age of (mis)information,arxiv,arxiv_latex_source,https://arxiv.org/pdf/1409.2651v1.pdf,https://arxiv.org/src/1409.2651v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2105.05134v2,COVID-19 Vaccine Hesitancy on Social Media: Building a Public Twitter Dataset of Anti-vaccine Content Vaccine Misinformation and Conspiracies,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2105.05134v2.pdf,https://arxiv.org/src/2105.05134v2,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2106.08423v1,COVID-19 Vaccines: Characterizing Misinformation Campaigns and Vaccine Hesitancy on Twitter,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2106.08423v1.pdf,https://arxiv.org/src/2106.08423v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2311.18195v1,COVID-19 Vaccine Misinformation in Middle Income Countries,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2311.18195v1.pdf,https://arxiv.org/src/2311.18195v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2304.06858v1,Vax-Culture: A Dataset for Studying Vaccine Discourse on Twitter,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2304.06858v1.pdf,https://arxiv.org/src/2304.06858v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2402.01783v1,Hierarchical Multi-Label Classification of Online Vaccine Concerns,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2402.01783v1.pdf,https://arxiv.org/src/2402.01783v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2404.01669v1,How COVID-19 has Impacted the Anti-Vaccine Discourse: A Large-Scale Twitter Study Spanning Pre-COVID and Post-COVID Era,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2404.01669v1.pdf,https://arxiv.org/src/2404.01669v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2402.11351v2,Modeling the amplification of epidemic spread by individuals exposed to misinformation on social media,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2402.11351v2.pdf,https://arxiv.org/src/2402.11351v2,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2503.04572v1,Social Imitation Dynamics of Vaccination Driven by Vaccine Effectiveness and Beliefs,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2503.04572v1.pdf,https://arxiv.org/src/2503.04572v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2603.05626v1,The Impact of Neglecting Vaccine Unwillingness in Epidemiology Models,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2603.05626v1.pdf,https://arxiv.org/src/2603.05626v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2411.11813v1,Heterogeneous population and its resilience to misinformation in vaccination uptake: A dual ODE and network approach,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2411.11813v1.pdf,https://arxiv.org/src/2411.11813v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2410.18670v1,Health Misinformation in Social Networks: A Survey of IT Approaches,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2410.18670v1.pdf,https://arxiv.org/src/2410.18670v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2403.09349v1,From Pro Anti to Informative and Hesitant: An Infoveillance Study of COVID-19 Vaccines and Vaccination Discourse on Twitter,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2403.09349v1.pdf,https://arxiv.org/src/2403.09349v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2407.03190v2,Cutting Through the Noise to Motivate People: A Comprehensive Analysis of COVID-19 Social Media Posts De/motivating Vaccination,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2407.03190v2.pdf,https://arxiv.org/src/2407.03190v2,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2510.16359v1,Utilising Large Language Models for Generating Effective Counter Arguments to Anti-Vaccine Tweets,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2510.16359v1.pdf,https://arxiv.org/src/2510.16359v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2402.18335v1,Detecting Anti-vaccine Content on Twitter using Multiple Message-Based Network Representations,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2402.18335v1.pdf,https://arxiv.org/src/2402.18335v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2208.04491v1,Improving Vaccine Stance Detection by Combining Online and Offline Data,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2208.04491v1.pdf,https://arxiv.org/src/2208.04491v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2312.10626v1,Decoding Concerns: Multi-label Classification of Vaccine Sentiments in Social Media,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2312.10626v1.pdf,https://arxiv.org/src/2312.10626v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2110.11333v1,Detecting Anti-Vaccine Users on Twitter,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2110.11333v1.pdf,https://arxiv.org/src/2110.11333v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2106.04081v1,Insight from NLP Analysis: COVID-19 Vaccines Sentiments on Social Media,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2106.04081v1.pdf,https://arxiv.org/src/2106.04081v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2601.18377v1,Socioeconomic Determinants of the COVID-19 Infodemics,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2601.18377v1.pdf,https://arxiv.org/src/2601.18377v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2303.06433v1,Reinforcement Learning-based Counter-Misinformation Response Generation: A Case Study of COVID-19 Vaccine Misinformation,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2303.06433v1.pdf,https://arxiv.org/src/2303.06433v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2211.11495v1,Global misinformation spillovers in the online vaccination debate before and during COVID-19,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2211.11495v1.pdf,https://arxiv.org/src/2211.11495v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2208.01509v1,Characterizing Vaccination Movements on YouTube in the United States and Brazil,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2208.01509v1.pdf,https://arxiv.org/src/2208.01509v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2309.08503v1,HealthFC: Verifying Health Claims with Evidence-Based Medical Fact-Checking,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2309.08503v1.pdf,https://arxiv.org/src/2309.08503v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2010.09926v1,Explainable Automated Fact-Checking for Public Health Claims,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2010.09926v1.pdf,https://arxiv.org/src/2010.09926v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2310.19834v2,AMIR: An Automated MisInformation Rebuttal System — A COVID-19 Vaccination Datasets based Exposition,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2310.19834v2.pdf,https://arxiv.org/src/2310.19834v2,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2602.15476v1,How to Detect Information Voids Using Longitudinal Data from Social Media and Web Searches,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2602.15476v1.pdf,https://arxiv.org/src/2602.15476v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2311.11435v1,Unveiling Public Perceptions: Machine Learning-Based Sentiment Analysis of COVID-19 Vaccines in India,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2311.11435v1.pdf,https://arxiv.org/src/2311.11435v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2306.13797v1,An Analysis of Vaccine-Related Sentiments from Development to Deployment of COVID-19 Vaccines,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2306.13797v1.pdf,https://arxiv.org/src/2306.13797v1,Official arXiv LaTeX source (authors' original)
+vaccine_autism,arxiv:2107.10648v1,DEAP-FAKED: Knowledge Graph based Approach for Fake News Detection,arxiv,arxiv_latex_source,https://arxiv.org/pdf/2107.10648v1.pdf,https://arxiv.org/src/2107.10648v1,Official arXiv LaTeX source (authors' original)
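
Note that every committed row is in the vaccine_autism category, even though collect also queries an "other" contrast set. A small read-back check, not part of this change, that reports per-category counts and flags duplicate article_ids:

    import csv
    from collections import Counter

    with open("verified_ground_truth.csv", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))

    categories = Counter(row["category"] for row in rows)
    duplicates = [aid for aid, n in Counter(row["article_id"] for row in rows).items() if n > 1]

    print(f"{len(rows)} rows: {dict(categories)}")
    if duplicates:
        print(f"duplicate article_ids: {duplicates}")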