Skip to content

Commit 37e3855

Browse files
committed
Remove article counting logic due to DOAJ API limitations
1 parent 3d75671 commit 37e3855

1 file changed

Lines changed: 4 additions & 20 deletions

File tree

scripts/1-fetch/doaj_fetch.py

Lines changed: 4 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,6 @@ def process_journals(session, args):
152152

153153
license_counts = Counter()
154154
year_counts = defaultdict(Counter)
155-
article_counts = defaultdict(int) # Track total articles per license type
156155
processed_journals = set() # Track unique journals to avoid double counting
157156

158157
total_processed = 0
@@ -216,8 +215,7 @@ def process_journals(session, args):
216215
if not cc_license_types:
217216
continue
218217

219-
# Extract article count and year once per journal
220-
article_count = bibjson.get("article_count", 0)
218+
# Extract year from oa_start (Open Access start year)
221219
oa_start = bibjson.get("oa_start")
222220

223221
# Apply date-back filter if specified
@@ -234,13 +232,9 @@ def process_journals(session, args):
234232
else:
235233
year_counts[license_type]["Unknown"] += 1
236234

237-
# Add article count only once per unique journal (avoid double counting)
235+
# Track unique journals to avoid double counting in statistics
238236
if journal_id not in processed_journals:
239237
processed_journals.add(journal_id)
240-
# Add full article count to each license type this journal supports
241-
if article_count:
242-
for license_type in cc_license_types:
243-
article_counts[license_type] += article_count
244238

245239
total_processed += 1
246240

@@ -265,17 +259,15 @@ def process_journals(session, args):
265259
return (
266260
license_counts,
267261
year_counts,
268-
article_counts,
269262
len(processed_journals), # Return unique journal count
270263
)
271264

272265

273266
def save_count_data(
274267
license_counts,
275268
year_counts,
276-
article_counts,
277269
):
278-
"""Save essential journal data and article context to CSV files."""
270+
"""Save essential journal data to CSV files."""
279271

280272
# Save license counts
281273
with open(
@@ -313,7 +305,6 @@ def query_doaj(args):
313305
(
314306
license_counts,
315307
year_counts,
316-
article_counts,
317308
journals_processed,
318309
) = process_journals(session, args)
319310

@@ -322,21 +313,18 @@ def query_doaj(args):
322313
save_count_data(
323314
license_counts,
324315
year_counts,
325-
article_counts,
326316
)
327317

328318
# Save provenance
329-
total_articles = sum(article_counts.values())
330319
provenance_data = {
331-
"total_articles_in_cc_journals": total_articles,
332320
"total_journals_fetched": journals_processed,
333321
"total_processed": journals_processed,
334322
"limit": args.limit,
335323
"date_back_filter": args.date_back,
336324
"quarter": QUARTER,
337325
"script": os.path.basename(__file__),
338326
"api_version": "v4",
339-
"note": "Article counts provide context for CC journal scope - individual article licenses unknown",
327+
"note": "Journal-level CC license data only - article counts not available via DOAJ API",
340328
}
341329

342330
try:
@@ -360,10 +348,6 @@ def query_doaj(args):
360348
# Calculate total license availability instances
361349
total_license_instances = sum(license_counts.values())
362350
LOGGER.info(f"Total CC license type instances: {total_license_instances}")
363-
364-
# Calculate total articles for context
365-
total_articles = sum(article_counts.values())
366-
LOGGER.info(f"Total articles in CC-licensed journals: {total_articles}")
367351
LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type")
368352

369353

0 commit comments

Comments (0)