Skip to content

Commit 1cdf721

Browse files
committed
Add country and language data collection using pycountry library
1 parent 8c7bea5 commit 1cdf721

3 files changed

Lines changed: 100 additions & 1 deletion

File tree

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ requests = ">=2.31.0"
2424
seaborn = "*"
2525
urllib3 = ">=2.5.0"
2626
wordcloud = "*"
27+
pycountry = "*"
2728

2829
[dev-packages]
2930
black = "*"

Pipfile.lock

Lines changed: 10 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/1-fetch/doaj_fetch.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from collections import Counter, defaultdict
2828

2929
# Third-party
30+
import pycountry
3031
import requests
3132
import yaml
3233
from pygments import highlight
@@ -62,6 +63,8 @@
6263

6364
# File Paths
6465
FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv")
66+
FILE_DOAJ_COUNTRY = shared.path_join(PATHS["data_1-fetch"], "doaj_3_count_by_country.csv")
67+
FILE_DOAJ_LANGUAGE = shared.path_join(PATHS["data_1-fetch"], "doaj_5_count_by_language.csv")
6568
FILE_PROVENANCE = shared.path_join(
6669
PATHS["data_1-fetch"], "doaj_provenance.yaml"
6770
)
@@ -71,6 +74,8 @@
7174

7275
# CSV Headers
7376
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
77+
HEADER_COUNTRY = ["TOOL_IDENTIFIER", "COUNTRY_CODE", "COUNTRY_NAME", "COUNT"]
78+
HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"]
7479
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
7580

7681
# Runtime variables
@@ -129,9 +134,33 @@ def initialize_all_data_files(args):
129134
return
130135
os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
131136
initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT)
137+
initialize_data_file(FILE_DOAJ_COUNTRY, HEADER_COUNTRY)
138+
initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE)
132139
initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR)
133140

134141

142+
def get_country_name(country_code):
    """Return the country name for an ISO 3166-1 alpha-2 code.

    Args:
        country_code: Two-letter country code (any letter case, surrounding
            whitespace tolerated), or a missing/placeholder value.

    Returns:
        "Unknown" when the code is falsy, blank, or the literal "Unknown";
        the pycountry name when the code is recognised; otherwise the
        original ``country_code`` value unchanged.
    """
    if not country_code or country_code == "Unknown":
        return "Unknown"
    if not isinstance(country_code, str):
        # Only strings can be looked up; echo anything else back unchanged.
        return country_code

    normalized = country_code.strip().upper()
    if not normalized:
        # Whitespace-only input carries no code at all.
        return "Unknown"

    try:
        country = pycountry.countries.get(alpha_2=normalized)
    except LookupError:
        # NOTE(review): some pycountry releases appear to signal a miss by
        # raising a LookupError subclass rather than returning None --
        # confirm against the pinned version. Other exceptions are left to
        # propagate so real failures are not silently masked.
        return country_code
    return country.name if country else country_code
151+
152+
153+
def get_language_name(language_code):
    """Return the language name for an ISO 639-1 (two-letter) code.

    Args:
        language_code: Two-letter language code (any letter case,
            surrounding whitespace tolerated), or a missing/placeholder
            value.

    Returns:
        "Unknown" when the code is falsy, blank, or the literal "Unknown";
        the pycountry name when the code is recognised; otherwise the
        original ``language_code`` value unchanged.
    """
    if not language_code or language_code == "Unknown":
        return "Unknown"
    if not isinstance(language_code, str):
        # Only strings can be looked up; echo anything else back unchanged.
        return language_code

    # ISO 639-1 codes are conventionally lowercase ("en", "fr"), so a
    # lowercase key matches whether or not the installed pycountry release
    # normalises case on lookup.
    normalized = language_code.strip().lower()
    if not normalized:
        # Whitespace-only input carries no code at all.
        return "Unknown"

    try:
        language = pycountry.languages.get(alpha_2=normalized)
    except LookupError:
        # NOTE(review): some pycountry releases appear to signal a miss by
        # raising a LookupError subclass rather than returning None --
        # confirm against the pinned version. Other exceptions are left to
        # propagate so real failures are not silently masked.
        return language_code
    return language.name if language else language_code
162+
163+
135164
def extract_license_types(license_info):
136165
"""Extract all CC license types from DOAJ license information."""
137166
if not license_info:
@@ -151,6 +180,8 @@ def process_journals(session, args):
151180
LOGGER.info("Fetching DOAJ journals...")
152181

153182
license_counts = Counter()
183+
country_counts = defaultdict(Counter)
184+
language_counts = defaultdict(Counter)
154185
year_counts = defaultdict(Counter)
155186
processed_journals = set() # Track unique journals to avoid double counting
156187

@@ -232,6 +263,20 @@ def process_journals(session, args):
232263
else:
233264
year_counts[license_type]["Unknown"] += 1
234265

266+
# Extract country information
267+
publisher_info = bibjson.get("publisher", {})
268+
if isinstance(publisher_info, dict):
269+
country_code = publisher_info.get("country", "Unknown")
270+
country_counts[license_type][country_code] += 1
271+
272+
# Extract language information
273+
languages = bibjson.get("language", [])
274+
if languages:
275+
for lang_code in languages:
276+
language_counts[license_type][lang_code] += 1
277+
else:
278+
language_counts[license_type]["Unknown"] += 1
279+
235280
# Track unique journals to avoid double counting in statistics
236281
if journal_id not in processed_journals:
237282
processed_journals.add(journal_id)
@@ -258,13 +303,17 @@ def process_journals(session, args):
258303

259304
return (
260305
license_counts,
306+
country_counts,
307+
language_counts,
261308
year_counts,
262309
len(processed_journals), # Return unique journal count
263310
)
264311

265312

266313
def save_count_data(
267314
license_counts,
315+
country_counts,
316+
language_counts,
268317
year_counts,
269318
):
270319
"""Save essential journal data to CSV files."""
@@ -280,6 +329,42 @@ def save_count_data(
280329
for lic, count in license_counts.items():
281330
writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count})
282331

332+
# Save country counts with pycountry names
333+
with open(
334+
FILE_DOAJ_COUNTRY, "w", encoding="utf-8", newline="\n"
335+
) as file_object:
336+
writer = csv.DictWriter(
337+
file_object, fieldnames=HEADER_COUNTRY, dialect="unix"
338+
)
339+
writer.writeheader()
340+
for lic, countries in country_counts.items():
341+
for country_code, count in countries.items():
342+
country_name = get_country_name(country_code)
343+
writer.writerow({
344+
"TOOL_IDENTIFIER": lic,
345+
"COUNTRY_CODE": country_code,
346+
"COUNTRY_NAME": country_name,
347+
"COUNT": count,
348+
})
349+
350+
# Save language counts with pycountry names
351+
with open(
352+
FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n"
353+
) as file_object:
354+
writer = csv.DictWriter(
355+
file_object, fieldnames=HEADER_LANGUAGE, dialect="unix"
356+
)
357+
writer.writeheader()
358+
for lic, languages in language_counts.items():
359+
for lang_code, count in languages.items():
360+
lang_name = get_language_name(lang_code)
361+
writer.writerow({
362+
"TOOL_IDENTIFIER": lic,
363+
"LANGUAGE_CODE": lang_code,
364+
"LANGUAGE_NAME": lang_name,
365+
"COUNT": count,
366+
})
367+
283368
# Save year counts
284369
with open(
285370
FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n"
@@ -304,6 +389,8 @@ def query_doaj(args):
304389
# Process journals
305390
(
306391
license_counts,
392+
country_counts,
393+
language_counts,
307394
year_counts,
308395
journals_processed,
309396
) = process_journals(session, args)
@@ -312,6 +399,8 @@ def query_doaj(args):
312399
if args.enable_save:
313400
save_count_data(
314401
license_counts,
402+
country_counts,
403+
language_counts,
315404
year_counts,
316405
)
317406

0 commit comments

Comments
 (0)