2727from collections import Counter , defaultdict
2828
2929# Third-party
30+ import pycountry
3031import requests
3132import yaml
3233from pygments import highlight
6263
6364# File Paths
6465FILE_DOAJ_COUNT = shared .path_join (PATHS ["data_1-fetch" ], "doaj_1_count.csv" )
66+ FILE_DOAJ_COUNTRY = shared .path_join (PATHS ["data_1-fetch" ], "doaj_3_count_by_country.csv" )
67+ FILE_DOAJ_LANGUAGE = shared .path_join (PATHS ["data_1-fetch" ], "doaj_5_count_by_language.csv" )
6568FILE_PROVENANCE = shared .path_join (
6669 PATHS ["data_1-fetch" ], "doaj_provenance.yaml"
6770)
7174
7275# CSV Headers
7376HEADER_COUNT = ["TOOL_IDENTIFIER" , "COUNT" ]
77+ HEADER_COUNTRY = ["TOOL_IDENTIFIER" , "COUNTRY_CODE" , "COUNTRY_NAME" , "COUNT" ]
78+ HEADER_LANGUAGE = ["TOOL_IDENTIFIER" , "LANGUAGE_CODE" , "LANGUAGE_NAME" , "COUNT" ]
7479HEADER_YEAR = ["TOOL_IDENTIFIER" , "YEAR" , "COUNT" ]
7580
7681# Runtime variables
@@ -129,9 +134,33 @@ def initialize_all_data_files(args):
129134 return
130135 os .makedirs (PATHS ["data_1-fetch" ], exist_ok = True )
131136 initialize_data_file (FILE_DOAJ_COUNT , HEADER_COUNT )
137+ initialize_data_file (FILE_DOAJ_COUNTRY , HEADER_COUNTRY )
138+ initialize_data_file (FILE_DOAJ_LANGUAGE , HEADER_LANGUAGE )
132139 initialize_data_file (FILE_DOAJ_YEAR , HEADER_YEAR )
133140
134141
def get_country_name(country_code):
    """Return the country name for an ISO 3166-1 alpha-2 code via pycountry.

    Args:
        country_code: Two-letter country code, in any letter case. May also
            be empty, ``None``, or the literal placeholder ``"Unknown"``.

    Returns:
        ``"Unknown"`` when ``country_code`` is falsy or already
        ``"Unknown"``; the pycountry name when the code resolves; otherwise
        ``country_code`` unchanged, so callers still get a usable label.
    """
    if not country_code or country_code == "Unknown":
        return "Unknown"
    # Non-string values cannot be a code; hand them back untouched instead of
    # relying on a swallowed AttributeError from .upper().
    if not isinstance(country_code, str):
        return country_code
    try:
        country = pycountry.countries.get(
            alpha_2=country_code.strip().upper()
        )
    except LookupError:
        # Only a failed lookup is expected here (KeyError is a LookupError
        # subclass); anything else is a real error and should surface.
        return country_code
    return country.name if country else country_code
151+
152+
def get_language_name(language_code):
    """Return the language name for an ISO 639-1 code via pycountry.

    Args:
        language_code: Two-letter language code, in any letter case. May also
            be empty, ``None``, or the literal placeholder ``"Unknown"``.

    Returns:
        ``"Unknown"`` when ``language_code`` is falsy or already
        ``"Unknown"``; the pycountry name when the code resolves; otherwise
        ``language_code`` unchanged, so callers still get a usable label.
    """
    if not language_code or language_code == "Unknown":
        return "Unknown"
    # Non-string values cannot be a code; hand them back untouched instead of
    # relying on a swallowed AttributeError from a string method.
    if not isinstance(language_code, str):
        return language_code
    try:
        # ISO 639-1 codes are lowercase, so normalise to that form rather
        # than upper-casing; the lookup then does not depend on the library
        # folding case for us.
        language = pycountry.languages.get(
            alpha_2=language_code.strip().lower()
        )
    except LookupError:
        # Only a failed lookup is expected here (KeyError is a LookupError
        # subclass); anything else is a real error and should surface.
        return language_code
    return language.name if language else language_code
162+
163+
135164def extract_license_types (license_info ):
136165 """Extract all CC license types from DOAJ license information."""
137166 if not license_info :
@@ -151,6 +180,8 @@ def process_journals(session, args):
151180 LOGGER .info ("Fetching DOAJ journals..." )
152181
153182 license_counts = Counter ()
183+ country_counts = defaultdict (Counter )
184+ language_counts = defaultdict (Counter )
154185 year_counts = defaultdict (Counter )
155186 processed_journals = set () # Track unique journals to avoid double counting
156187
@@ -232,6 +263,20 @@ def process_journals(session, args):
232263 else :
233264 year_counts [license_type ]["Unknown" ] += 1
234265
266+ # Extract country information
267+ publisher_info = bibjson .get ("publisher" , {})
268+ if isinstance (publisher_info , dict ):
269+ country_code = publisher_info .get ("country" , "Unknown" )
270+ country_counts [license_type ][country_code ] += 1
271+
272+ # Extract language information
273+ languages = bibjson .get ("language" , [])
274+ if languages :
275+ for lang_code in languages :
276+ language_counts [license_type ][lang_code ] += 1
277+ else :
278+ language_counts [license_type ]["Unknown" ] += 1
279+
235280 # Track unique journals to avoid double counting in statistics
236281 if journal_id not in processed_journals :
237282 processed_journals .add (journal_id )
@@ -258,13 +303,17 @@ def process_journals(session, args):
258303
259304 return (
260305 license_counts ,
306+ country_counts ,
307+ language_counts ,
261308 year_counts ,
262309 len (processed_journals ), # Return unique journal count
263310 )
264311
265312
266313def save_count_data (
267314 license_counts ,
315+ country_counts ,
316+ language_counts ,
268317 year_counts ,
269318):
270319 """Save essential journal data to CSV files."""
@@ -280,6 +329,42 @@ def save_count_data(
280329 for lic , count in license_counts .items ():
281330 writer .writerow ({"TOOL_IDENTIFIER" : lic , "COUNT" : count })
282331
332+ # Save country counts with pycountry names
333+ with open (
334+ FILE_DOAJ_COUNTRY , "w" , encoding = "utf-8" , newline = "\n "
335+ ) as file_object :
336+ writer = csv .DictWriter (
337+ file_object , fieldnames = HEADER_COUNTRY , dialect = "unix"
338+ )
339+ writer .writeheader ()
340+ for lic , countries in country_counts .items ():
341+ for country_code , count in countries .items ():
342+ country_name = get_country_name (country_code )
343+ writer .writerow ({
344+ "TOOL_IDENTIFIER" : lic ,
345+ "COUNTRY_CODE" : country_code ,
346+ "COUNTRY_NAME" : country_name ,
347+ "COUNT" : count ,
348+ })
349+
350+ # Save language counts with pycountry names
351+ with open (
352+ FILE_DOAJ_LANGUAGE , "w" , encoding = "utf-8" , newline = "\n "
353+ ) as file_object :
354+ writer = csv .DictWriter (
355+ file_object , fieldnames = HEADER_LANGUAGE , dialect = "unix"
356+ )
357+ writer .writeheader ()
358+ for lic , languages in language_counts .items ():
359+ for lang_code , count in languages .items ():
360+ lang_name = get_language_name (lang_code )
361+ writer .writerow ({
362+ "TOOL_IDENTIFIER" : lic ,
363+ "LANGUAGE_CODE" : lang_code ,
364+ "LANGUAGE_NAME" : lang_name ,
365+ "COUNT" : count ,
366+ })
367+
283368 # Save year counts
284369 with open (
285370 FILE_DOAJ_YEAR , "w" , encoding = "utf-8" , newline = "\n "
@@ -304,6 +389,8 @@ def query_doaj(args):
304389 # Process journals
305390 (
306391 license_counts ,
392+ country_counts ,
393+ language_counts ,
307394 year_counts ,
308395 journals_processed ,
309396 ) = process_journals (session , args )
@@ -312,6 +399,8 @@ def query_doaj(args):
312399 if args .enable_save :
313400 save_count_data (
314401 license_counts ,
402+ country_counts ,
403+ language_counts ,
315404 year_counts ,
316405 )
317406
0 commit comments