Skip to content

Commit 8b90caf

Browse files
committed
Add new Wikipedia processing and report scripts
1 parent 0d44547 commit 8b90caf

2 files changed

Lines changed: 422 additions & 0 deletions

File tree

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
#!/usr/bin/env python
"""
This file is dedicated to processing Wikipedia data
for analysis and comparison between quarters.
"""
# Standard library
import argparse
import csv
import os
import sys
import traceback

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup: module-level logger and the canonical data-path mapping
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Quarter label (format YYYYQx) derived from the quarter data directory name
QUARTER = os.path.basename(PATHS["data_quarter"])
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.

    Side effects: may rebind the module-level PATHS mapping when a
    non-default --quarter is requested, and attaches ``logger`` and
    ``paths`` attributes to the returned namespace for downstream use.
    """
    global PATHS
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    args = parser.parse_args()
    # Git actions write the saved results, so saving is a prerequisite
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    # Re-point the shared path mapping when a non-default quarter is requested
    if args.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
    args.logger = LOGGER
    args.paths = PATHS
    return args
def data_to_csv(args, data, file_path):
    """
    Write ``data`` (a DataFrame) to ``file_path`` as a unix-dialect CSV.

    Does nothing unless saving was enabled via --enable-save.
    """
    if args.enable_save:
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        # emulate csv.unix_dialect: quote all fields, LF line endings
        data.to_csv(
            file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
        )
def process_highest_language_usage(args, count_data):
    """
    Processing count data: top 10 highest language usage
    """
    LOGGER.info(process_highest_language_usage.__doc__.strip())
    # Map each English language name to its count; if a name repeats,
    # the later row wins (dict insertion semantics, as before)
    counts_by_language = {
        row.LANGUAGE_NAME_EN: row.COUNT
        for row in count_data.itertuples(index=False)
    }

    frame = pd.DataFrame(
        counts_by_language.items(), columns=["language_name_en", "count"]
    )
    top_10 = frame.sort_values("count", ascending=False).head(10)
    file_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
    )
    data_to_csv(args, top_10, file_path)
def process_language_representation(args, count_data):
    """
    Processing count data: language representation
    """
    LOGGER.info(process_language_representation.__doc__.strip())
    # Map each English language name to its count; if a name repeats,
    # the later row wins (dict insertion semantics, as before)
    counts_by_language = {
        row.LANGUAGE_NAME_EN: row.COUNT
        for row in count_data.itertuples(index=False)
    }

    frame = pd.DataFrame(
        counts_by_language.items(), columns=["language_name_en", "count"]
    )
    average_count = frame["count"].mean()

    # Languages below the mean count are labeled underrepresented
    frame["category"] = [
        "Underrepresented" if value < average_count else "Represented"
        for value in frame["count"]
    ]
    language_counts = (
        frame.groupby("category").size().reset_index(name="language_count")
    )
    language_counts = language_counts.sort_values(
        "language_count", ascending=False
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_language_representation.csv"
    )
    data_to_csv(args, language_counts, file_path)
def main():
    """
    Entry point: load the fetched per-language counts, derive both
    reports, and (when enabled) commit and push the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Input CSV produced earlier by the 1-fetch phase
    count_csv_path = shared.path_join(
        PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
    )
    language_counts = pd.read_csv(
        count_csv_path, usecols=["LANGUAGE_NAME_EN", "COUNT"]
    )
    process_highest_language_usage(args, language_counts)
    process_language_representation(args, language_counts)

    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Wikipedia data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        # BUG FIX: QuantifyingException carries ``exit_code`` (tested just
        # above); ``e.code`` raised AttributeError and masked the intended
        # exit status.
        sys.exit(e.exit_code)
    except SystemExit as e:
        # SystemExit does define ``.code`` — this branch is correct as-is
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
        sys.exit(1)

0 commit comments

Comments
 (0)