Skip to content

Commit 05f2e2a

Browse files
authored
Merge pull request #205 from American-Institutes-for-Research/HEA-809/add_regex_for_men_women_boys_girls
Add regex pattern for men, women .. see HEA-809
2 parents 1cb423b + 160e484 commit 05f2e2a

5 files changed

Lines changed: 279 additions & 2 deletions

File tree

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Generated by Django 5.2.7 on 2025-12-29 10:31
2+
3+
from django.db import migrations, models
4+
5+
6+
# Schema migration for the "baseline" app. It alters the
# household_labor_provider field of the livelihoodactivity model so that its
# declared choices are: men, women, children, boys, girls, adults and all.
class Migration(migrations.Migration):
7+
8+
# This migration depends on baseline migration 0025.
dependencies = [
9+
("baseline", "0025_alter_milkproduction_milking_animals"),
10+
]
11+
12+
# A single AlterField operation; the field stays an optional (blank=True)
# CharField and only its definition is re-declared.
operations = [
13+
migrations.AlterField(
14+
model_name="livelihoodactivity",
15+
name="household_labor_provider",
16+
field=models.CharField(
17+
blank=True,
18+
# (stored value, display label) pairs.
choices=[
19+
("men", "Mainly Men"),
20+
("women", "Mainly Women"),
21+
("children", "Mainly Children"),
22+
("boys", "Mainly Boys"),
23+
("girls", "Mainly Girls"),
24+
("adults", "Mainly Adults"),
25+
("all", "All Together"),
26+
],
27+
# The longest stored value above ("children", 8 characters) fits in 10.
max_length=10,
28+
verbose_name="Activity done by",
29+
),
30+
),
31+
]

apps/baseline/models.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,8 +1231,67 @@ class HouseholdLaborProvider(models.TextChoices):
12311231
MEN = "men", _("Mainly Men")
12321232
WOMEN = "women", _("Mainly Women")
12331233
CHILDREN = "children", _("Mainly Children")
1234+
BOYS = "boys", _("Mainly Boys")
1235+
GIRLS = "girls", _("Mainly Girls")
1236+
ADULTS = "adults", _("Mainly Adults")
12341237
ALL = "all", _("All Together")
12351238

1239+
@classmethod
def _missing_(cls, value):
    """
    Resolve a value that is not an exact member value of the enum.

    The lookup is case-insensitive and ignores surrounding whitespace.
    It tries, in order:

    1. the alias table ``cls._aliases`` (if one has been attached),
    2. the canonical member values (e.g. ``"Men"`` -> ``men``),
    3. the display labels (e.g. ``"mainly men"`` -> ``men``).

    Display labels are included because ``get_all_labels`` exposes them for
    pattern matching, so anything matched from that list must be resolvable
    here.

    Returns the matching member, or ``None`` so that the enum machinery
    raises ``ValueError`` for genuinely unknown values.
    """
    normalized = str(value).strip().lower()

    # The alias table is attached after the class body, so it may be absent.
    aliases = getattr(cls, "_aliases", {})
    if normalized in aliases:
        return cls(aliases[normalized])

    for member in cls:
        # str() forces evaluation of lazily translated labels.
        if normalized == str(member.value).lower() or normalized == str(member.label).lower():
            return member

    return None
1248+
1249+
@classmethod
def get_all_labels(cls):
    """
    Return every label that can identify a member, for pattern matching.

    The result combines the canonical values, the display labels and the
    keys of the alias table ``cls._aliases`` (if one has been attached).
    Duplicates are removed so a regex alternation built from the result
    contains no redundant branches. Labels are ordered longest first so
    that a longer label (e.g. "men/women") is tried before a shorter label
    it starts with (e.g. "men"); labels of equal length keep their original
    relative order.
    """
    canonical_values = [value for value, _label in cls.choices]
    # str() forces evaluation of lazily translated labels.
    display_labels = [str(label) for _value, label in cls.choices]
    # The alias table is attached after the class body, so it may be absent.
    alias_labels = list(getattr(cls, "_aliases", {}))

    # dict.fromkeys drops duplicates while preserving first-seen order.
    unique_labels = dict.fromkeys(canonical_values + display_labels + alias_labels)
    return sorted(unique_labels, key=len, reverse=True)
1259+
1260+
# Alias table mapping lowercase alternative labels (French terms, unaccented
# spellings and combined forms) to canonical HouseholdLaborProvider values.
# _missing_ consults it to resolve unknown values, and get_all_labels adds its
# keys to the list of labels used for pattern matching.
HouseholdLaborProvider._aliases = {
1261+
# French singular/plural for men
1262+
"hommes": "men",
1263+
"homme": "men",
1264+
# French singular/plural for women
1265+
"femmes": "women",
1266+
"femme": "women",
1267+
# French singular/plural for boys, with and without the cedilla
1268+
"garçons": "boys",
1269+
"garçon": "boys",
1270+
"garcons": "boys", # without accent
1271+
"garcon": "boys", # without accent
1272+
# French singular/plural for girls
1273+
"filles": "girls",
1274+
"fille": "girls",
1275+
# French plural for adults (only the plural form is aliased)
1276+
"adultes": "adults",
1277+
# Children combinations (boys/girls in any order); only the "/" separator is aliased
1278+
"boys/girls": "children",
1279+
"girls/boys": "children",
1280+
"garçons/filles": "children",
1281+
"filles/garçons": "children",
1282+
"garcons/filles": "children", # without accent
1283+
"filles/garcons": "children", # without accent
1284+
# Adults combinations (men/women in any order), with "/" or " & " as the separator
1285+
"men/women": "adults",
1286+
"women/men": "adults",
1287+
"men & women": "adults",
1288+
"women & men": "adults",
1289+
"hommes/femmes": "adults",
1290+
"femmes/hommes": "adults",
1291+
"hommes & femmes": "adults",
1292+
"femmes & hommes": "adults",
1293+
}
1294+
12361295
household_labor_provider = models.CharField(
12371296
max_length=10, choices=HouseholdLaborProvider.choices, blank=True, verbose_name=_("Activity done by")
12381297
)

pipelines/assets/livelihood_activity.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,15 +234,22 @@ def get_livelihood_activity_regexes() -> list:
234234
livelihood_activity_regexes = json.load(f)
235235

236236
# Create regex patterns for metadata attributes to replace the placeholders in the regexes
237+
238+
# Dynamically build age_gender_pattern from HouseholdLaborProvider
239+
age_gender_labels = LivelihoodActivity.HouseholdLaborProvider.get_all_labels()
240+
age_gender_labels_escaped = [re.escape(label) for label in age_gender_labels]
241+
age_gender_pattern = r"(?P<household_labor_provider>" + "|".join(age_gender_labels_escaped) + ")"
242+
237243
placeholder_patterns = {
238-
"label_pattern": r"[a-zà-ÿ][a-zà-ÿ',/ \.\>\-\(\)]+?",
244+
"label_pattern": r"[a-zà-ÿ][a-zà-ÿ1-9',/ \.\>\-\(\)]+?",
239245
"product_pattern": r"(?P<product_id>[a-zà-ÿ][a-zà-ÿ1-9',/ \.\>\-\(\)]+?)",
240246
"season_pattern": r"(?P<season>season [12]|saison [12]|[12][a-z] season||[12][a-zà-ÿ] saison|r[eé]colte principale|principale r[eé]colte|gu|deyr+?)", # NOQA: E501
241247
"additional_identifier_pattern": r"\(?(?P<additional_identifier>rainfed|irrigated|pluviale?|irriguée|submersion libre|submersion contrôlée|flottant)\)?",
248+
"age_gender_pattern": age_gender_pattern,
242249
"unit_of_measure_pattern": r"(?P<unit_of_measure_id>[a-z]+)",
243250
"nbr_pattern": r"(?:n[bo]?r?e?|no)\.?",
244251
"vendu_pattern": r"(?:quantité )?vendu(?:e|s|ss|es|ses)?",
245-
"separator_pattern": r" ?[:-]?",
252+
"separator_pattern": r" *[:-]? *",
246253
"name_of_local_measure_pattern": r"(?:name of (?:meas(?:ure)?\.?)|nom(?: (?:de la mesure(?: locale?)?|de mesure locale?|du mesure|d'unité|mesure(?: locale?)?|unité de mesure))?)",
247254
}
248255
# Compile the regexes
@@ -271,13 +278,28 @@ def get_livelihood_activity_regular_expression_attributes(label: str) -> dict:
271278
"unit_of_measure_id": None,
272279
"season": None,
273280
"additional_identifier": None,
281+
"household_labor_provider": None,
274282
"attribute": None,
275283
"notes": None,
276284
}
277285
for pattern, strategy_type, is_start, attribute in get_livelihood_activity_regexes():
278286
match = pattern.fullmatch(label)
279287
if match:
280288
attributes.update(match.groupdict())
289+
290+
# Map household_labor_provider to canonical values using TextChoices
291+
if "household_labor_provider" in attributes and attributes["household_labor_provider"]:
292+
try:
293+
hlp = LivelihoodActivity.HouseholdLaborProvider(attributes["household_labor_provider"].lower())
294+
attributes["household_labor_provider"] = hlp.value
295+
except ValueError:
296+
# Check if it's a display label
297+
hlp_label = attributes["household_labor_provider"].lower()
298+
for choice_value, choice_label in LivelihoodActivity.HouseholdLaborProvider.choices:
299+
if str(choice_label).lower() == hlp_label:
300+
attributes["household_labor_provider"] = choice_value
301+
break
302+
281303
attributes["activity_label"] = label
282304
attributes["strategy_type"] = strategy_type
283305
attributes["is_start"] = is_start

pipelines/assets/livelihood_activity_regexes.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,60 @@
6767
true,
6868
null
6969
],
70+
[
71+
"(?:autre auto-emploi|other self-employment){separator_pattern} {age_gender_pattern}{separator_pattern} {product_pattern}",
72+
"OtherCashIncome",
73+
true,
74+
null
75+
],
76+
[
77+
"(?:autre auto-emploi|other self-employment){separator_pattern} {product_pattern}{separator_pattern} {age_gender_pattern}",
78+
"OtherCashIncome",
79+
true,
80+
null
81+
],
82+
[
83+
"{age_gender_pattern}{separator_pattern} {product_pattern}{separator_pattern} (?:quantity|quantité) ?\\({unit_of_measure_pattern}\\)",
84+
null,
85+
true,
86+
"quantity_produced_or_purchased"
87+
],
88+
[
89+
"{age_gender_pattern}{separator_pattern} {product_pattern}{separator_pattern} {nbr_pattern} (?:de )?(?:pers|personnes|people)(?: ?/ ?| par | per )(?:ménage|mènage|hh)",
90+
null,
91+
true,
92+
"people_per_household"
93+
],
94+
[
95+
"{product_pattern}{separator_pattern} {age_gender_pattern} ?\\(?{unit_of_measure_pattern}(?: collectés?|gathered)?\\)?",
96+
null,
97+
true,
98+
"quantity_produced"
99+
],
100+
[
101+
"{product_pattern}{separator_pattern} {age_gender_pattern}{separator_pattern} {nbr_pattern} (?:de )?(?:pers|personnes|people)(?: ?/ ?| par | per )(?:ménage|mènage|hh)",
102+
null,
103+
true,
104+
"people_per_household"
105+
],
106+
[
107+
"{product_pattern}{separator_pattern} {age_gender_pattern}",
108+
null,
109+
true,
110+
null
111+
],
112+
[
113+
"{age_gender_pattern}{separator_pattern} {product_pattern}",
114+
null,
115+
true,
116+
null
117+
],
118+
[
119+
"{age_gender_pattern}",
120+
null,
121+
false,
122+
null
123+
],
70124
[
71125
"(?:other purchases?|autres? achats?|achats?):?",
72126
"OtherPurchase",

pipelines_tests/test_assets/test_livelihood_activity_regexes.json

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,6 +1076,117 @@
10761076
"is_start": true,
10771077
"product_id": "petty trade",
10781078
"attribute": "people_per_household"
1079+
},
1080+
"hommes - preparation terre/labour": {
1081+
"is_start": true,
1082+
"product_id": "preparation terre/labour",
1083+
"household_labor_provider": "men"
1084+
},
1085+
"women- threshing, harvesting": {
1086+
"is_start": true,
1087+
"product_id": "threshing, harvesting",
1088+
"household_labor_provider": "women"
1089+
},
1090+
"poisson/peche homme (kg collectés)": {
1091+
"is_start": true,
1092+
"product_id": "poisson/peche",
1093+
"household_labor_provider": "men",
1094+
"unit_of_measure_id": "kg",
1095+
"attribute": "quantity_produced"
1096+
},
1097+
"men - petty trade : nb. personnes par ménage": {
1098+
"is_start": true,
1099+
"product_id": "petty trade",
1100+
"household_labor_provider": "men",
1101+
"attribute": "people_per_household"
1102+
},
1103+
"pousse pousseur : garçon": {
1104+
"is_start": true,
1105+
"product_id": "pousse pousseur",
1106+
"household_labor_provider": "boys"
1107+
},
1108+
"femme vente de légumes": {
1109+
"is_start": true,
1110+
"product_id": "vente de légumes",
1111+
"household_labor_provider": "women"
1112+
},
1113+
"petty trade - men: no. people per hh": {
1114+
"is_start": true,
1115+
"product_id": "petty trade",
1116+
"household_labor_provider": "men",
1117+
"attribute": "people_per_household"
1118+
},
1119+
"petit commerce femme": {
1120+
"is_start": true,
1121+
"product_id": "petit commerce",
1122+
"household_labor_provider": "women"
1123+
},
1124+
"autre auto-emploi: women - petty trade": {
1125+
"strategy_type": "OtherCashIncome",
1126+
"is_start": true,
1127+
"product_id": "petty trade",
1128+
"household_labor_provider": "women"
1129+
},
1130+
"adults": {
1131+
"household_labor_provider": "adults"
1132+
},
1133+
"adultes": {
1134+
"household_labor_provider": "adults"
1135+
},
1136+
"men/women": {
1137+
"household_labor_provider": "adults"
1138+
},
1139+
"women/men": {
1140+
"household_labor_provider": "adults"
1141+
},
1142+
"men & women": {
1143+
"household_labor_provider": "adults"
1144+
},
1145+
"women & men": {
1146+
"household_labor_provider": "adults"
1147+
},
1148+
"hommes/femmes": {
1149+
"household_labor_provider": "adults"
1150+
},
1151+
"femmes/hommes": {
1152+
"household_labor_provider": "adults"
1153+
},
1154+
"hommes & femmes": {
1155+
"household_labor_provider": "adults"
1156+
},
1157+
"femmes & hommes": {
1158+
"household_labor_provider": "adults"
1159+
},
1160+
"other self-employment: petty trade - boys": {
1161+
"strategy_type": "OtherCashIncome",
1162+
"is_start": true,
1163+
"product_id": "petty trade",
1164+
"household_labor_provider": "boys"
1165+
},
1166+
"boys/girls - agriculture work": {
1167+
"is_start": true,
1168+
"product_id": "agriculture work",
1169+
"household_labor_provider": "children"
1170+
},
1171+
"garçons/filles collection d'eau": {
1172+
"is_start": true,
1173+
"product_id": "collection d'eau",
1174+
"household_labor_provider": "children"
1175+
},
1176+
"girls charcoal selling": {
1177+
"is_start": true,
1178+
"product_id": "charcoal selling",
1179+
"household_labor_provider": "girls"
1180+
},
1181+
"boys fishing": {
1182+
"is_start": true,
1183+
"product_id": "fishing",
1184+
"household_labor_provider": "boys"
1185+
},
1186+
"filles vente de légumes": {
1187+
"is_start": true,
1188+
"product_id": "vente de légumes",
1189+
"household_labor_provider": "girls"
10791190
}
10801191
,
10811192
"huile: nom du mesure": {

0 commit comments

Comments
 (0)