Skip to content

Commit fdf91b3

Browse files
committed
Merge branch 'main' into HEA-809/add_regex_for_men_women_boys_girls
2 parents 99ecba4 + 039965e commit fdf91b3

5 files changed

Lines changed: 213 additions & 75 deletions

File tree

pipelines/assets/baseline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ def community_instances(context: AssetExecutionContext, config: BSSMetadataConfi
301301
"Région et cercle", # 2023 Mali BSSs
302302
"LGA", # Local Government Area, in the 2023 Nigeria BSSs
303303
"Province et territoire", # 2024 DRC BSSs
304+
"District (Departamiento/Municipio)", # 2019 GT06
304305
],
305306
[
306307
"Village",
@@ -313,6 +314,7 @@ def community_instances(context: AssetExecutionContext, config: BSSMetadataConfi
313314
"Commune et village", # 2023 Mali BSSs
314315
"Quartier",
315316
"Quartier/Secteur",
317+
"Village ou Fokotony", # 2017 MG BSSs
316318
],
317319
["Interview number:", "Numéro d'entretien", "Numero d'entretien"],
318320
["Interviewers", "Enquetêur(s)", "Intervieweurs"],

pipelines/assets/livelihood_activity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ def get_livelihood_activity_regexes() -> list:
244244
"nbr_pattern": r"(?:n[bo]?r?e?|no)\.?",
245245
"vendu_pattern": r"(?:quantité )?vendu(?:e|s|ss|es|ses)?",
246246
"separator_pattern": r" ?[:-]?",
247+
"name_of_local_measure_pattern": r"(?:name of (?:meas(?:ure)?\.?)|nom(?: (?:de la mesure(?: locale)?|de mesure locale|du mesure|d'unité|mesure locale|unité de mesure))?)",
247248
}
248249
# Compile the regexes
249250
compiled_regexes = []

pipelines/assets/livelihood_activity_regexes.json

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@
194194
"times_per_month"
195195
],
196196
[
197-
"name of measure",
197+
"{name_of_local_measure_pattern}",
198198
null,
199199
false,
200200
"name_of_local_measure"
@@ -295,12 +295,6 @@
295295
false,
296296
"times_per_month"
297297
],
298-
[
299-
"(?:nom de la mesure|nom du mesure|nom d'unité|nom mesure locale)",
300-
null,
301-
false,
302-
"name_of_local_measure"
303-
],
304298
[
305299
"(?:poids? de la mesure|poids? du mesure|poids? d'unité)",
306300
null,
@@ -404,7 +398,7 @@
404398
"lactation_days"
405399
],
406400
[
407-
"no. milking animals",
401+
"(?:no\\.|no|number) milking animals",
408402
null,
409403
false,
410404
"milking_animals"
@@ -572,7 +566,19 @@
572566
"quantity_sold"
573567
],
574568
[
575-
"{product_pattern}: name of meas\\.",
569+
"autre culture(?: de rente)?{separator_pattern} (?:type|nom)",
570+
null,
571+
true,
572+
"product__name"
573+
],
574+
[
575+
"(?:autre nouriture de base|autre legumineuse|autre culture)?{separator_pattern} ?\\(?{product_pattern}\\)?{separator_pattern} ?{name_of_local_measure_pattern}",
576+
null,
577+
true,
578+
"name_of_local_measure"
579+
],
580+
[
581+
"{product_pattern}{separator_pattern} {name_of_local_measure_pattern}:?\\.?",
576582
null,
577583
true,
578584
"name_of_local_measure"
@@ -608,19 +614,19 @@
608614
null
609615
],
610616
[
611-
"{nbr_pattern} (d')?animaux laitiers",
617+
"{nbr_pattern} (?:d')?animaux (?:laitiers|lactants)",
612618
null,
613619
false,
614620
"milking_animals"
615621
],
616622
[
617-
"(?P<season>saison 1): {nbr_pattern} animaux (?:laitiers|lactants)",
623+
"(?P<season>saison (?:1|pluvieuse|hivernage)): {nbr_pattern} (?:d')?animaux (?:laitiers|lactants)",
618624
"MilkProduction",
619625
false,
620626
"milking_animals"
621627
],
622628
[
623-
"(?P<season>saison 2): {nbr_pattern} animaux (?:laitiers|lactants)",
629+
"(?P<season>saison (?:2|seche|sèche)): {nbr_pattern} (?:d')?animaux (?:laitiers|lactants)",
624630
"MilkProduction",
625631
true,
626632
"milking_animals"
@@ -829,12 +835,6 @@
829835
false,
830836
"number_of_local_measures"
831837
],
832-
[
833-
"autre culture(?: de rente)?{separator_pattern} (?:type|nom)",
834-
null,
835-
true,
836-
"product__name"
837-
],
838838
[
839839
"autre culture{separator_pattern} \\(?{product_pattern}\\)?(?: type)?",
840840
null,
@@ -919,12 +919,6 @@
919919
true,
920920
"expenditure"
921921
],
922-
[
923-
"(?:autre nouriture de base|autre legumineuse|autre culture)?{separator_pattern} ?\\(?{product_pattern}\\)?{separator_pattern} ?(?:nom de la mesure locale?|nom mesure locale|nom du mesure|nom unité de mesure|nom)",
924-
null,
925-
true,
926-
"name_of_local_measure"
927-
],
928922
[
929923
"{product_pattern} achetée?: quantité ?\\(?{unit_of_measure_pattern}\\)?",
930924
null,

pipelines/jobs/metadata.py

Lines changed: 110 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -513,58 +513,117 @@ def load_all_community_aliases(context: OpExecutionContext):
513513
def load_all_fewsnet_geographies(context: OpExecutionContext):
514514
"""
515515
Load all Livelihood Zone Baseline geographies from the FEWS NET Data Warehouse via the API.
516+
Reads the 'Metadata' sheet from the BSS Metadata Google Sheet to get the fnid for each baseline.
516517
"""
517-
baseline_countries = (
518-
LivelihoodZoneBaseline.objects.all()
519-
.values_list("livelihood_zone__country__iso3166a2", flat=True)
520-
.order_by("livelihood_zone__country__iso3166a2")
521-
.distinct()
522-
)
523-
all_geometries = {}
524-
for iso3166a2 in baseline_countries:
525-
response = requests.get(
526-
f"https://fdw.fews.net/api/feature/?format=geojson&unit_type=livelihood_zone&ordering=fnid&country_code={iso3166a2}"
527-
)
528-
response.raise_for_status()
529-
srid = int(response.json()["crs"]["properties"]["name"].split(":")[-1])
530-
for feature in response.json()["features"]:
531-
# Also save the geometry for the Livelihood Zone Baseline
532-
all_geometries[
533-
(
534-
feature["properties"]["attributes"]["EFF_YEAR"],
535-
feature["properties"]["attributes"]["LZCODE"],
536-
)
537-
] = feature
538-
539-
for livelihood_zone_baseline in LivelihoodZoneBaseline.objects.filter(
540-
livelihood_zone__country_id=iso3166a2
541-
).order_by(
542-
"livelihood_zone__country__iso3166a2",
543-
"reference_year_end_date",
544-
"livelihood_zone__code",
545-
):
546-
for feature in all_geometries.values():
547-
start_date = (
548-
datetime.date.fromisoformat(feature["properties"]["start_date"])
549-
if feature["properties"]["start_date"]
550-
else datetime.date.min
551-
)
552-
end_date = (
553-
datetime.date.fromisoformat(feature["properties"]["end_date"])
554-
if feature["properties"]["end_date"]
555-
else datetime.date.max
556-
)
557-
if (
558-
feature["properties"]["attributes"]["LZCODE"] == livelihood_zone_baseline.livelihood_zone.code
559-
) and (start_date <= livelihood_zone_baseline.valid_from_date <= end_date):
560-
geometry = GEOSGeometry(json.dumps(feature["geometry"]), srid=srid)
561-
if isinstance(geometry, Polygon):
562-
geometry = MultiPolygon(geometry)
563-
livelihood_zone_baseline.geography = geometry
564-
livelihood_zone_baseline.save()
565-
context.log.info(f"Updated geometry for {livelihood_zone_baseline}")
566-
continue
567-
context.log.warning(f"Failed to find FEWS NET geometry for {livelihood_zone_baseline}")
518+
# Read the 'Metadata' sheet from the BSS Metadata Google Sheet
519+
storage_options = {"token": "service_account", "access": "read_only", "root_file_id": "0AOJ0gJ8sjnO7Uk9PVA"}
520+
storage_options["creds"] = json.loads(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
521+
p = UPath("gdrive://Database Design/BSS Metadata", **storage_options)
522+
523+
with p.fs.open(p.path, mode="rb", cache_type="bytes") as f:
524+
# Google Sheets have to be exported rather than read directly
525+
if isinstance(f, GoogleDriveFile) and (f.details["mimeType"] == "application/vnd.google-apps.spreadsheet"):
526+
f = BytesIO(p.fs.export(p.path, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"))
527+
528+
# Read the Metadata sheet with columns: code, reference_year_end_date, fnid
529+
metadata_df = pd.read_excel(f, sheet_name="Metadata", engine="openpyxl")
530+
531+
# Filter out rows without fnid
532+
metadata_df = metadata_df[metadata_df["fnid"].notna()]
533+
534+
context.log.info(f"Found {len(metadata_df)} baseline zones with fnid in Metadata sheet")
535+
536+
# Process each baseline zone
537+
for _, row in metadata_df.iterrows():
538+
zone_code = row["code"]
539+
reference_year_end_date = pd.to_datetime(row["reference_year_end_date"]).date()
540+
fnid = row["fnid"]
541+
542+
try:
543+
livelihood_zone_baseline = LivelihoodZoneBaseline.objects.get(
544+
livelihood_zone__code=zone_code, reference_year_end_date=reference_year_end_date
545+
)
546+
except LivelihoodZoneBaseline.DoesNotExist:
547+
context.log.warning(
548+
f"Baseline not found for code={zone_code}, reference_year_end_date={reference_year_end_date}"
549+
)
550+
continue
551+
except LivelihoodZoneBaseline.MultipleObjectsReturned:
552+
context.log.error(
553+
f"Multiple baselines found for code={zone_code}, reference_year_end_date={reference_year_end_date}"
554+
)
555+
continue
556+
557+
try:
558+
response = requests.get(
559+
f"https://fdw.fews.net/api/feature/?format=geojson&unit_type=livelihood_zone&fnid={fnid}&fields=with_population&demography_year={livelihood_zone_baseline.reference_year_end_date.year}"
560+
)
561+
response.raise_for_status()
562+
563+
geojson_data = response.json()
564+
565+
if not geojson_data.get("features"):
566+
context.log.warning(f"No features found for fnid={fnid}, baseline={livelihood_zone_baseline}")
567+
continue
568+
569+
srid = int(geojson_data["crs"]["properties"]["name"].split(":")[-1])
570+
571+
feature = geojson_data["features"][0]
572+
573+
# Convert to Django geometry
574+
geometry = GEOSGeometry(json.dumps(feature["geometry"]), srid=srid)
575+
if isinstance(geometry, Polygon):
576+
geometry = MultiPolygon(geometry)
577+
578+
# Update the baseline with the geometry
579+
livelihood_zone_baseline.geography = geometry
580+
581+
# Extract population estimate if available
582+
population_estimate = feature["properties"].get("estimated_population")
583+
if population_estimate is not None:
584+
try:
585+
livelihood_zone_baseline.population_estimate = int(population_estimate)
586+
# The population source is read from the geographicunit metadata URL
587+
unit_id = geojson_data["features"][0].get("id")
588+
metadata_url = f"https://fdw.fews.net/api/geographicunit/metadata/?id={unit_id}&format=json&fields=with_population"
589+
context.log.info(f"Fetching population metadata from {metadata_url}")
590+
metadata_response = requests.get(metadata_url)
591+
metadata_response.raise_for_status()
592+
593+
metadata_json = metadata_response.json()
594+
595+
# Extract population source from metadata
596+
population_source = None
597+
if "metadata" in metadata_json and "Population" in metadata_json["metadata"]:
598+
population_list = metadata_json["metadata"]["Population"]
599+
for pop_item in population_list:
600+
if pop_item.get("Name") == "Population distribution":
601+
population_source = pop_item.get("Description")
602+
context.log.info(f"Found population source: {population_source}")
603+
break
604+
605+
if population_source:
606+
livelihood_zone_baseline.population_source = population_source
607+
else:
608+
context.log.warning(
609+
f"No population source found in metadata for {livelihood_zone_baseline} (fnid={fnid})"
610+
)
611+
except (ValueError, TypeError) as e:
612+
context.log.warning(f"Invalid population data for {livelihood_zone_baseline} (fnid={fnid}): {e}")
613+
614+
livelihood_zone_baseline.save()
615+
616+
log_msg = f"Updated geometry for {livelihood_zone_baseline} (fnid={fnid})"
617+
if population_estimate is not None:
618+
log_msg += f" with population={livelihood_zone_baseline.population_estimate}"
619+
context.log.info(log_msg)
620+
621+
except requests.RequestException as e:
622+
context.log.error(f"Failed to fetch geography for fnid={fnid}, baseline={livelihood_zone_baseline}: {e}")
623+
except (KeyError, ValueError, json.JSONDecodeError) as e:
624+
context.log.error(
625+
f"Failed to parse geography response for fnid={fnid}, baseline={livelihood_zone_baseline}: {e}"
626+
)
568627

569628

570629
@job

0 commit comments

Comments
 (0)