Skip to content

Commit 0fe4ab5

Browse files
committed
Fixes for zone-specific seasons in dairy Livelihood Strategies - see HEA-196
1 parent de7c5f4 commit 0fe4ab5

3 files changed

Lines changed: 112 additions & 23 deletions

File tree

apps/metadata/lookups.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ def prepare_lookup_df(self) -> pd.DataFrame:
6262
extra_dfs = []
6363
for country in all_countries:
6464
country_df = df[df["country_id"] == country]
65-
null_purpose_rows = country_df[country_df["purpose"] == ""]
65+
# Purpose isn't a foreign key, and has blank=True, null=True, so there may be rows with a null purpose
66+
# and others with purpose=="".
67+
null_purpose_rows = country_df[country_df["purpose"].isna() | (country_df["purpose"] == "")]
6668
for purpose in all_purposes:
6769
# Only add duplicate rows for purposes that aren't already defined for this country
6870
if purpose not in country_df["purpose"].unique():

pipelines/assets/livelihood_activity.py

Lines changed: 104 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ def get_all_label_attributes(
423423
# Convert the season alias to an actual Season.name, which is the natural key for a Season.
424424
# We only do this if the country_id is in the dataframe, so that we can use this function to test labels
425425
# outside the context of a BSS, e.g. in unit tests, without needing to define country-specific seasons.
426-
# The country_id is needed for an actual Season lookup because all the BSSs use Season 1 and Season 2 names for
426+
# The country_id is needed for an actual Season lookup because all the BSSs use Season 1 and Season 2 aliases for
427427
# the seasons and we need to know the Country (and maybe the Strategy Type) to limit the lookup to a small enough
428428
# set of rows that Season 1 and Season 2 can uniquely identify a specific Season.
429429
# The all_label_attributes dataframe should also contain a 'strategy_type' column, which will be used by the
@@ -453,6 +453,17 @@ def get_all_label_attributes(
453453
# Drop the intermediate columns used for the lookup
454454
all_label_attributes = all_label_attributes.drop(columns=["zone_season", "zone_season_original"])
455455
all_label_attributes["season"] = all_label_attributes["season"].astype(object).replace(pd.NA, None)
456+
# Check that we found an actual Season for any labels that contained a season alias
457+
unrecognized_seasons_df = all_label_attributes[
458+
(all_label_attributes["season_original"] != "") & (all_label_attributes["season"].isna())
459+
]
460+
if not unrecognized_seasons_df.empty:
461+
raise ValueError(
462+
"Unrecognized seasons in labels:\n"
463+
+ unrecognized_seasons_df[
464+
["activity_label", "status", "strategy_type", "product_id", "season_original"]
465+
].to_markdown()
466+
)
456467

457468
# Make sure we keep the same index so we can match by row number
458469
all_label_attributes.index = labels.index
@@ -756,15 +767,30 @@ def get_instances_from_dataframe(
756767
]
757768

758769
# Save the identifier for Season 2 because we need it when creating MilkProduction and ButterProduction instances
770+
# Try to find zone-specific seasons first, and fall back to national-level seasons if necessary.
759771
seasonnamelookup = SeasonNameLookup()
760-
dairy_season2_names = [
761-
seasonnamelookup.get(
762-
"Season 2", country_id=livelihood_zone_baseline.livelihood_zone.country_id, purpose="MilkProduction"
763-
),
764-
seasonnamelookup.get(
765-
"Season 2", country_id=livelihood_zone_baseline.livelihood_zone.country_id, purpose="ButterProduction"
766-
),
767-
]
772+
dairy_season2_names = []
773+
for purpose in ["MilkProduction", "ButterProduction"]:
774+
for alias in ["season 2", "saison 2", "2ème saison"]:
775+
# Try to find a zone-specific season first
776+
season_2 = seasonnamelookup.get(
777+
f"{alias} ({livelihood_zone_baseline.livelihood_zone_id})",
778+
country_id=livelihood_zone_baseline.livelihood_zone.country_id,
779+
purpose=purpose,
780+
)
781+
# Fall back to a general season if a zone-specific one isn't found
782+
if not season_2:
783+
season_2 = seasonnamelookup.get(
784+
alias, country_id=livelihood_zone_baseline.livelihood_zone.country_id, purpose=purpose
785+
)
786+
# If we found a season, then there is no need to continue looking for other aliases
787+
if season_2:
788+
break
789+
if not season_2:
790+
raise ValueError(
791+
"Could not find a Season matching 'Season 2' for purpose '%s' in BSS %s" % (purpose, partition_key)
792+
)
793+
dairy_season2_names.append(season_2)
768794

769795
# Prepare a lookup for ClassifiedProduct, so it caches and reuses the results of .get() lookups
770796
classifiedproductlookup = ClassifiedProductLookup()
@@ -1047,30 +1073,65 @@ def get_instances_from_dataframe(
10471073
for livelihood_activity in livelihood_activities_for_strategy
10481074
)
10491075
):
1050-
# Find the MilkProduction livelihood strategy
1076+
# Find the corresponding MilkProduction livelihood strategy
1077+
# First, find the equivalent MilkProduction season for the current ButterProduction strategy.
1078+
# Try to find a Zone-specific season first, and fall back to a general season if necessary.
1079+
milk_season = seasonnamelookup.get(
1080+
f'{livelihood_strategy["season_original"]} ({livelihood_zone_baseline.livelihood_zone_id})',
1081+
country_id=livelihood_zone_baseline.livelihood_zone.country_id,
1082+
purpose="MilkProduction",
1083+
)
1084+
if not milk_season:
1085+
milk_season = seasonnamelookup.get(
1086+
livelihood_strategy["season_original"],
1087+
country_id=livelihood_zone_baseline.livelihood_zone.country_id,
1088+
purpose="MilkProduction",
1089+
)
1090+
if not milk_season:
1091+
raise ValueError(
1092+
f"Could not find a MilkProduction Season matching '{livelihood_strategy['season_original']}' "
1093+
f"from ButterProduction strategy for season '{livelihood_strategy['season_original']}' "
1094+
f"({livelihood_strategy['season']}) at row {livelihood_strategy['bss_row']} from:\n"
1095+
)
1096+
# Next, find the candidate MilkProduction strategies
10511097
milk_strategy = None
1052-
for strategy in reversed(livelihood_strategies):
1098+
milk_strategies = [
1099+
strategy
1100+
for strategy in reversed(livelihood_strategies)
1101+
if strategy["strategy_type"] == "MilkProduction"
1102+
]
1103+
# Test each strategy in turn
1104+
for strategy in milk_strategies:
10531105
if (
1054-
strategy["strategy_type"] == "MilkProduction"
10551106
# Season for the current LivelihoodStrategy hasn't been converted to a natural key yet,
10561107
# so coerce it to a list for comparison
1057-
and strategy["season"]
1058-
== [
1059-
seasonnamelookup.get(
1060-
livelihood_strategy["season_original"],
1061-
country_id=livelihood_zone_baseline.livelihood_zone.country_id,
1062-
purpose="MilkProduction",
1063-
)
1064-
]
1108+
strategy["season"] == [milk_season]
10651109
and strategy["additional_identifier"]
10661110
== livelihood_strategy["additional_identifier"]
10671111
):
10681112
milk_strategy = strategy
10691113
break
10701114
if not milk_strategy:
1115+
# Keep only the required attributes so that the error message is clearer
1116+
milk_strategies = [
1117+
{
1118+
k: strategy[k]
1119+
for k in [
1120+
"bss_row",
1121+
"strategy_type",
1122+
"season_original",
1123+
"season",
1124+
"product_id",
1125+
"additional_identifier",
1126+
]
1127+
}
1128+
for strategy in milk_strategies
1129+
]
10711130
raise ValueError(
10721131
f"Could not find the MilkProduction Livelihood Strategy associated with "
1073-
f"the ButterProduction strategy at row {row}."
1132+
f"the ButterProduction strategy for season '{livelihood_strategy['season_original']}' "
1133+
f"({livelihood_strategy['season']}) at row {livelihood_strategy['bss_row']} from:\n"
1134+
f"{'\n'.join([str(strategy) for strategy in milk_strategies])}"
10741135
)
10751136
milk_activities = {
10761137
activity["wealth_group"]: activity
@@ -1717,6 +1778,28 @@ def get_instances_from_dataframe(
17171778
}
17181779
if not unrecognized_labels.empty:
17191780
metadata["unrecognized_labels"] = MetadataValue.md(unrecognized_labels.to_markdown(index=False))
1781+
if livelihood_strategies and activity_type != ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY:
1782+
seasons_df = pd.DataFrame(
1783+
[
1784+
(
1785+
livelihood_strategy["strategy_type"],
1786+
livelihood_strategy["season_original"],
1787+
livelihood_strategy["season"][0] if livelihood_strategy["season"] else None,
1788+
)
1789+
for livelihood_strategy in livelihood_strategies
1790+
if livelihood_strategy.get("season") or livelihood_strategy.get("season_original")
1791+
],
1792+
columns=("strategy_type", "season_original", "season"),
1793+
).drop_duplicates()
1794+
if seasons_df["season"].isna().any():
1795+
metadata["unrecognized_seasons"] = MetadataValue.md(
1796+
seasons_df[seasons_df["season"].isna()].to_markdown(index=False)
1797+
)
1798+
1799+
if seasons_df["season"].notna().any():
1800+
metadata["recognized_seasons"] = MetadataValue.md(
1801+
seasons_df[seasons_df["season"].notna()].to_markdown(index=False)
1802+
)
17201803
metadata["pct_rows_recognized"] = round(
17211804
(
17221805
1

pipelines/jobs/metadata.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,11 @@ def load_metadata_for_model(context: OpExecutionContext, sheet_name: str, model:
4747
field.get_attname() for field in model._meta.concrete_fields if field.get_attname() not in valid_field_names
4848
]
4949
if "aliases" in df:
50-
df["aliases"] = df["aliases"].astype(object).apply(lambda x: sorted(x.lower().split("~")) if x else None)
50+
df["aliases"] = (
51+
df["aliases"]
52+
.astype(object)
53+
.apply(lambda x: sorted(alias.strip() for alias in x.lower().split("~")) if x else None)
54+
)
5155
if "cpcv2" in df:
5256
df["cpcv2"] = df["cpcv2"].astype(object).apply(lambda x: sorted(x.split("~")) if x else None)
5357
if "hs2012" in df:

0 commit comments

Comments
 (0)