Skip to content

Commit de7c5f4

Browse files
committed
Improved livelihood_activity_label_recognition_dataframe - see HEA-196
Includes a product name lookup for product CPC codes and a season lookup for All Labels rows, and reorders the columns in Label Summary to match the Reference Data activity label worksheets.
1 parent b8c0676 commit de7c5f4

1 file changed

Lines changed: 130 additions & 65 deletions

File tree

pipelines/assets/livelihood_activity.py

Lines changed: 130 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
MilkProduction,
8686
)
8787
from common.lookups import ClassifiedProductLookup, UnitOfMeasureLookup # NOQA: E402
88+
from common.models import ClassifiedProduct # NOQA: E402
8889
from metadata.lookups import SeasonNameLookup # NOQA: E402
8990
from metadata.models import ( # NOQA: E402
9091
ActivityLabel,
@@ -272,10 +273,12 @@ def get_livelihood_activity_regular_expression_attributes(label: str) -> dict:
272273
label = prepare_lookup(label)
273274
attributes = {
274275
"activity_label": None,
276+
"status": None,
275277
"strategy_type": None,
276278
"is_start": None,
277279
"product_id": None,
278280
"unit_of_measure_id": None,
281+
"currency_id": None,
279282
"season": None,
280283
"additional_identifier": None,
281284
"household_labor_provider": None,
@@ -301,6 +304,7 @@ def get_livelihood_activity_regular_expression_attributes(label: str) -> dict:
301304
break
302305

303306
attributes["activity_label"] = label
307+
attributes["status"] = ActivityLabel.LabelStatus.REGULAR_EXPRESSION
304308
attributes["strategy_type"] = strategy_type
305309
attributes["is_start"] = is_start
306310
if isinstance(attribute, dict):
@@ -333,10 +337,12 @@ def get_livelihood_activity_label_map(activity_type: str) -> dict[str, dict]:
333337
status=ActivityLabel.LabelStatus.OVERRIDE, activity_type=activity_type
334338
).values(
335339
"activity_label",
340+
"status",
336341
"strategy_type",
337342
"is_start",
338343
"product_id",
339344
"unit_of_measure_id",
345+
"currency_id",
340346
"season",
341347
"additional_identifier",
342348
"attribute",
@@ -435,8 +441,8 @@ def get_all_label_attributes(
435441
["season_original", "livelihood_zone_id"]
436442
].apply(
437443
lambda x: (
438-
f"{x['season_original']} ({x['livelihood_zone_id']})"
439-
if x["season_original"]
444+
f"{x['season_original'].strip()} ({x['livelihood_zone_id']})"
445+
if x["season_original"].strip()
440446
else x["season_original"]
441447
),
442448
axis=1,
@@ -446,7 +452,7 @@ def get_all_label_attributes(
446452
all_label_attributes["season"] = all_label_attributes["zone_season"].fillna(all_label_attributes["season"])
447453
# Drop the intermediate columns used for the lookup
448454
all_label_attributes = all_label_attributes.drop(columns=["zone_season", "zone_season_original"])
449-
all_label_attributes["season"] = all_label_attributes["season"].replace(pd.NA, None)
455+
all_label_attributes["season"] = all_label_attributes["season"].astype(object).replace(pd.NA, None)
450456

451457
# Make sure we keep the same index so we can match by row number
452458
all_label_attributes.index = labels.index
@@ -458,22 +464,23 @@ def get_all_label_attributes(
458464
# `additional_identifier` so that we can differentiate between aggregate Livelihood Strategies under a high-level
459465
# Classified Product, e.g. Skilled Labor, without losing the specific text that was provided in the BSS.
460466
product_labels = labels[all_label_attributes["activity_label"] == ""].to_frame(name="label")
461-
product_labels = classifiedproductlookup.do_lookup(product_labels, "label", "product_id")
462-
# Set the activity_label so that get_instances_from_dataframe() doesn't treat these as unrecognized labels
463-
product_labels.loc[product_labels["product_id"].notna(), "activity_label"] = product_labels.loc[
464-
product_labels["product_id"].notna(), "label"
465-
]
466-
# Set the additional_identifier so that we can differentiate between different labels that map to the same product
467-
product_labels.loc[product_labels["product_id"].notna(), "additional_identifier"] = product_labels.loc[
468-
product_labels["product_id"].notna(), "label"
469-
]
470-
# Labels that contain just a product name or alias and weren't recognized by an Activity Label or regular
471-
# expression always indicate a new Livelihood Strategy with the values in the row containing income or expenditure,
472-
# depending on the Livelihood Strategy.
473-
product_labels.loc[product_labels["product_id"].notna(), "is_start"] = True
474-
product_labels.loc[product_labels["product_id"].notna(), "attribute"] = "income_or_expenditure"
475-
# Copy the product labels back into the main dataframe
476-
all_label_attributes.update(product_labels.drop(columns=["label"]))
467+
if not product_labels.empty:
468+
product_labels = classifiedproductlookup.do_lookup(product_labels, "label", "product_id")
469+
# Set the activity_label so that get_instances_from_dataframe() doesn't treat these as unrecognized labels
470+
product_labels.loc[product_labels["product_id"].notna(), "activity_label"] = product_labels.loc[
471+
product_labels["product_id"].notna(), "label"
472+
]
473+
# Set the additional_identifier so that we can differentiate between different labels that map to the same product
474+
product_labels.loc[product_labels["product_id"].notna(), "additional_identifier"] = product_labels.loc[
475+
product_labels["product_id"].notna(), "label"
476+
]
477+
# Labels that contain just a product name or alias and weren't recognized by an Activity Label or regular
478+
# expression always indicate a new Livelihood Strategy with the values in the row containing income or expenditure,
479+
# depending on the Livelihood Strategy.
480+
product_labels.loc[product_labels["product_id"].notna(), "is_start"] = True
481+
product_labels.loc[product_labels["product_id"].notna(), "attribute"] = "income_or_expenditure"
482+
# Copy the product labels back into the main dataframe
483+
all_label_attributes.update(product_labels.drop(columns=["label"]))
477484

478485
return all_label_attributes
479486

@@ -482,46 +489,66 @@ def get_all_label_attributes(
482489
def livelihood_activity_label_recognition_dataframe(
483490
context: AssetExecutionContext,
484491
config: BSSMetadataConfig,
485-
all_livelihood_activity_labels_dataframe: pd.DataFrame,
486-
all_other_cash_income_labels_dataframe: pd.DataFrame,
487-
all_wild_foods_labels_dataframe: pd.DataFrame,
488-
all_livelihood_summary_labels_dataframe: pd.DataFrame,
492+
livelihood_activity_label_dataframe: dict[str, pd.DataFrame],
493+
other_cash_income_label_dataframe: dict[str, pd.DataFrame],
494+
wild_foods_label_dataframe: dict[str, pd.DataFrame],
495+
livelihood_summary_label_dataframe: dict[str, pd.DataFrame],
489496
) -> Output[dict[str, pd.DataFrame]]:
490497
"""
491498
A saved spreadsheet showing how each BSS label is recognized, either from the ActivityLabel model or a regex.
492499
"""
493-
all_livelihood_activity_labels_dataframe["activity_type"] = (
494-
ActivityLabel.LivelihoodActivityType.LIVELIHOOD_ACTIVITY
495-
)
496-
all_other_cash_income_labels_dataframe["activity_type"] = ActivityLabel.LivelihoodActivityType.OTHER_CASH_INCOME
497-
all_wild_foods_labels_dataframe["activity_type"] = ActivityLabel.LivelihoodActivityType.WILD_FOODS
498-
all_livelihood_summary_labels_dataframe["activity_type"] = ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY
499-
500500
# Build a dataframe of all the Activity Labels from all BSSs, with the attributes recognized from the labels,
501501
# including any labels that are matched directly as Products.
502+
product_name_map = dict(ClassifiedProduct.objects.values_list("cpc", "common_name_en").order_by("cpc"))
502503
all_labels_df = pd.DataFrame()
503-
for summary_label_df in [
504-
all_livelihood_activity_labels_dataframe,
505-
all_other_cash_income_labels_dataframe,
506-
all_wild_foods_labels_dataframe,
507-
all_livelihood_summary_labels_dataframe,
504+
for activity_type, bss_label_dataframes in [
505+
(
506+
ActivityLabel.LivelihoodActivityType.LIVELIHOOD_ACTIVITY,
507+
livelihood_activity_label_dataframe,
508+
),
509+
(
510+
ActivityLabel.LivelihoodActivityType.OTHER_CASH_INCOME,
511+
other_cash_income_label_dataframe,
512+
),
513+
(
514+
ActivityLabel.LivelihoodActivityType.WILD_FOODS,
515+
wild_foods_label_dataframe,
516+
),
517+
(
518+
ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY,
519+
livelihood_summary_label_dataframe,
520+
),
508521
]:
509-
recognized_attributes_df = get_all_label_attributes(
510-
summary_label_df["label"],
511-
summary_label_df["activity_type"].iloc[0],
512-
country_code=None, # We don't need the Season lookup for this asset
513-
)
514-
# Join the recognized attributes to the label dataframe
515-
summary_label_df = summary_label_df.join(
516-
recognized_attributes_df,
517-
how="left",
518-
)
519-
all_labels_df = pd.concat([all_labels_df, summary_label_df], ignore_index=True)
522+
for bss, label_df in bss_label_dataframes.items():
523+
if label_df.empty:
524+
continue
525+
livelihood_zone_baseline = LivelihoodZoneBaseline.objects.get_by_natural_key(*bss.split("~")[1:])
526+
recognized_attributes_df = get_all_label_attributes(
527+
label_df["label"],
528+
activity_type,
529+
country_code=livelihood_zone_baseline.livelihood_zone.country_id,
530+
livelihood_zone_id=livelihood_zone_baseline.livelihood_zone_id,
531+
)
532+
recognized_attributes_df["product_name"] = (
533+
recognized_attributes_df["product_id"].map(product_name_map).fillna("")
534+
)
535+
536+
# Join the recognized attributes to the label dataframe
537+
label_df["activity_type"] = activity_type
538+
label_df = label_df.join(
539+
recognized_attributes_df,
540+
how="left",
541+
)
542+
all_labels_df = pd.concat([all_labels_df, label_df], ignore_index=True)
520543

521544
# Add the regular expressions
522545
regex_attributes_df = pd.DataFrame.from_records(
523546
all_labels_df["label"].astype(str).map(get_livelihood_activity_regular_expression_attributes)
524547
)
548+
regex_attributes_df = ClassifiedProductLookup(require_match=False).do_lookup(
549+
regex_attributes_df, "product_id", "product_id"
550+
)
551+
regex_attributes_df["product_name"] = regex_attributes_df["product_id"].map(product_name_map).fillna("")
525552
all_labels_df = all_labels_df.join(
526553
regex_attributes_df,
527554
how="left",
@@ -546,6 +573,7 @@ def livelihood_activity_label_recognition_dataframe(
546573
"notes",
547574
)
548575
)
576+
db_labels_df["product_name"] = db_labels_df["product_id"].map(product_name_map).fillna("")
549577
all_labels_df = all_labels_df.join(
550578
db_labels_df.set_index(["label_lower", "activity_type"]),
551579
on=("label_lower", "activity_type"),
@@ -555,11 +583,9 @@ def livelihood_activity_label_recognition_dataframe(
555583
)
556584

557585
# Create a deduplicated dataframe of all of the labels
558-
summary_label_df = all_labels_df.sort_values(by=["label_lower", "row_number", "bss"])
586+
summary_label_df = all_labels_df.sort_values(by=["label_lower", "activity_type", "row_number", "bss"])
559587
summary_label_df = (
560-
summary_label_df.groupby(
561-
"label_lower",
562-
)
588+
summary_label_df.groupby(["label_lower", "activity_type"])
563589
.agg(
564590
langs=(
565591
"lang",
@@ -576,42 +602,81 @@ def livelihood_activity_label_recognition_dataframe(
576602
.reset_index()
577603
)
578604
summary_label_df = summary_label_df.sort_values(
579-
by=["min_row_number", "label_lower", "bss_for_min_row", "bss_for_max_row"]
605+
by=["min_row_number", "activity_type", "label_lower", "bss_for_min_row", "bss_for_max_row"]
580606
)
581607
summary_label_df = summary_label_df.rename(
582608
columns={"label_lower": "label", "datapoint_count_sum": "datapoint_count", "in_summary_sum": "summary_count"}
583609
)
610+
# Add an empty translation column, to match the structure of the ReferenceData label worksheets
611+
summary_label_df["translation"] = ""
612+
# For the summary labels repeat the attribute lookup, but without the country-specific season lookup
613+
summary_label_dfs = []
614+
for activity_type in summary_label_df["activity_type"].unique():
615+
activity_type_label_df = summary_label_df[summary_label_df["activity_type"] == activity_type]
616+
recognized_attributes_df = get_all_label_attributes(
617+
activity_type_label_df["label"],
618+
activity_type,
619+
country_code=None,
620+
livelihood_zone_id=None,
621+
)
622+
activity_type_label_df = activity_type_label_df.join(
623+
recognized_attributes_df,
624+
how="left",
625+
)
626+
activity_type_label_df["product_name"] = activity_type_label_df["product_id"].map(product_name_map).fillna("")
627+
summary_label_dfs.append(activity_type_label_df)
628+
629+
# Concatenate all the activity type dataframes back into a single dataframe and put
630+
# the columns in the correct order to match the ReferenceData activity label worksheets
631+
summary_label_df = pd.concat(summary_label_dfs, ignore_index=True)[
632+
[
633+
"label",
634+
"langs",
635+
"datapoint_count",
636+
"summary_count",
637+
"unique_bss_count",
638+
"min_row_number",
639+
"max_row_number",
640+
"bss_for_min_row",
641+
"bss_for_max_row",
642+
"translation",
643+
"activity_type",
644+
"status",
645+
"is_start",
646+
"strategy_type",
647+
"attribute",
648+
"product_name",
649+
"unit_of_measure_id_original",
650+
"currency_id",
651+
"season",
652+
"additional_identifier",
653+
"notes",
654+
"household_labor_provider",
655+
"product_id",
656+
"activity_label",
657+
]
658+
]
659+
660+
# Join the summary_label_df to the all_labels_df to add the attributes for the recognized labels.
584661
summary_label_df = summary_label_df.join(
585662
all_labels_df[
586663
[
587664
"label_lower",
588-
"activity_type",
589-
"activity_label",
590-
"strategy_type",
591-
"is_start",
592-
"product_id_original",
593-
"unit_of_measure_id_original",
594-
"season",
595-
"additional_identifier",
596-
"attribute",
597-
"notes",
598-
"product_id",
599-
"unit_of_measure_id",
600665
"activity_label_regex",
601666
"strategy_type_regex",
602667
"is_start_regex",
603668
"product_id_regex",
669+
"product_name_regex",
604670
"unit_of_measure_id_regex",
605671
"season_regex",
606672
"additional_identifier_regex",
607673
"attribute_regex",
608674
"notes_regex",
609-
"status",
610675
"strategy_type_db",
611676
"is_start_db",
612677
"product_id_db",
678+
"product_name_db",
613679
"unit_of_measure_id_db",
614-
"currency_id",
615680
"season_db",
616681
"additional_identifier_db",
617682
"attribute_db",

0 commit comments

Comments
 (0)