8585 MilkProduction ,
8686)
8787from common .lookups import ClassifiedProductLookup , UnitOfMeasureLookup # NOQA: E402
88+ from common .models import ClassifiedProduct # NOQA: E402
8889from metadata .lookups import SeasonNameLookup # NOQA: E402
8990from metadata .models import ( # NOQA: E402
9091 ActivityLabel ,
@@ -272,10 +273,12 @@ def get_livelihood_activity_regular_expression_attributes(label: str) -> dict:
272273 label = prepare_lookup (label )
273274 attributes = {
274275 "activity_label" : None ,
276+ "status" : None ,
275277 "strategy_type" : None ,
276278 "is_start" : None ,
277279 "product_id" : None ,
278280 "unit_of_measure_id" : None ,
281+ "currency_id" : None ,
279282 "season" : None ,
280283 "additional_identifier" : None ,
281284 "household_labor_provider" : None ,
@@ -301,6 +304,7 @@ def get_livelihood_activity_regular_expression_attributes(label: str) -> dict:
301304 break
302305
303306 attributes ["activity_label" ] = label
307+ attributes ["status" ] = ActivityLabel .LabelStatus .REGULAR_EXPRESSION
304308 attributes ["strategy_type" ] = strategy_type
305309 attributes ["is_start" ] = is_start
306310 if isinstance (attribute , dict ):
@@ -333,10 +337,12 @@ def get_livelihood_activity_label_map(activity_type: str) -> dict[str, dict]:
333337 status = ActivityLabel .LabelStatus .OVERRIDE , activity_type = activity_type
334338 ).values (
335339 "activity_label" ,
340+ "status" ,
336341 "strategy_type" ,
337342 "is_start" ,
338343 "product_id" ,
339344 "unit_of_measure_id" ,
345+ "currency_id" ,
340346 "season" ,
341347 "additional_identifier" ,
342348 "attribute" ,
@@ -435,8 +441,8 @@ def get_all_label_attributes(
435441 ["season_original" , "livelihood_zone_id" ]
436442 ].apply (
437443 lambda x : (
438- f"{ x ['season_original' ]} ({ x ['livelihood_zone_id' ]} )"
439- if x ["season_original" ]
444+ f"{ x ['season_original' ]. strip () } ({ x ['livelihood_zone_id' ]} )"
445+ if x ["season_original" ]. strip ()
440446 else x ["season_original" ]
441447 ),
442448 axis = 1 ,
@@ -446,7 +452,7 @@ def get_all_label_attributes(
446452 all_label_attributes ["season" ] = all_label_attributes ["zone_season" ].fillna (all_label_attributes ["season" ])
447453 # Drop the intermediate columns used for the lookup
448454 all_label_attributes = all_label_attributes .drop (columns = ["zone_season" , "zone_season_original" ])
449- all_label_attributes ["season" ] = all_label_attributes ["season" ].replace (pd .NA , None )
455+ all_label_attributes ["season" ] = all_label_attributes ["season" ].astype ( object ). replace (pd .NA , None )
450456
451457 # Make sure we keep the same index so we can match by row number
452458 all_label_attributes .index = labels .index
@@ -458,22 +464,23 @@ def get_all_label_attributes(
458464 # `additional_identifier` so that we can differentiate between aggregate Livelihood Strategies under a high-level
459465 # Classified Product, e.g. Skilled Labor, without losing the specific text that was provided in the BSS.
460466 product_labels = labels [all_label_attributes ["activity_label" ] == "" ].to_frame (name = "label" )
461- product_labels = classifiedproductlookup .do_lookup (product_labels , "label" , "product_id" )
462- # Set the activity_label so that get_instances_from_dataframe() doesn't treat these as unrecognized labels
463- product_labels .loc [product_labels ["product_id" ].notna (), "activity_label" ] = product_labels .loc [
464- product_labels ["product_id" ].notna (), "label"
465- ]
466- # Set the additional_identifier so that we can differentiate between different labels that map to the same product
467- product_labels .loc [product_labels ["product_id" ].notna (), "additional_identifier" ] = product_labels .loc [
468- product_labels ["product_id" ].notna (), "label"
469- ]
470- # Labels that contain just a product name or alias and weren't recognized by an Activity Label or regular
471- # expression always indicate a new Livelihood Strategy with the values in the row containing income or expenditure,
472- # depending on the Livelihood Strategy.
473- product_labels .loc [product_labels ["product_id" ].notna (), "is_start" ] = True
474- product_labels .loc [product_labels ["product_id" ].notna (), "attribute" ] = "income_or_expenditure"
475- # Copy the product labels back into the main dataframe
476- all_label_attributes .update (product_labels .drop (columns = ["label" ]))
467+ if not product_labels .empty :
468+ product_labels = classifiedproductlookup .do_lookup (product_labels , "label" , "product_id" )
469+ # Set the activity_label so that get_instances_from_dataframe() doesn't treat these as unrecognized labels
470+ product_labels .loc [product_labels ["product_id" ].notna (), "activity_label" ] = product_labels .loc [
471+ product_labels ["product_id" ].notna (), "label"
472+ ]
473+ # Set the additional_identifier so that we can differentiate between different labels that map to the same product
474+ product_labels .loc [product_labels ["product_id" ].notna (), "additional_identifier" ] = product_labels .loc [
475+ product_labels ["product_id" ].notna (), "label"
476+ ]
477+ # Labels that contain just a product name or alias and weren't recognized by an Activity Label or regular
478+ # expression always indicate a new Livelihood Strategy with the values in the row containing income or expenditure,
479+ # depending on the Livelihood Strategy.
480+ product_labels .loc [product_labels ["product_id" ].notna (), "is_start" ] = True
481+ product_labels .loc [product_labels ["product_id" ].notna (), "attribute" ] = "income_or_expenditure"
482+ # Copy the product labels back into the main dataframe
483+ all_label_attributes .update (product_labels .drop (columns = ["label" ]))
477484
478485 return all_label_attributes
479486
@@ -482,46 +489,66 @@ def get_all_label_attributes(
482489def livelihood_activity_label_recognition_dataframe (
483490 context : AssetExecutionContext ,
484491 config : BSSMetadataConfig ,
485- all_livelihood_activity_labels_dataframe : pd .DataFrame ,
486- all_other_cash_income_labels_dataframe : pd .DataFrame ,
487- all_wild_foods_labels_dataframe : pd .DataFrame ,
488- all_livelihood_summary_labels_dataframe : pd .DataFrame ,
492+ livelihood_activity_label_dataframe : dict [ str , pd .DataFrame ] ,
493+ other_cash_income_label_dataframe : dict [ str , pd .DataFrame ] ,
494+ wild_foods_label_dataframe : dict [ str , pd .DataFrame ] ,
495+ livelihood_summary_label_dataframe : dict [ str , pd .DataFrame ] ,
489496) -> Output [dict [str , pd .DataFrame ]]:
490497 """
491498 A saved spreadsheet showing how each BSS label is recognized, either from the ActivityLabel model or a regex.
492499 """
493- all_livelihood_activity_labels_dataframe ["activity_type" ] = (
494- ActivityLabel .LivelihoodActivityType .LIVELIHOOD_ACTIVITY
495- )
496- all_other_cash_income_labels_dataframe ["activity_type" ] = ActivityLabel .LivelihoodActivityType .OTHER_CASH_INCOME
497- all_wild_foods_labels_dataframe ["activity_type" ] = ActivityLabel .LivelihoodActivityType .WILD_FOODS
498- all_livelihood_summary_labels_dataframe ["activity_type" ] = ActivityLabel .LivelihoodActivityType .LIVELIHOOD_SUMMARY
499-
500500 # Build a dataframe of all the Activity Labels from all BSSs, including the attributes recognized from the labels,
501501 # including any labels that are matched directly as Products.
502+ product_name_map = dict (ClassifiedProduct .objects .values_list ("cpc" , "common_name_en" ).order_by ("cpc" ))
502503 all_labels_df = pd .DataFrame ()
503- for summary_label_df in [
504- all_livelihood_activity_labels_dataframe ,
505- all_other_cash_income_labels_dataframe ,
506- all_wild_foods_labels_dataframe ,
507- all_livelihood_summary_labels_dataframe ,
504+ for activity_type , bss_label_dataframes in [
505+ (
506+ ActivityLabel .LivelihoodActivityType .LIVELIHOOD_ACTIVITY ,
507+ livelihood_activity_label_dataframe ,
508+ ),
509+ (
510+ ActivityLabel .LivelihoodActivityType .OTHER_CASH_INCOME ,
511+ other_cash_income_label_dataframe ,
512+ ),
513+ (
514+ ActivityLabel .LivelihoodActivityType .WILD_FOODS ,
515+ wild_foods_label_dataframe ,
516+ ),
517+ (
518+ ActivityLabel .LivelihoodActivityType .LIVELIHOOD_SUMMARY ,
519+ livelihood_summary_label_dataframe ,
520+ ),
508521 ]:
509- recognized_attributes_df = get_all_label_attributes (
510- summary_label_df ["label" ],
511- summary_label_df ["activity_type" ].iloc [0 ],
512- country_code = None , # We don't need the Season lookup for this asset
513- )
514- # Join the recognized attributes to the label dataframe
515- summary_label_df = summary_label_df .join (
516- recognized_attributes_df ,
517- how = "left" ,
518- )
519- all_labels_df = pd .concat ([all_labels_df , summary_label_df ], ignore_index = True )
522+ for bss , label_df in bss_label_dataframes .items ():
523+ if label_df .empty :
524+ continue
525+ livelihood_zone_baseline = LivelihoodZoneBaseline .objects .get_by_natural_key (* bss .split ("~" )[1 :])
526+ recognized_attributes_df = get_all_label_attributes (
527+ label_df ["label" ],
528+ activity_type ,
529+ country_code = livelihood_zone_baseline .livelihood_zone .country_id ,
530+ livelihood_zone_id = livelihood_zone_baseline .livelihood_zone_id ,
531+ )
532+ recognized_attributes_df ["product_name" ] = (
533+ recognized_attributes_df ["product_id" ].map (product_name_map ).fillna ("" )
534+ )
535+
536+ # Join the recognized attributes to the label dataframe
537+ label_df ["activity_type" ] = activity_type
538+ label_df = label_df .join (
539+ recognized_attributes_df ,
540+ how = "left" ,
541+ )
542+ all_labels_df = pd .concat ([all_labels_df , label_df ], ignore_index = True )
520543
521544 # Add the regular expressions
522545 regex_attributes_df = pd .DataFrame .from_records (
523546 all_labels_df ["label" ].astype (str ).map (get_livelihood_activity_regular_expression_attributes )
524547 )
548+ regex_attributes_df = ClassifiedProductLookup (require_match = False ).do_lookup (
549+ regex_attributes_df , "product_id" , "product_id"
550+ )
551+ regex_attributes_df ["product_name" ] = regex_attributes_df ["product_id" ].map (product_name_map ).fillna ("" )
525552 all_labels_df = all_labels_df .join (
526553 regex_attributes_df ,
527554 how = "left" ,
@@ -546,6 +573,7 @@ def livelihood_activity_label_recognition_dataframe(
546573 "notes" ,
547574 )
548575 )
576+ db_labels_df ["product_name" ] = db_labels_df ["product_id" ].map (product_name_map ).fillna ("" )
549577 all_labels_df = all_labels_df .join (
550578 db_labels_df .set_index (["label_lower" , "activity_type" ]),
551579 on = ("label_lower" , "activity_type" ),
@@ -555,11 +583,9 @@ def livelihood_activity_label_recognition_dataframe(
555583 )
556584
557585 # Create a deduplicated dataframe of all of the labels
558- summary_label_df = all_labels_df .sort_values (by = ["label_lower" , "row_number" , "bss" ])
586+ summary_label_df = all_labels_df .sort_values (by = ["label_lower" , "activity_type" , " row_number" , "bss" ])
559587 summary_label_df = (
560- summary_label_df .groupby (
561- "label_lower" ,
562- )
588+ summary_label_df .groupby (["label_lower" , "activity_type" ])
563589 .agg (
564590 langs = (
565591 "lang" ,
@@ -576,42 +602,81 @@ def livelihood_activity_label_recognition_dataframe(
576602 .reset_index ()
577603 )
578604 summary_label_df = summary_label_df .sort_values (
579- by = ["min_row_number" , "label_lower" , "bss_for_min_row" , "bss_for_max_row" ]
605+ by = ["min_row_number" , "activity_type" , " label_lower" , "bss_for_min_row" , "bss_for_max_row" ]
580606 )
581607 summary_label_df = summary_label_df .rename (
582608 columns = {"label_lower" : "label" , "datapoint_count_sum" : "datapoint_count" , "in_summary_sum" : "summary_count" }
583609 )
610+ # Add an empty translation column, to match the structure of the ReferenceData label worksheets
611+ summary_label_df ["translation" ] = ""
612+ # For the summary labels repeat the attribute lookup, but without the country-specific season lookup
613+ summary_label_dfs = []
614+ for activity_type in summary_label_df ["activity_type" ].unique ():
615+ activity_type_label_df = summary_label_df [summary_label_df ["activity_type" ] == activity_type ]
616+ recognized_attributes_df = get_all_label_attributes (
617+ activity_type_label_df ["label" ],
618+ activity_type ,
619+ country_code = None ,
620+ livelihood_zone_id = None ,
621+ )
622+ activity_type_label_df = activity_type_label_df .join (
623+ recognized_attributes_df ,
624+ how = "left" ,
625+ )
626+ activity_type_label_df ["product_name" ] = activity_type_label_df ["product_id" ].map (product_name_map ).fillna ("" )
627+ summary_label_dfs .append (activity_type_label_df )
628+
629+ # Concatenate all the activity type dataframes back into a single dataframe and put
630+ # the columns in the correct order to match the ReferenceData activity label worksheets
631+ summary_label_df = pd .concat (summary_label_dfs , ignore_index = True )[
632+ [
633+ "label" ,
634+ "langs" ,
635+ "datapoint_count" ,
636+ "summary_count" ,
637+ "unique_bss_count" ,
638+ "min_row_number" ,
639+ "max_row_number" ,
640+ "bss_for_min_row" ,
641+ "bss_for_max_row" ,
642+ "translation" ,
643+ "activity_type" ,
644+ "status" ,
645+ "is_start" ,
646+ "strategy_type" ,
647+ "attribute" ,
648+ "product_name" ,
649+ "unit_of_measure_id_original" ,
650+ "currency_id" ,
651+ "season" ,
652+ "additional_identifier" ,
653+ "notes" ,
654+ "household_labor_provider" ,
655+ "product_id" ,
656+ "activity_label" ,
657+ ]
658+ ]
659+
660+ # Join the summary_label_df to the all_labels_df to add the attributes for the recognized labels.
584661 summary_label_df = summary_label_df .join (
585662 all_labels_df [
586663 [
587664 "label_lower" ,
588- "activity_type" ,
589- "activity_label" ,
590- "strategy_type" ,
591- "is_start" ,
592- "product_id_original" ,
593- "unit_of_measure_id_original" ,
594- "season" ,
595- "additional_identifier" ,
596- "attribute" ,
597- "notes" ,
598- "product_id" ,
599- "unit_of_measure_id" ,
600665 "activity_label_regex" ,
601666 "strategy_type_regex" ,
602667 "is_start_regex" ,
603668 "product_id_regex" ,
669+ "product_name_regex" ,
604670 "unit_of_measure_id_regex" ,
605671 "season_regex" ,
606672 "additional_identifier_regex" ,
607673 "attribute_regex" ,
608674 "notes_regex" ,
609- "status" ,
610675 "strategy_type_db" ,
611676 "is_start_db" ,
612677 "product_id_db" ,
678+ "product_name_db" ,
613679 "unit_of_measure_id_db" ,
614- "currency_id" ,
615680 "season_db" ,
616681 "additional_identifier_db" ,
617682 "attribute_db" ,
0 commit comments