Skip to content

Commit e7ddbb8

Browse files
authored
Merge pull request #253 from American-Institutes-for-Research/HEA-196/fix_recognition_dataframe
Hea 196/fix recognition dataframe
2 parents b715d0a + 683829b commit e7ddbb8

3 files changed

Lines changed: 42 additions & 29 deletions

File tree

apps/common/lookups.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -505,29 +505,23 @@ def prepare_lookup_df(self):
505505
"""
506506
df = super().prepare_lookup_df()
507507

508-
# Create a DataFrame that just contains the single digits 0 through 9.
509-
df_digit = pd.DataFrame(range(10), columns=["digit"])
510-
511-
# Create a Cartesian product of df and df_digit
512-
df_all = pd.merge(df.assign(key=1), df_digit.assign(key=1), on="key").drop("key", axis=1)
513-
514-
# Create the "child_candidate" column that adds the 0-9 to the end of the lookup_value
515-
df_all["child_candidate"] = df_all["lookup_value"] + df_all["digit"].astype(str)
516-
517-
# Merge with the original DataFrame on the child_candidate column and lookup_key to find only the rows
518-
# that contain valid child codes where the lookup_key in the child is the same as the lookup_key in the parent.
519-
unwanted_parents = df_all.merge(
520-
df,
521-
left_on=["child_candidate", "lookup_key"],
522-
right_on=["lookup_value", "lookup_key"],
523-
suffixes=[None, "_child"],
524-
)
525-
526-
# Drop the extra columns so the shape matches the original dataframe.
527-
unwanted_parents = unwanted_parents[["lookup_value", "lookup_key"]]
528-
529-
# Drop the unwanted parents from the original DataFrame
530-
df = pd.concat([df, unwanted_parents]).drop_duplicates(keep=False)
508+
# Drop duplicate lookup_key entries where the lookup value is a leading substring of another lookup value for
509+
# the same lookup_key.
510+
# First, sort the rows by lookup_key and lookup_value, so that any parent rows will be immediately
511+
# followed by a child row (because a string sorts directly before the longer strings that start with it).
512+
df = df.sort_values(by=["lookup_key", "lookup_value"]).reset_index(drop=True)
513+
# Create a shifted version of the dataframe so that we can compare each row to the values in the next row
514+
# using the same index.
515+
next_row_df = df.shift(-1, fill_value="")
516+
# Create a boolean mask that identifies parent rows - i.e. rows where the lookup_value in the next row starts
517+
# with the lookup_value in the current row.
518+
parent_value_mask = [
519+
next_value.startswith(lookup_value)
520+
for lookup_value, next_value in zip(df["lookup_value"], next_row_df["lookup_value"])
521+
]
522+
# Drop any parent rows with an identical lookup_key to their child row.
523+
rows_to_drop = (df["lookup_key"] == next_row_df["lookup_key"]) & pd.Series(parent_value_mask)
524+
df = df[~rows_to_drop]
531525
return df
532526

533527

apps/common/tests/test_lookups.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,27 @@ def test_ignore_unwanted_parents(self):
4040
self.assertEqual(len(result_df), 1)
4141
self.assertEqual(result_df["cpc"][0], product.pk)
4242

43+
def test_ignore_unwanted_parents_using_suffix(self):
44+
ClassifiedProductFactory(
45+
cpc="P16200", description_en="Salt and pure sodium chloride; sea water Salt", common_name_en="Salt"
46+
)
47+
# Create a child with an alias matching the parent's common name, and a cpc that extends the parent's cpc
48+
product = ClassifiedProductFactory(
49+
cpc="P16200HA",
50+
description_en="Salt, salt and pepper, salt and condiments",
51+
common_name_en="Salt and condiments",
52+
aliases=[
53+
"salt",
54+
"salt and pepper",
55+
"sel et piment",
56+
],
57+
)
58+
df = pd.DataFrame({"product": ["salt"]})
59+
result_df = ClassifiedProductLookup().do_lookup(df, "product", "cpc")
60+
self.assertTrue("cpc" in result_df.columns)
61+
self.assertEqual(len(result_df), 1)
62+
self.assertEqual(result_df["cpc"][0], product.pk)
63+
4364
def test_excludes_r0113(self):
4465
# Create the unwanted product R0113 with a matching common name
4566
ClassifiedProductFactory(cpc="R0113", common_name_en="Rice", description_en="Rice")

pipelines/assets/livelihood_activity.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -458,12 +458,10 @@ def get_all_label_attributes(
458458
(all_label_attributes["season_original"] != "") & (all_label_attributes["season"].isna())
459459
]
460460
if not unrecognized_seasons_df.empty:
461-
raise ValueError(
462-
"Unrecognized seasons in labels:\n"
463-
+ unrecognized_seasons_df[
464-
["activity_label", "status", "strategy_type", "product_id", "season_original"]
465-
].to_markdown()
466-
)
461+
columns = ["activity_label", "status", "strategy_type", "product_id", "season_original", "country_id"]
462+
if livelihood_zone_id:
463+
columns.append("livelihood_zone_id")
464+
raise ValueError("Unrecognized seasons in labels:\n" + unrecognized_seasons_df[columns].to_markdown())
467465

468466
# Make sure we keep the same index so we can match by row number
469467
all_label_attributes.index = labels.index

0 commit comments

Comments
 (0)