Skip to content

Commit 2c66838

Browse files
committed
Ignore parents with duplicate lookup_keys in ClassifiedProductLookup - see HEA-98
This was done in HEA-98 initially, but that change only ignored duplicates within the main CPC hierarchy. This commit extends it so that it also ignores duplicates between a CPC code at the bottom of the official hierarchy and a custom code that adds a suffix below it.
1 parent 2050efd commit 2c66838

2 files changed

Lines changed: 38 additions & 23 deletions

File tree

apps/common/lookups.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -505,29 +505,23 @@ def prepare_lookup_df(self):
505505
"""
506506
df = super().prepare_lookup_df()
507507

508-
# Create a DataFrame that just contains the single digits 0 through 9.
509-
df_digit = pd.DataFrame(range(10), columns=["digit"])
510-
511-
# Create a Cartesian product of df and df_digit
512-
df_all = pd.merge(df.assign(key=1), df_digit.assign(key=1), on="key").drop("key", axis=1)
513-
514-
# Create the "child_candidate" column that adds the 0-9 to the end of the lookup_value
515-
df_all["child_candidate"] = df_all["lookup_value"] + df_all["digit"].astype(str)
516-
517-
# Merge with the original DataFrame on the child_candidate column and lookup_key to find only the rows
518-
# that contain valid child codes where the lookup_key in the child is the same as the lookup_key in the parent.
519-
unwanted_parents = df_all.merge(
520-
df,
521-
left_on=["child_candidate", "lookup_key"],
522-
right_on=["lookup_value", "lookup_key"],
523-
suffixes=[None, "_child"],
524-
)
525-
526-
# Drop the extra columns so the shape matches the original dataframe.
527-
unwanted_parents = unwanted_parents[["lookup_value", "lookup_key"]]
528-
529-
# Drop the unwanted parents from the original DataFrame
530-
df = pd.concat([df, unwanted_parents]).drop_duplicates(keep=False)
508+
# Drop duplicate lookup_key entries where the lookup value is a leading substring of another lookup value for
509+
# the same lookup_key.
510+
# First, sort the rows by lookup_key and lookup_value, so that any parent row will be immediately
511+
# followed by one of its child rows (because a string sorts directly before the longer strings that start with it).
512+
df = df.sort_values(by=["lookup_key", "lookup_value"]).reset_index(drop=True)
513+
# Create a shifted version of the dataframe so that we can compare each row to the values in the next row
514+
# using the same index.
515+
next_row_df = df.shift(-1, fill_value="")
516+
# Create a boolean mask that identifies parent rows - i.e. rows where the lookup_value in the next row starts
517+
# with the lookup_value in the current row.
518+
parent_value_mask = [
519+
next_value.startswith(lookup_value)
520+
for lookup_value, next_value in zip(df["lookup_value"], next_row_df["lookup_value"])
521+
]
522+
# Drop any parent rows with an identical lookup_key to their child row.
523+
rows_to_drop = (df["lookup_key"] == next_row_df["lookup_key"]) & pd.Series(parent_value_mask)
524+
df = df[~rows_to_drop]
531525
return df
532526

533527

apps/common/tests/test_lookups.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,27 @@ def test_ignore_unwanted_parents(self):
4040
self.assertEqual(len(result_df), 1)
4141
self.assertEqual(result_df["cpc"][0], product.pk)
4242

43+
def test_ignore_unwanted_parents_using_suffix(self):
44+
ClassifiedProductFactory(
45+
cpc="P16200", description_en="Salt and pure sodium chloride; sea water Salt", common_name_en="Salt"
46+
)
47+
# Create a child with a matching alias, but different cpc
48+
product = ClassifiedProductFactory(
49+
cpc="P16200HA",
50+
description_en="Salt, salt and pepper, salt and condiments",
51+
common_name_en="Salt and condiments",
52+
aliases=[
53+
"salt",
54+
"salt and pepper",
55+
"sel et piment",
56+
],
57+
)
58+
df = pd.DataFrame({"product": ["salt"]})
59+
result_df = ClassifiedProductLookup().do_lookup(df, "product", "cpc")
60+
self.assertTrue("cpc" in result_df.columns)
61+
self.assertEqual(len(result_df), 1)
62+
self.assertEqual(result_df["cpc"][0], product.pk)
63+
4364
def test_excludes_r0113(self):
4465
# Create the unwanted product R0113 with a matching common name
4566
ClassifiedProductFactory(cpc="R0113", common_name_en="Rice", description_en="Rice")

0 commit comments

Comments
 (0)