@@ -505,29 +505,23 @@ def prepare_lookup_df(self):
505505 """
506506 df = super ().prepare_lookup_df ()
507507
508- # Create a DataFrame that just contains the single digits 0 through 9.
509- df_digit = pd .DataFrame (range (10 ), columns = ["digit" ])
510-
511- # Create a Cartesian product of df and df_digit
512- df_all = pd .merge (df .assign (key = 1 ), df_digit .assign (key = 1 ), on = "key" ).drop ("key" , axis = 1 )
513-
514- # Create the "child_candidate" column that adds the 0-9 to the end of the lookup_value
515- df_all ["child_candidate" ] = df_all ["lookup_value" ] + df_all ["digit" ].astype (str )
516-
517- # Merge with the original DataFrame on the child_candidate column and lookup_key to find only the rows
518- # that contain valid child codes where the lookup_key in the child is the same as the lookup_key in the parent.
519- unwanted_parents = df_all .merge (
520- df ,
521- left_on = ["child_candidate" , "lookup_key" ],
522- right_on = ["lookup_value" , "lookup_key" ],
523- suffixes = [None , "_child" ],
524- )
525-
526- # Drop the extra columns so the shape matches the original dataframe.
527- unwanted_parents = unwanted_parents [["lookup_value" , "lookup_key" ]]
528-
529- # Drop the unwanted parents from the original DataFrame
530- df = pd .concat ([df , unwanted_parents ]).drop_duplicates (keep = False )
508+ # Drop duplicate lookup_key entries where the lookup value is a leading substring of another lookup value for
509+ # the same lookup_key.
510+ # First, sort the rows by lookup_key and lookup_value, so that any parent row will be immediately
511+ # followed by one of its child rows (in lexicographic order a prefix sorts directly before the strings that extend it).
512+ df = df .sort_values (by = ["lookup_key" , "lookup_value" ]).reset_index (drop = True )
513+ # Create a shifted version of the dataframe so that we can compare each row to the values in the next row
514+ # using the same index.
515+ next_row_df = df .shift (- 1 , fill_value = "" )
516+ # Create a boolean mask that identifies parent rows - i.e. rows where the lookup_value in the next row starts
517+ # with the lookup_value in the current row.
518+ parent_value_mask = [
519+ next_value .startswith (lookup_value )
520+ for lookup_value , next_value in zip (df ["lookup_value" ], next_row_df ["lookup_value" ])
521+ ]
522+ # Drop any parent rows with an identical lookup_key to their child row.
523+ rows_to_drop = (df ["lookup_key" ] == next_row_df ["lookup_key" ]) & pd .Series (parent_value_mask )
524+ df = df [~ rows_to_drop ]
531525 return df
532526
533527
0 commit comments