Skip to content

Commit bbdeb70

Browse files
feat(bigframes): Support unstable sort_values, sort_index (#16665)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/google-cloud-python/issues) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕 --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 63b88fb commit bbdeb70

11 files changed

Lines changed: 111 additions & 50 deletions

File tree

packages/bigframes/bigframes/core/array_value.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,11 +212,17 @@ def filter(self, predicate: ex.Expression):
212212
return arr.drop_columns(filter_ids)
213213

214214
def order_by(
215-
self, by: Sequence[OrderingExpression], is_total_order: bool = False
215+
self,
216+
by: Sequence[OrderingExpression],
217+
is_total_order: bool = False,
218+
stable: bool = True,
216219
) -> ArrayValue:
217220
return ArrayValue(
218221
nodes.OrderByNode(
219-
child=self.node, by=tuple(by), is_total_order=is_total_order
222+
child=self.node,
223+
by=tuple(by),
224+
is_total_order=is_total_order,
225+
stable=stable,
220226
)
221227
)
222228

packages/bigframes/bigframes/core/blocks.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,10 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
395395
def order_by(
396396
self,
397397
by: typing.Sequence[ordering.OrderingExpression],
398+
stable: bool = True,
398399
) -> Block:
399400
return Block(
400-
self._expr.order_by(by),
401+
self._expr.order_by(by, stable=stable),
401402
index_columns=self.index_columns,
402403
column_labels=self.column_labels,
403404
index_labels=self.index.names,
@@ -2412,13 +2413,13 @@ def _align_both_axes(
24122413
rcol_indexer if (rcol_indexer is not None) else range(len(columns))
24132414
)
24142415

2415-
left_input_lookup = (
2416-
lambda index: ex.deref(get_column_left[self.value_columns[index]])
2416+
left_input_lookup = lambda index: (
2417+
ex.deref(get_column_left[self.value_columns[index]])
24172418
if index != -1
24182419
else ex.const(None)
24192420
)
2420-
righ_input_lookup = (
2421-
lambda index: ex.deref(get_column_right[other.value_columns[index]])
2421+
righ_input_lookup = lambda index: (
2422+
ex.deref(get_column_right[other.value_columns[index]])
24222423
if index != -1
24232424
else ex.const(None)
24242425
)
@@ -2471,15 +2472,13 @@ def _align_series_block_axis_1(
24712472
rcol_indexer if (rcol_indexer is not None) else range(len(columns))
24722473
)
24732474

2474-
left_input_lookup = (
2475-
lambda index: ex.deref(get_column_left[self.value_columns[index]])
2475+
left_input_lookup = lambda index: (
2476+
ex.deref(get_column_left[self.value_columns[index]])
24762477
if index != -1
24772478
else ex.const(None)
24782479
)
2479-
righ_input_lookup = (
2480-
lambda index: ex.deref(
2481-
get_column_right[other.transpose().value_columns[index]]
2482-
)
2480+
righ_input_lookup = lambda index: (
2481+
ex.deref(get_column_right[other.transpose().value_columns[index]])
24832482
if index != -1
24842483
else ex.const(None)
24852484
)
@@ -2506,13 +2505,11 @@ def _align_pd_series_axis_1(
25062505
rcol_indexer if (rcol_indexer is not None) else range(len(columns))
25072506
)
25082507

2509-
left_input_lookup = (
2510-
lambda index: ex.deref(self.value_columns[index])
2511-
if index != -1
2512-
else ex.const(None)
2508+
left_input_lookup = lambda index: (
2509+
ex.deref(self.value_columns[index]) if index != -1 else ex.const(None)
25132510
)
2514-
righ_input_lookup = (
2515-
lambda index: ex.const(other.iloc[index]) if index != -1 else ex.const(None)
2511+
righ_input_lookup = lambda index: (
2512+
ex.const(other.iloc[index]) if index != -1 else ex.const(None)
25162513
)
25172514

25182515
left_inputs = [left_input_lookup(i) for i in lcol_indexer]

packages/bigframes/bigframes/core/indexes/base.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,6 @@ def query_job(self) -> bigquery.QueryJob:
255255
self._query_job = query_job
256256
return self._query_job
257257

258-
@property
259-
def str(self) -> bigframes.operations.strings.StringMethods:
260-
import bigframes.operations.strings
261-
262-
return bigframes.operations.strings.StringMethods(self)
263-
264258
def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
265259
"""Get integer location, slice or boolean mask for requested label.
266260
@@ -436,7 +430,8 @@ def sort_values(
436430
*,
437431
inplace: bool = False,
438432
ascending: bool = True,
439-
na_position: __builtins__.str = "last",
433+
kind: str | None = None,
434+
na_position: str = "last",
440435
) -> Index:
441436
if na_position not in ["first", "last"]:
442437
raise ValueError("Param na_position must be one of 'first' or 'last'")
@@ -448,7 +443,8 @@ def sort_values(
448443
else order.descending_over(column, na_last)
449444
for column in index_columns
450445
]
451-
return Index(self._block.order_by(ordering))
446+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
447+
return Index(self._block.order_by(ordering, stable=is_stable))
452448

453449
def astype(
454450
self,
@@ -840,6 +836,13 @@ def _apply_binary_op(
840836
else:
841837
return NotImplemented
842838

839+
# last so as to not shadow __builtins__.str
840+
@property
841+
def str(self) -> bigframes.operations.strings.StringMethods:
842+
import bigframes.operations.strings
843+
844+
return bigframes.operations.strings.StringMethods(self)
845+
843846

844847
def _should_create_datetime_index(block: blocks.Block) -> bool:
845848
if len(block.index.dtypes) != 1:

packages/bigframes/bigframes/core/nodes.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -991,7 +991,8 @@ def remap_refs(
991991
@dataclasses.dataclass(frozen=True, eq=False)
992992
class OrderByNode(UnaryNode):
993993
by: Tuple[OrderingExpression, ...]
994-
# This is an optimization, if true, can discard previous orderings.
994+
stable: bool = True
995+
# This is an optimization, if true, can discard previous orderings, even if doing a stable sort
995996
# might be a total ordering even if false
996997
is_total_order: bool = False
997998

packages/bigframes/bigframes/core/rewrite/order.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ def pull_up_order_inner(
7171
child_result, child_order = pull_up_order_inner(node.child)
7272
return child_result, child_order.with_reverse()
7373
elif isinstance(node, bigframes.core.nodes.OrderByNode):
74-
if node.is_total_order:
74+
# unstable sorts don't care about previous order, total orders override previous order
75+
if (not node.stable) or node.is_total_order:
7576
new_node = remove_order(node.child)
7677
else:
7778
new_node, child_order = pull_up_order_inner(node.child)
@@ -106,6 +107,10 @@ def pull_up_order_inner(
106107
),
107108
)
108109
)
110+
elif not node.stable:
111+
new_order = bigframes.core.ordering.RowOrdering(
112+
ordering_value_columns=tuple(new_by),
113+
)
109114
else:
110115
assert child_order
111116
new_order = child_order.with_ordering_columns(new_by)

packages/bigframes/bigframes/dataframe.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2418,6 +2418,7 @@ def sort_index(
24182418
*,
24192419
ascending: bool = ...,
24202420
inplace: Literal[False] = ...,
2421+
kind: str | None = ...,
24212422
na_position: Literal["first", "last"] = ...,
24222423
) -> DataFrame: ...
24232424

@@ -2427,6 +2428,7 @@ def sort_index(
24272428
*,
24282429
ascending: bool = ...,
24292430
inplace: Literal[True] = ...,
2431+
kind: str | None = ...,
24302432
na_position: Literal["first", "last"] = ...,
24312433
) -> None: ...
24322434

@@ -2436,6 +2438,7 @@ def sort_index(
24362438
axis: Union[int, str] = 0,
24372439
ascending: bool = True,
24382440
inplace: bool = False,
2441+
kind: str | None = None,
24392442
na_position: Literal["first", "last"] = "last",
24402443
) -> Optional[DataFrame]:
24412444
if utils.get_axis_number(axis) == 0:
@@ -2449,7 +2452,10 @@ def sort_index(
24492452
else order.descending_over(column, na_last)
24502453
for column in index_columns
24512454
]
2452-
block = self._block.order_by(ordering)
2455+
is_stable = (
2456+
kind or constants.DEFAULT_SORT_KIND
2457+
) in constants.STABLE_SORT_KINDS
2458+
block = self._block.order_by(ordering, stable=is_stable)
24532459
else: # axis=1
24542460
_, indexer = self.columns.sort_values(
24552461
return_indexer=True,
@@ -2472,7 +2478,7 @@ def sort_values(
24722478
*,
24732479
inplace: Literal[False] = ...,
24742480
ascending: bool | typing.Sequence[bool] = ...,
2475-
kind: str = ...,
2481+
kind: str | None = ...,
24762482
na_position: typing.Literal["first", "last"] = ...,
24772483
) -> DataFrame: ...
24782484

@@ -2483,7 +2489,7 @@ def sort_values(
24832489
*,
24842490
inplace: Literal[True] = ...,
24852491
ascending: bool | typing.Sequence[bool] = ...,
2486-
kind: str = ...,
2492+
kind: str | None = ...,
24872493
na_position: typing.Literal["first", "last"] = ...,
24882494
) -> None: ...
24892495

@@ -2493,7 +2499,7 @@ def sort_values(
24932499
*,
24942500
inplace: bool = False,
24952501
ascending: bool | typing.Sequence[bool] = True,
2496-
kind: str = "quicksort",
2502+
kind: str | None = None,
24972503
na_position: typing.Literal["first", "last"] = "last",
24982504
) -> Optional[DataFrame]:
24992505
if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)):
@@ -2525,7 +2531,8 @@ def sort_values(
25252531
if is_ascending
25262532
else order.descending_over(column_id, na_last)
25272533
)
2528-
block = self._block.order_by(ordering)
2534+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
2535+
block = self._block.order_by(ordering, stable=is_stable)
25292536
if inplace:
25302537
self._set_block(block)
25312538
return None
@@ -2768,11 +2775,11 @@ def replace(
27682775
):
27692776
if utils.is_dict_like(value):
27702777
return self.apply(
2771-
lambda x: x.replace(
2772-
to_replace=to_replace, value=value[x.name], regex=regex
2778+
lambda x: (
2779+
x.replace(to_replace=to_replace, value=value[x.name], regex=regex)
2780+
if (x.name in value)
2781+
else x
27732782
)
2774-
if (x.name in value)
2775-
else x
27762783
)
27772784
return self.apply(
27782785
lambda x: x.replace(to_replace=to_replace, value=value, regex=regex)

packages/bigframes/bigframes/series.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1779,7 +1779,7 @@ def sort_values(
17791779
axis=...,
17801780
inplace: Literal[True] = ...,
17811781
ascending: bool | typing.Sequence[bool] = ...,
1782-
kind: str = ...,
1782+
kind: str | None = ...,
17831783
na_position: typing.Literal["first", "last"] = ...,
17841784
) -> None: ...
17851785

@@ -1790,7 +1790,7 @@ def sort_values(
17901790
axis=...,
17911791
inplace: Literal[False] = ...,
17921792
ascending: bool | typing.Sequence[bool] = ...,
1793-
kind: str = ...,
1793+
kind: str | None = ...,
17941794
na_position: typing.Literal["first", "last"] = ...,
17951795
) -> Series: ...
17961796

@@ -1800,19 +1800,21 @@ def sort_values(
18001800
axis=0,
18011801
inplace: bool = False,
18021802
ascending=True,
1803-
kind: str = "quicksort",
1803+
kind: str | None = None,
18041804
na_position: typing.Literal["first", "last"] = "last",
18051805
) -> Optional[Series]:
18061806
if axis != 0 and axis != "index":
18071807
raise ValueError(f"No axis named {axis} for object type Series")
18081808
if na_position not in ["first", "last"]:
18091809
raise ValueError("Param na_position must be one of 'first' or 'last'")
1810+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
18101811
block = self._block.order_by(
18111812
[
18121813
order.ascending_over(self._value_column, (na_position == "last"))
18131814
if ascending
18141815
else order.descending_over(self._value_column, (na_position == "last"))
18151816
],
1817+
stable=is_stable,
18161818
)
18171819
if inplace:
18181820
self._set_block(block)
@@ -1822,17 +1824,35 @@ def sort_values(
18221824

18231825
@typing.overload # type: ignore[override]
18241826
def sort_index(
1825-
self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=...
1827+
self,
1828+
*,
1829+
axis=...,
1830+
inplace: Literal[False] = ...,
1831+
ascending=...,
1832+
kind: str | None = ...,
1833+
na_position=...,
18261834
) -> Series: ...
18271835

18281836
@typing.overload
18291837
def sort_index(
1830-
self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=...
1838+
self,
1839+
*,
1840+
axis=0,
1841+
inplace: Literal[True] = ...,
1842+
ascending=...,
1843+
kind: str | None = ...,
1844+
na_position=...,
18311845
) -> None: ...
18321846

18331847
@validations.requires_index
18341848
def sort_index(
1835-
self, *, axis=0, inplace: bool = False, ascending=True, na_position="last"
1849+
self,
1850+
*,
1851+
axis=0,
1852+
inplace: bool = False,
1853+
ascending=True,
1854+
kind: str | None = None,
1855+
na_position="last",
18361856
) -> Optional[Series]:
18371857
# TODO(tbergeron): Support level parameter once multi-index introduced.
18381858
if axis != 0 and axis != "index":
@@ -1847,7 +1867,8 @@ def sort_index(
18471867
else order.descending_over(column, na_last)
18481868
for column in block.index_columns
18491869
]
1850-
block = block.order_by(ordering)
1870+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
1871+
block = block.order_by(ordering, stable=is_stable)
18511872
if inplace:
18521873
self._set_block(block)
18531874
return None

packages/bigframes/third_party/bigframes_vendored/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,6 @@
5555
"_deferred",
5656
]
5757
VALID_WRITE_ENGINES = typing.get_args(WriteEngineType)
58+
59+
DEFAULT_SORT_KIND = "stable"
60+
STABLE_SORT_KINDS = ("stable", "mergesort")

packages/bigframes/third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2253,7 +2253,7 @@ def sort_values(
22532253
*,
22542254
inplace: bool = False,
22552255
ascending: bool | Sequence[bool] = True,
2256-
kind: str = "quicksort",
2256+
kind: str | None = None,
22572257
na_position: Literal["first", "last"] = "last",
22582258
):
22592259
"""Sort by the values along row axis.
@@ -2339,7 +2339,7 @@ def sort_values(
23392339
the by.
23402340
inplace (bool, default False):
23412341
If True, perform operation in-place.
2342-
kind (str, default 'quicksort'):
2342+
kind (str, default None):
23432343
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
23442344
'heapsort', 'stable'. Ignored except when determining whether to
23452345
sort stably. 'mergesort' or 'stable' will result in stable reorder.
@@ -2363,6 +2363,7 @@ def sort_index(
23632363
axis: str | int = 0,
23642364
ascending: bool = True,
23652365
inplace: bool = False,
2366+
kind: str | None = None,
23662367
na_position: Literal["first", "last"] = "last",
23672368
):
23682369
"""Sort object by labels (along an axis).
@@ -2375,6 +2376,10 @@ def sort_index(
23752376
Sort ascending vs. descending.
23762377
inplace (bool, default False):
23772378
Whether to modify the DataFrame rather than creating a new one.
2379+
kind (str, default None):
2380+
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
2381+
'heapsort', 'stable'. Ignored except when determining whether to
2382+
sort stably. 'mergesort' or 'stable' will result in stable reorder.
23782383
na_position ({'first', 'last'}, default 'last'):
23792384
Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
23802385
Not implemented for MultiIndex.

0 commit comments

Comments
 (0)