diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml new file mode 100644 index 0000000000..13d4d87263 --- /dev/null +++ b/.github/workflows/docs-deploy.yml @@ -0,0 +1,57 @@ +name: Deploy docs to GitHub Pages + +on: + # Runs on pushes targeting the default branch + # TODO(tswast): Update this to only be releases once we confirm it's working. + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run docs + run: | + nox -s docs + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/_build/html/ + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2833fe98ff..6773aef7c2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -2,6 +2,9 @@ on: pull_request: branches: - main + push: + branches: + - main name: docs jobs: docs: @@ -12,7 +15,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.10" + 
python-version: "3.13" - name: Install nox run: | python -m pip install --upgrade setuptools pip wheel diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 1051da0bdd..7914b72651 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,6 +2,9 @@ on: pull_request: branches: - main + push: + branches: + - main name: lint jobs: lint: diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index e6a79291d0..fc9e970946 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -2,6 +2,9 @@ on: pull_request: branches: - main + push: + branches: + - main name: mypy jobs: mypy: diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index a7805de447..518cec6312 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -2,6 +2,9 @@ on: pull_request: branches: - main + push: + branches: + - main name: unittest jobs: unit: diff --git a/CHANGELOG.md b/CHANGELOG.md index 1df3ad0f70..7a87fc0160 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.29.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.28.0...v2.29.0) (2025-11-10) + + +### Features + +* Add bigframes.bigquery.st_regionstats to join raster data from Earth Engine ([#2228](https://github.com/googleapis/python-bigquery-dataframes/issues/2228)) ([10ec52f](https://github.com/googleapis/python-bigquery-dataframes/commit/10ec52f30a0a9c61b9eda9cf4f9bd6aa0cd95db5)) +* Add DataFrame.resample and Series.resample ([#2213](https://github.com/googleapis/python-bigquery-dataframes/issues/2213)) ([c9ca02c](https://github.com/googleapis/python-bigquery-dataframes/commit/c9ca02c5194c8b8e9b940eddd2224efd2ff0d5d9)) +* SQL Cell no longer escapes formatted string values ([#2245](https://github.com/googleapis/python-bigquery-dataframes/issues/2245)) 
([d2d38f9](https://github.com/googleapis/python-bigquery-dataframes/commit/d2d38f94ed8333eae6f9cff3833177756eefe85a)) +* Support left_index and right_index for merge ([#2220](https://github.com/googleapis/python-bigquery-dataframes/issues/2220)) ([da9ba26](https://github.com/googleapis/python-bigquery-dataframes/commit/da9ba267812c01ffa6fa0b09943d7a4c63b8f187)) + + +### Bug Fixes + +* Correctly iterate over null struct values in ManagedArrowTable ([#2209](https://github.com/googleapis/python-bigquery-dataframes/issues/2209)) ([12e04d5](https://github.com/googleapis/python-bigquery-dataframes/commit/12e04d55f0d6aef1297b7ca773935aecf3313ee7)) +* Simplify UnsupportedTypeError message ([#2212](https://github.com/googleapis/python-bigquery-dataframes/issues/2212)) ([6c9a18d](https://github.com/googleapis/python-bigquery-dataframes/commit/6c9a18d7e67841c6fe6c1c6f34f80b950815141f)) +* Support results with STRUCT and ARRAY columns containing JSON subfields in `to_pandas_batches()` ([#2216](https://github.com/googleapis/python-bigquery-dataframes/issues/2216)) ([3d8b17f](https://github.com/googleapis/python-bigquery-dataframes/commit/3d8b17fa5eb9bbfc9e151031141a419f2dc3acb4)) + + +### Documentation + +* Switch API reference docs to pydata theme ([#2237](https://github.com/googleapis/python-bigquery-dataframes/issues/2237)) ([9b86dcf](https://github.com/googleapis/python-bigquery-dataframes/commit/9b86dcf87929648bf5ab565dfd46a23b639f01ac)) +* Update notebook for JSON subfields support in to_pandas_batches() ([#2138](https://github.com/googleapis/python-bigquery-dataframes/issues/2138)) ([5663d2a](https://github.com/googleapis/python-bigquery-dataframes/commit/5663d2a18064589596558af109e915f87d426eb0)) + ## [2.28.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.27.0...v2.28.0) (2025-11-03) diff --git a/README.rst b/README.rst index 36d3c2ca20..84de370652 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,5 @@ +:orphan: + BigQuery DataFrames (BigFrames) 
=============================== diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index c599a4b543..0650953fc7 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -40,6 +40,7 @@ st_intersection, st_isclosed, st_length, + st_regionstats, st_simplify, ) from bigframes.bigquery._operations.json import ( @@ -81,6 +82,7 @@ st_intersection, st_isclosed, st_length, + st_regionstats, st_simplify, # json ops json_extract, diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 6b7e5d88a2..f0fda99a16 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -14,11 +14,13 @@ from __future__ import annotations -from typing import Union +import json +from typing import Mapping, Optional, Union import shapely # type: ignore from bigframes import operations as ops +import bigframes.dataframe import bigframes.geopandas import bigframes.series @@ -677,6 +679,65 @@ def st_length( return series +def st_regionstats( + geography: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], + raster_id: str, + band: Optional[str] = None, + include: Optional[str] = None, + options: Optional[Mapping[str, Union[str, int, float]]] = None, +) -> bigframes.series.Series: + """Returns statistics summarizing the pixel values of the raster image + referenced by raster_id that intersect with geography. + + The statistics include the count, minimum, maximum, sum, standard + deviation, mean, and area of the valid pixels of the raster band named + band_name. Google Earth Engine computes the results of the function call. + + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats + + Args: + geography (bigframes.series.Series | bigframes.geopandas.GeoSeries): + A series of geography objects to intersect with the raster image. + raster_id (str): + A string that identifies a raster image. 
The following formats are + supported. A URI from an image table provided by Google Earth Engine + in BigQuery sharing (formerly Analytics Hub). A URI for a readable + GeoTIFF raster file. A Google Earth Engine asset path that + references public catalog data or project-owned assets with read + access. + band (Optional[str]): + A string in one of the following formats: + A single band within the raster image specified by raster_id. A + formula to compute a value from the available bands in the raster + image. The formula uses the Google Earth Engine image expression + syntax. Bands can be referenced by their name, band_name, in + expressions. If you don't specify a band, the first band of the + image is used. + include (Optional[str]): + An optional string formula that uses the Google Earth Engine image + expression syntax to compute a pixel weight. The formula should + return values from 0 to 1. Values outside this range are set to the + nearest limit, either 0 or 1. A value of 0 means that the pixel is + invalid and it's excluded from analysis. A positive value means that + a pixel is valid. Values between 0 and 1 represent proportional + weights for calculations, such as weighted means. + options (Mapping[str, Union[str, int, float]], optional): + A dictionary of options to pass to the function. See the BigQuery + documentation for a list of available options. + + Returns: + bigframes.pandas.Series: + A STRUCT Series containing the computed statistics. 
+ """ + op = ops.GeoStRegionStatsOp( + raster_id=raster_id, + band=band, + include=include, + options=json.dumps(options) if options else None, + ) + return geography._apply_unary_op(op) + + def st_simplify( geography: "bigframes.series.Series", tolerance_meters: float, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 41986ce5df..f657f28a6f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -68,6 +68,7 @@ import bigframes.operations.aggregations as agg_ops from bigframes.session import dry_runs, execution_spec from bigframes.session import executor as executors +from bigframes.session._io import pandas as io_pandas # Type constraint for wherever column labels are used Label = typing.Hashable @@ -711,12 +712,15 @@ def to_pandas_batches( # To reduce the number of edge cases to consider when working with the # results of this, always return at least one DataFrame. See: # b/428918844. - empty_val = pd.DataFrame( - { - col: pd.Series([], dtype=self.expr.get_column_type(col)) - for col in itertools.chain(self.value_columns, self.index_columns) - } - ) + try: + empty_arrow_table = self.expr.schema.to_pyarrow().empty_table() + except pa.ArrowNotImplementedError: + # Bug with some pyarrow versions(https://github.com/apache/arrow/issues/45262), + # empty_table only supports base storage types, not extension types. + empty_arrow_table = self.expr.schema.to_pyarrow( + use_storage_types=True + ).empty_table() + empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema) dfs = map( lambda a: a[0], itertools.zip_longest( @@ -1992,6 +1996,31 @@ def _generate_resample_label( Literal["epoch", "start", "start_day", "end", "end_day"], ] = "start_day", ) -> Block: + if not isinstance(rule, str): + raise NotImplementedError( + f"Only offset strings are currently supported for rule, but got {repr(rule)}. 
{constants.FEEDBACK_LINK}" + ) + + if rule in ("ME", "YE", "QE", "BME", "BA", "BQE", "W"): + raise NotImplementedError( + f"Offset strings 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', 'W' are not currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}" + ) + + if closed == "right": + raise NotImplementedError( + f"Only closed='left' is currently supported. {constants.FEEDBACK_LINK}", + ) + + if label == "right": + raise NotImplementedError( + f"Only label='left' is currently supported. {constants.FEEDBACK_LINK}", + ) + + if origin not in ("epoch", "start", "start_day"): + raise NotImplementedError( + f"Only origin='epoch', 'start', 'start_day' are currently supported, but got {repr(origin)}. {constants.FEEDBACK_LINK}" + ) + # Validate and resolve the index or column to use for grouping if on is None: if len(self.index_columns) == 0: @@ -2303,6 +2332,8 @@ def merge( right_join_ids: typing.Sequence[str], sort: bool, suffixes: tuple[str, str] = ("_x", "_y"), + left_index: bool = False, + right_index: bool = False, ) -> Block: conditions = tuple( (lid, rid) for lid, rid in zip(left_join_ids, right_join_ids) @@ -2310,34 +2341,52 @@ def merge( joined_expr, (get_column_left, get_column_right) = self.expr.relational_join( other.expr, type=how, conditions=conditions ) - result_columns = [] - matching_join_labels = [] left_post_join_ids = tuple(get_column_left[id] for id in left_join_ids) right_post_join_ids = tuple(get_column_right[id] for id in right_join_ids) - joined_expr, coalesced_ids = coalesce_columns( - joined_expr, left_post_join_ids, right_post_join_ids, how=how, drop=False - ) + if left_index or right_index: + # For some reason pandas coalesces two joining columns if one side is an index. 
+ joined_expr, resolved_join_ids = coalesce_columns( + joined_expr, left_post_join_ids, right_post_join_ids + ) + else: + joined_expr, resolved_join_ids = resolve_col_join_ids( # type: ignore + joined_expr, + left_post_join_ids, + right_post_join_ids, + how=how, + drop=False, + ) + result_columns = [] + matching_join_labels = [] + + # Select left value columns for col_id in self.value_columns: if col_id in left_join_ids: key_part = left_join_ids.index(col_id) matching_right_id = right_join_ids[key_part] if ( - self.col_id_to_label[col_id] + right_index + or self.col_id_to_label[col_id] == other.col_id_to_label[matching_right_id] ): matching_join_labels.append(self.col_id_to_label[col_id]) - result_columns.append(coalesced_ids[key_part]) + result_columns.append(resolved_join_ids[key_part]) else: result_columns.append(get_column_left[col_id]) else: result_columns.append(get_column_left[col_id]) + + # Select right value columns for col_id in other.value_columns: if col_id in right_join_ids: if other.col_id_to_label[col_id] in matching_join_labels: pass + elif left_index: + key_part = right_join_ids.index(col_id) + result_columns.append(resolved_join_ids[key_part]) else: result_columns.append(get_column_right[col_id]) else: @@ -2348,11 +2397,22 @@ def merge( joined_expr = joined_expr.order_by( [ ordering.OrderingExpression(ex.deref(col_id)) - for col_id in coalesced_ids + for col_id in resolved_join_ids ], ) - joined_expr = joined_expr.select_columns(result_columns) + left_idx_id_post_join = [get_column_left[id] for id in self.index_columns] + right_idx_id_post_join = [get_column_right[id] for id in other.index_columns] + index_cols = _resolve_index_col( + left_idx_id_post_join, + right_idx_id_post_join, + resolved_join_ids, + left_index, + right_index, + how, + ) + + joined_expr = joined_expr.select_columns(result_columns + index_cols) labels = utils.merge_column_labels( self.column_labels, other.column_labels, @@ -2371,13 +2431,13 @@ def merge( or other.index.is_null 
or self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL ): - expr = joined_expr - index_columns = [] + return Block(joined_expr, index_columns=[], column_labels=labels) + elif index_cols: + return Block(joined_expr, index_columns=index_cols, column_labels=labels) else: expr, offset_index_id = joined_expr.promote_offsets() index_columns = [offset_index_id] - - return Block(expr, index_columns=index_columns, column_labels=labels) + return Block(expr, index_columns=index_columns, column_labels=labels) def _align_both_axes( self, other: Block, how: str @@ -3086,7 +3146,7 @@ def join_mono_indexed( left_index = get_column_left[left.index_columns[0]] right_index = get_column_right[right.index_columns[0]] # Drop original indices from each side. and used the coalesced combination generated by the join. - combined_expr, coalesced_join_cols = coalesce_columns( + combined_expr, coalesced_join_cols = resolve_col_join_ids( combined_expr, [left_index], [right_index], how=how ) if sort: @@ -3151,7 +3211,7 @@ def join_multi_indexed( left_ids_post_join = [get_column_left[id] for id in left_join_ids] right_ids_post_join = [get_column_right[id] for id in right_join_ids] # Drop original indices from each side. and used the coalesced combination generated by the join. - combined_expr, coalesced_join_cols = coalesce_columns( + combined_expr, coalesced_join_cols = resolve_col_join_ids( combined_expr, left_ids_post_join, right_ids_post_join, how=how ) if sort: @@ -3194,13 +3254,17 @@ def resolve_label_id(label: Label) -> str: # TODO: Rewrite just to return expressions -def coalesce_columns( +def resolve_col_join_ids( expr: core.ArrayValue, left_ids: typing.Sequence[str], right_ids: typing.Sequence[str], how: str, drop: bool = True, ) -> Tuple[core.ArrayValue, Sequence[str]]: + """ + Collapses and selects the joining column IDs, with the assumption that + the ids are all belong to value columns. 
+ """ result_ids = [] for left_id, right_id in zip(left_ids, right_ids): if how == "left" or how == "inner" or how == "cross": @@ -3212,7 +3276,6 @@ def coalesce_columns( if drop: expr = expr.drop_columns([left_id]) elif how == "outer": - coalesced_id = guid.generate_guid() expr, coalesced_id = expr.project_to_id( ops.coalesce_op.as_expr(left_id, right_id) ) @@ -3224,6 +3287,21 @@ def coalesce_columns( return expr, result_ids +def coalesce_columns( + expr: core.ArrayValue, + left_ids: typing.Sequence[str], + right_ids: typing.Sequence[str], +) -> tuple[core.ArrayValue, list[str]]: + result_ids = [] + for left_id, right_id in zip(left_ids, right_ids): + expr, coalesced_id = expr.project_to_id( + ops.coalesce_op.as_expr(left_id, right_id) + ) + result_ids.append(coalesced_id) + + return expr, result_ids + + def _cast_index(block: Block, dtypes: typing.Sequence[bigframes.dtypes.Dtype]): original_block = block result_ids = [] @@ -3439,3 +3517,35 @@ def _pd_index_to_array_value( rows.append(row) return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) + + +def _resolve_index_col( + left_index_cols: list[str], + right_index_cols: list[str], + resolved_join_ids: list[str], + left_index: bool, + right_index: bool, + how: typing.Literal[ + "inner", + "left", + "outer", + "right", + "cross", + ], +) -> list[str]: + if left_index and right_index: + if how == "inner" or how == "left": + return left_index_cols + if how == "right": + return right_index_cols + if how == "outer": + return resolved_join_ids + else: + return [] + elif left_index and not right_index: + return right_index_cols + elif right_index and not left_index: + return left_index_cols + else: + # Joining with value columns only. Existing indices will be discarded. 
+ return [] diff --git a/bigframes/core/compile/ibis_compiler/operations/geo_ops.py b/bigframes/core/compile/ibis_compiler/operations/geo_ops.py index 2f06c76768..0ca69726ff 100644 --- a/bigframes/core/compile/ibis_compiler/operations/geo_ops.py +++ b/bigframes/core/compile/ibis_compiler/operations/geo_ops.py @@ -16,8 +16,10 @@ from typing import cast +from bigframes_vendored import ibis from bigframes_vendored.ibis.expr import types as ibis_types import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +import bigframes_vendored.ibis.expr.operations.geospatial as ibis_geo import bigframes_vendored.ibis.expr.operations.udf as ibis_udf from bigframes.core.compile.ibis_compiler import scalar_op_compiler @@ -101,6 +103,35 @@ def geo_st_isclosed_op_impl(x: ibis_types.Value): return st_isclosed(x) +@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True) +def geo_st_regionstats_op_impl( + geography: ibis_types.Value, + op: ops.GeoStRegionStatsOp, +): + if op.band: + band = ibis.literal(op.band, type=ibis_dtypes.string()) + else: + band = None + + if op.include: + include = ibis.literal(op.include, type=ibis_dtypes.string()) + else: + include = None + + if op.options: + options = ibis.literal(op.options, type=ibis_dtypes.json()) + else: + options = None + + return ibis_geo.GeoRegionStats( + arg=geography, # type: ignore + raster_id=ibis.literal(op.raster_id, type=ibis_dtypes.string()), # type: ignore + band=band, # type: ignore + include=include, # type: ignore + options=options, # type: ignore + ).to_expr() + + @register_unary_op(ops.GeoStSimplifyOp, pass_op=True) def st_simplify_op_impl(x: ibis_types.Value, op: ops.GeoStSimplifyOp): x = cast(ibis_types.GeoSpatialValue, x) diff --git a/bigframes/core/compile/sqlglot/aggregations/op_registration.py b/bigframes/core/compile/sqlglot/aggregations/op_registration.py index eb02b8bd50..a26429f27e 100644 --- a/bigframes/core/compile/sqlglot/aggregations/op_registration.py +++ 
b/bigframes/core/compile/sqlglot/aggregations/op_registration.py @@ -52,5 +52,5 @@ def arg_checker(*args, **kwargs): def __getitem__(self, op: str | agg_ops.WindowOp) -> CompilationFunc: key = op if isinstance(op, type) else type(op) if str(key) not in self._registered_ops: - raise ValueError(f"{key} is already not registered") + raise ValueError(f"{key} is not registered") return self._registered_ops[str(key)] diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index d157f07df2..d0d887588c 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -239,6 +239,20 @@ def _( return apply_window_if_present(sge.func("MIN", column.expr), window) +@UNARY_OP_REGISTRATION.register(agg_ops.PopVarOp) +def _( + op: agg_ops.PopVarOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + expr = column.expr + if column.dtype == dtypes.BOOL_DTYPE: + expr = sge.Cast(this=expr, to="INT64") + + expr = sge.func("VAR_POP", expr) + return apply_window_if_present(expr, window) + + @UNARY_OP_REGISTRATION.register(agg_ops.QuantileOp) def _( op: agg_ops.QuantileOp, @@ -278,6 +292,22 @@ def _( return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window) +@UNARY_OP_REGISTRATION.register(agg_ops.StdOp) +def _( + op: agg_ops.StdOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + expr = column.expr + if column.dtype == dtypes.BOOL_DTYPE: + expr = sge.Cast(this=expr, to="INT64") + + expr = sge.func("STDDEV", expr) + if op.should_floor_result or column.dtype == dtypes.TIMEDELTA_DTYPE: + expr = sge.Cast(this=sge.func("FLOOR", expr), to="INT64") + return apply_window_if_present(expr, window) + + @UNARY_OP_REGISTRATION.register(agg_ops.ShiftOp) def _( op: agg_ops.ShiftOp, @@ -331,3 +361,17 @@ 
def _( expression=shifted, unit=sge.Identifier(this="MICROSECOND"), ) + + +@UNARY_OP_REGISTRATION.register(agg_ops.VarOp) +def _( + op: agg_ops.VarOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + expr = column.expr + if column.dtype == dtypes.BOOL_DTYPE: + expr = sge.Cast(this=expr, to="INT64") + + expr = sge.func("VAR_SAMP", expr) + return apply_window_if_present(expr, window) diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index e77b8b50a5..89d3b4a682 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -109,6 +109,11 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.LTE(this=left_expr, expression=right_expr) +@register_binary_op(ops.maximum_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.Greatest(expressions=[left.expr, right.expr]) + + @register_binary_op(ops.minimum_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.Least(this=left.expr, expressions=right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 07505855e1..2bd19e1967 100644 --- a/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -24,6 +24,7 @@ import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op register_nary_op = scalar_compiler.scalar_op_compiler.register_nary_op register_ternary_op = scalar_compiler.scalar_op_compiler.register_ternary_op @@ -159,6 +160,13 @@ def _(*cases_and_outputs: TypedExpr) -> sge.Expression: ) 
+@register_binary_op(ops.coalesce_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + if left.expr == right.expr: + return left.expr + return sge.Coalesce(this=left.expr, expressions=[right.expr]) + + @register_nary_op(ops.RowKey) def _(*values: TypedExpr) -> sge.Expression: # All inputs into hash must be non-null or resulting hash will be null diff --git a/bigframes/core/compile/sqlglot/expressions/geo_ops.py b/bigframes/core/compile/sqlglot/expressions/geo_ops.py index 53a50fab47..24e488699f 100644 --- a/bigframes/core/compile/sqlglot/expressions/geo_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/geo_ops.py @@ -21,6 +21,7 @@ import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op @register_unary_op(ops.geo_area_op) @@ -74,6 +75,32 @@ def _(expr: TypedExpr, op: ops.GeoStLengthOp) -> sge.Expression: return sge.func("ST_LENGTH", expr.expr) +@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True) +def _( + geography: TypedExpr, + op: ops.GeoStRegionStatsOp, +): + args = [geography.expr, sge.convert(op.raster_id)] + if op.band: + args.append(sge.Kwarg(this="band", expression=sge.convert(op.band))) + if op.include: + args.append(sge.Kwarg(this="include", expression=sge.convert(op.include))) + if op.options: + args.append( + sge.Kwarg(this="options", expression=sge.JSON(this=sge.convert(op.options))) + ) + return sge.func("ST_REGIONSTATS", *args) + + +@register_unary_op(ops.GeoStSimplifyOp, pass_op=True) +def _(expr: TypedExpr, op: ops.GeoStSimplifyOp) -> sge.Expression: + return sge.func( + "ST_SIMPLIFY", + expr.expr, + sge.convert(op.tolerance_meters), + ) + + @register_unary_op(ops.geo_x_op) def _(expr: TypedExpr) -> sge.Expression: return sge.func("SAFE.ST_X", expr.expr) @@ -82,3 +109,8 @@ def _(expr: TypedExpr) -> sge.Expression: @register_unary_op(ops.geo_y_op) def _(expr: 
TypedExpr) -> sge.Expression: return sge.func("SAFE.ST_Y", expr.expr) + + +@register_binary_op(ops.geo_st_difference_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.func("ST_DIFFERENCE", left.expr, right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index afc0d9d01c..36e2973565 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -77,6 +77,13 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.func("ASINH", expr.expr) +@register_binary_op(ops.arctan2_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.func("ATAN2", left_expr, right_expr) + + @register_unary_op(ops.arctan_op) def _(expr: TypedExpr) -> sge.Expression: return sge.func("ATAN", expr.expr) @@ -118,6 +125,18 @@ def _(expr: TypedExpr) -> sge.Expression: ) +@register_binary_op(ops.cosine_distance_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.Anonymous( + this="ML.DISTANCE", + expressions=[ + left.expr, + right.expr, + sge.Literal.string("COSINE"), + ], + ) + + @register_unary_op(ops.exp_op) def _(expr: TypedExpr) -> sge.Expression: return sge.Case( diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index fa18f00483..0735c4fc5a 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -253,9 +253,16 @@ def _( value_generator = iter_array( array.flatten(), bigframes.dtypes.get_array_inner_type(dtype) ) - for (start, end) in _pairwise(array.offsets): - arr_size = end.as_py() - start.as_py() - yield list(itertools.islice(value_generator, arr_size)) + offset_generator = iter_array(array.offsets, bigframes.dtypes.INT_DTYPE) + + start_offset = None + end_offset = None + for offset in offset_generator: + start_offset = 
end_offset + end_offset = offset + if start_offset is not None: + arr_size = end_offset - start_offset + yield list(itertools.islice(value_generator, arr_size)) @iter_array.register def _( @@ -267,8 +274,15 @@ def _( sub_generators[field_name] = iter_array(array.field(field_name), dtype) keys = list(sub_generators.keys()) - for row_values in zip(*sub_generators.values()): - yield {key: value for key, value in zip(keys, row_values)} + is_null_generator = iter_array(array.is_null(), bigframes.dtypes.BOOL_DTYPE) + + for values in zip(is_null_generator, *sub_generators.values()): + is_row_null = values[0] + row_values = values[1:] + if not is_row_null: + yield {key: value for key, value in zip(keys, row_values)} + else: + yield None for batch in table.to_batches(): sub_generators: dict[str, Generator[Any, None, None]] = {} @@ -491,16 +505,3 @@ def _schema_durations_to_ints(schema: pa.Schema) -> pa.Schema: return pa.schema( pa.field(field.name, _durations_to_ints(field.type)) for field in schema ) - - -def _pairwise(iterable): - do_yield = False - a = None - b = None - for item in iterable: - a = b - b = item - if do_yield: - yield (a, b) - else: - do_yield = True diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 9e0fcb3ace..553b41a631 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -1627,7 +1627,7 @@ class ResultNode(UnaryNode): # TODO: CTE definitions def _validate(self): - for ref, name in self.output_cols: + for ref, _ in self.output_cols: assert ref.id in self.child.ids @property diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index eab86dc629..8f49556ff4 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -104,6 +104,9 @@ def _field_to_template_value( if isinstance(value, bigframes.dataframe.DataFrame): return _table_to_sql(value._to_placeholder_table(dry_run=dry_run)) + if isinstance(value, str): + return value + return bigframes.core.sql.simple_literal(value) diff --git 
a/bigframes/core/reshape/merge.py b/bigframes/core/reshape/merge.py index 5c6cba4915..2afeb2a106 100644 --- a/bigframes/core/reshape/merge.py +++ b/bigframes/core/reshape/merge.py @@ -20,6 +20,7 @@ from typing import Literal, Sequence +from bigframes_vendored import constants import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge from bigframes import dataframe, series @@ -40,6 +41,8 @@ def merge( *, left_on: blocks.Label | Sequence[blocks.Label] | None = None, right_on: blocks.Label | Sequence[blocks.Label] | None = None, + left_index: bool = False, + right_index: bool = False, sort: bool = False, suffixes: tuple[str, str] = ("_x", "_y"), ) -> dataframe.DataFrame: @@ -59,35 +62,16 @@ def merge( ) return dataframe.DataFrame(result_block) - left_on, right_on = _validate_left_right_on( - left, right, on, left_on=left_on, right_on=right_on + left_join_ids, right_join_ids = _validate_left_right_on( + left, + right, + on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, ) - if utils.is_list_like(left_on): - left_on = list(left_on) # type: ignore - else: - left_on = [left_on] - - if utils.is_list_like(right_on): - right_on = list(right_on) # type: ignore - else: - right_on = [right_on] - - left_join_ids = [] - for label in left_on: # type: ignore - left_col_id = left._resolve_label_exact(label) - # 0 elements already throws an exception - if not left_col_id: - raise ValueError(f"No column {label} found in self.") - left_join_ids.append(left_col_id) - - right_join_ids = [] - for label in right_on: # type: ignore - right_col_id = right._resolve_label_exact(label) - if not right_col_id: - raise ValueError(f"No column {label} found in other.") - right_join_ids.append(right_col_id) - block = left._block.merge( right._block, how, @@ -95,6 +79,8 @@ def merge( right_join_ids, sort=sort, suffixes=suffixes, + left_index=left_index, + right_index=right_index, ) return dataframe.DataFrame(block) @@ -127,30 +113,106 @@ 
def _validate_left_right_on( *, left_on: blocks.Label | Sequence[blocks.Label] | None = None, right_on: blocks.Label | Sequence[blocks.Label] | None = None, -): - if on is not None: - if left_on is not None or right_on is not None: - raise ValueError( - "Can not pass both `on` and `left_on` + `right_on` params." - ) - return on, on - - if left_on is not None and right_on is not None: - return left_on, right_on + left_index: bool = False, + right_index: bool = False, +) -> tuple[list[str], list[str]]: + # Turn left_on and right_on to lists + if left_on is not None and not isinstance(left_on, (tuple, list)): + left_on = [left_on] + if right_on is not None and not isinstance(right_on, (tuple, list)): + right_on = [right_on] - left_cols = left.columns - right_cols = right.columns - common_cols = left_cols.intersection(right_cols) - if len(common_cols) == 0: + if left_index and left.index.nlevels > 1: raise ValueError( - "No common columns to perform merge on." - f"Merge options: left_on={left_on}, " - f"right_on={right_on}, " + f"Joining with multi-level index is not supported. {constants.FEEDBACK_LINK}" ) - if ( - not left_cols.join(common_cols, how="inner").is_unique - or not right_cols.join(common_cols, how="inner").is_unique - ): - raise ValueError(f"Data columns not unique: {repr(common_cols)}") + if right_index and right.index.nlevels > 1: + raise ValueError( + f"Joining with multi-level index is not supported. {constants.FEEDBACK_LINK}" + ) + + # The following checks are copied from Pandas. 
+ if on is None and left_on is None and right_on is None: + if left_index and right_index: + return list(left._block.index_columns), list(right._block.index_columns) + elif left_index: + raise ValueError("Must pass right_on or right_index=True") + elif right_index: + raise ValueError("Must pass left_on or left_index=True") + else: + # use the common columns + common_cols = left.columns.intersection(right.columns) + if len(common_cols) == 0: + raise ValueError( + "No common columns to perform merge on. " + f"Merge options: left_on={left_on}, " + f"right_on={right_on}, " + f"left_index={left_index}, " + f"right_index={right_index}" + ) + if ( + not left.columns.join(common_cols, how="inner").is_unique + or not right.columns.join(common_cols, how="inner").is_unique + ): + raise ValueError(f"Data columns not unique: {repr(common_cols)}") + return _to_col_ids(left, common_cols.to_list()), _to_col_ids( + right, common_cols.to_list() + ) - return common_cols, common_cols + elif on is not None: + if left_on is not None or right_on is not None: + raise ValueError( + 'Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.' + ) + if left_index or right_index: + raise ValueError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' + ) + return _to_col_ids(left, on), _to_col_ids(right, on) + + elif left_on is not None: + if left_index: + raise ValueError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + if not right_index and right_on is None: + raise ValueError('Must pass "right_on" OR "right_index".') + if right_index: + if len(left_on) != right.index.nlevels: + raise ValueError( + "len(left_on) must equal the number " + 'of levels in the index of "right"' + ) + return _to_col_ids(left, left_on), list(right._block.index_columns) + + elif right_on is not None: + if right_index: + raise ValueError( + 'Can only pass argument "right_on" OR "right_index" not both.' 
+ ) + if not left_index and left_on is None: + raise ValueError('Must pass "left_on" OR "left_index".') + if left_index: + if len(right_on) != left.index.nlevels: + raise ValueError( + "len(right_on) must equal the number " + 'of levels in the index of "left"' + ) + return list(left._block.index_columns), _to_col_ids(right, right_on) + + # The user correctly specified left_on and right_on + if len(right_on) != len(left_on): # type: ignore + raise ValueError("len(right_on) must equal len(left_on)") + + return _to_col_ids(left, left_on), _to_col_ids(right, right_on) + + +def _to_col_ids( + df: dataframe.DataFrame, join_cols: blocks.Label | Sequence[blocks.Label] +) -> list[str]: + if utils.is_list_like(join_cols): + return [df._block.resolve_label_exact_or_error(col) for col in join_cols] + + return [df._block.resolve_label_exact_or_error(join_cols)] diff --git a/bigframes/core/rewrite/identifiers.py b/bigframes/core/rewrite/identifiers.py index e911d81895..2e31f07a79 100644 --- a/bigframes/core/rewrite/identifiers.py +++ b/bigframes/core/rewrite/identifiers.py @@ -57,8 +57,10 @@ def remap_variables( new_root = root.transform_children(lambda node: remapped_children[node]) # Step 3: Transform the current node using the mappings from its children. + # "reversed" is required for InNode so that in case of a duplicate column ID, + # the left child's mapping is the one that's kept. 
downstream_mappings: dict[identifiers.ColumnId, identifiers.ColumnId] = { - k: v for mapping in new_child_mappings for k, v in mapping.items() + k: v for mapping in reversed(new_child_mappings) for k, v in mapping.items() } if isinstance(new_root, nodes.InNode): new_root = typing.cast(nodes.InNode, new_root) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index df8c87416f..0ce602d1ea 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3650,6 +3650,8 @@ def merge( *, left_on: Union[blocks.Label, Sequence[blocks.Label], None] = None, right_on: Union[blocks.Label, Sequence[blocks.Label], None] = None, + left_index: bool = False, + right_index: bool = False, sort: bool = False, suffixes: tuple[str, str] = ("_x", "_y"), ) -> DataFrame: @@ -3662,6 +3664,8 @@ def merge( on, left_on=left_on, right_on=right_on, + left_index=left_index, + right_index=right_index, sort=sort, suffixes=suffixes, ) @@ -4182,10 +4186,12 @@ def _split( return [DataFrame(block) for block in blocks] @validations.requires_ordering() - def _resample( + def resample( self, rule: str, *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, on: blocks.Label = None, level: Optional[LevelsType] = None, origin: Union[ @@ -4195,64 +4201,10 @@ def _resample( Literal["epoch", "start", "start_day", "end", "end_day"], ] = "start_day", ) -> bigframes.core.groupby.DataFrameGroupBy: - """Internal function to support resample. Resample time-series data. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> data = { - ... "timestamp_col": pd.date_range( - ... start="2021-01-01 13:00:00", periods=30, freq="1s" - ... ), - ... "int64_col": range(30), - ... "int64_too": range(10, 40), - ... 
} - - Resample on a DataFrame with index: - - >>> df = bpd.DataFrame(data).set_index("timestamp_col") - >>> df._resample(rule="7s").min() - int64_col int64_too - 2021-01-01 12:59:55 0 10 - 2021-01-01 13:00:02 2 12 - 2021-01-01 13:00:09 9 19 - 2021-01-01 13:00:16 16 26 - 2021-01-01 13:00:23 23 33 - - [5 rows x 2 columns] - - Resample with column and origin set to 'start': - - >>> df = bpd.DataFrame(data) - >>> df._resample(rule="7s", on = "timestamp_col", origin="start").min() - int64_col int64_too - 2021-01-01 13:00:00 0 10 - 2021-01-01 13:00:07 7 17 - 2021-01-01 13:00:14 14 24 - 2021-01-01 13:00:21 21 31 - 2021-01-01 13:00:28 28 38 - - [5 rows x 2 columns] - - Args: - rule (str): - The offset string representing target conversion. - on (str, default None): - For a DataFrame, column to use instead of index for resampling. Column - must be datetime-like. - level (str or int, default None): - For a MultiIndex, level (name or number) to use for resampling. - level must be datetime-like. - origin(str, default 'start_day'): - The timestamp on which to adjust the grouping. Must be one of the following: - 'epoch': origin is 1970-01-01 - 'start': origin is the first value of the timeseries - 'start_day': origin is the first day at midnight of the timeseries - Returns: - DataFrameGroupBy: DataFrameGroupBy object. 
- """ block = self._block._generate_resample_label( rule=rule, + closed=closed, + label=label, on=on, level=level, origin=origin, diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index ef51f96575..1fb86d7bd6 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -30,7 +30,7 @@ class UnknownLocationWarning(Warning): class CleanupFailedWarning(Warning): - """Bigframes failed to clean up a table resource.""" + """Bigframes failed to clean up a table or function resource.""" class DefaultIndexWarning(Warning): diff --git a/bigframes/functions/function_typing.py b/bigframes/functions/function_typing.py index 44ee071001..30804f317c 100644 --- a/bigframes/functions/function_typing.py +++ b/bigframes/functions/function_typing.py @@ -60,8 +60,22 @@ class UnsupportedTypeError(ValueError): def __init__(self, type_, supported_types): self.type = type_ self.supported_types = supported_types + + types_to_format = supported_types + if isinstance(supported_types, dict): + types_to_format = supported_types.keys() + + supported_types_str = ", ".join( + sorted( + [ + getattr(supported, "__name__", supported) + for supported in types_to_format + ] + ) + ) + super().__init__( - f"'{type_}' must be one of the supported types ({supported_types}) " + f"'{getattr(type_, '__name__', type_)}' must be one of the supported types ({supported_types_str}) " "or a list of one of those types." 
) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index cb03943ada..2a0beb3fb3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -121,6 +121,7 @@ GeoStBufferOp, GeoStDistanceOp, GeoStLengthOp, + GeoStRegionStatsOp, GeoStSimplifyOp, ) from bigframes.operations.json_ops import ( @@ -415,12 +416,13 @@ "geo_st_geogpoint_op", "geo_st_intersection_op", "geo_st_isclosed_op", - "GeoStBufferOp", - "GeoStLengthOp", - "GeoStSimplifyOp", "geo_x_op", "geo_y_op", + "GeoStBufferOp", "GeoStDistanceOp", + "GeoStLengthOp", + "GeoStRegionStatsOp", + "GeoStSimplifyOp", # AI ops "AIClassify", "AIGenerate", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 86e913d543..75fef1b832 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -13,6 +13,7 @@ # limitations under the License. import dataclasses +from typing import Optional from bigframes import dtypes from bigframes.operations import base_ops @@ -135,6 +136,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.FLOAT_DTYPE +@dataclasses.dataclass(frozen=True) +class GeoStRegionStatsOp(base_ops.UnaryOp): + """See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats""" + + name = "geo_st_regionstats" + raster_id: str + band: Optional[str] + include: Optional[str] + options: Optional[str] + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.struct_type( + [ + ("min", dtypes.FLOAT_DTYPE), + ("max", dtypes.FLOAT_DTYPE), + ("sum", dtypes.FLOAT_DTYPE), + ("count", dtypes.INT_DTYPE), + ("mean", dtypes.FLOAT_DTYPE), + ("area", dtypes.FLOAT_DTYPE), + ] + ) + + @dataclasses.dataclass(frozen=True) class GeoStSimplifyOp(base_ops.UnaryOp): name = "st_simplify" diff --git a/bigframes/series.py b/bigframes/series.py index ef0da32dfc..c11cc48394 100644 --- 
a/bigframes/series.py +++ b/bigframes/series.py @@ -2505,7 +2505,7 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: ) @validations.requires_ordering() - def _resample( + def resample( self, rule: str, *, @@ -2519,43 +2519,6 @@ def _resample( Literal["epoch", "start", "start_day", "end", "end_day"], ] = "start_day", ) -> bigframes.core.groupby.SeriesGroupBy: - """Internal function to support resample. Resample time-series data. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> data = { - ... "timestamp_col": pd.date_range( - ... start="2021-01-01 13:00:00", periods=30, freq="1s" - ... ), - ... "int64_col": range(30), - ... } - >>> s = bpd.DataFrame(data).set_index("timestamp_col") - >>> s._resample(rule="7s", origin="epoch").min() - int64_col - 2021-01-01 12:59:56 0 - 2021-01-01 13:00:03 3 - 2021-01-01 13:00:10 10 - 2021-01-01 13:00:17 17 - 2021-01-01 13:00:24 24 - - [5 rows x 1 columns] - - - Args: - rule (str): - The offset string representing target conversion. - level (str or int, default None): - For a MultiIndex, level (name or number) to use for resampling. - level must be datetime-like. - origin(str, default 'start_day'): - The timestamp on which to adjust the grouping. Must be one of the following: - 'epoch': origin is 1970-01-01 - 'start': origin is the first value of the timeseries - 'start_day': origin is the first day at midnight of the timeseries - Returns: - SeriesGroupBy: SeriesGroupBy object. 
- """ block = self._block._generate_resample_label( rule=rule, closed=closed, diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index 3c1757806b..bdc6e7f59c 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -16,15 +16,21 @@ import threading from typing import List, Optional, Sequence import uuid +import warnings +from google.api_core import retry as api_core_retry import google.cloud.bigquery as bigquery from bigframes import constants import bigframes.core.events +import bigframes.exceptions as bfe from bigframes.session import temporary_storage import bigframes.session._io.bigquery as bf_io_bigquery _TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}" +# UDFs older than this many days are considered stale and will be deleted +# from the anonymous dataset before creating a new UDF. +_UDF_CLEANUP_THRESHOLD_DAYS = 3 class AnonymousDatasetManager(temporary_storage.TemporaryStorageManager): @@ -137,8 +143,46 @@ def generate_unique_resource_id(self) -> bigquery.TableReference: ) return self.dataset.table(table_id) + def _cleanup_old_udfs(self): + """Clean up old UDFs in the anonymous dataset.""" + dataset = self.dataset + routines = list(self.bqclient.list_routines(dataset)) + cleanup_cutoff_time = datetime.datetime.now( + datetime.timezone.utc + ) - datetime.timedelta(days=_UDF_CLEANUP_THRESHOLD_DAYS) + + for routine in routines: + if ( + routine.created < cleanup_cutoff_time + and routine._properties["routineType"] == "SCALAR_FUNCTION" + ): + try: + self.bqclient.delete_routine( + routine.reference, + not_found_ok=True, + retry=api_core_retry.Retry(timeout=0), + ) + except Exception as e: + msg = bfe.format_message( + f"Unable to clean this old UDF '{routine.reference}': {e}" + ) + warnings.warn(msg, category=bfe.CleanupFailedWarning) + def close(self): """Delete tables that were created with this session's session_id.""" for table_ref in self._table_ids: 
self.bqclient.delete_table(table_ref, not_found_ok=True) self._table_ids.clear() + + try: + # Before closing the session, attempt to clean up any uncollected, + # old Python UDFs residing in the anonymous dataset. These UDFs + # accumulate over time and can eventually exceed resource limits. + # See more from b/450913424. + self._cleanup_old_udfs() + except Exception as e: + # Log a warning on the failure, do not interrupt the workflow. + msg = bfe.format_message( + f"Failed to clean up the old Python UDFs before closing the session: {e}" + ) + warnings.warn(msg, category=bfe.CleanupFailedWarning) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 6b16fe6bfd..4e67eac9ae 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -45,7 +45,6 @@ import google.cloud.bigquery.table from google.cloud.bigquery_storage_v1 import types as bq_storage_types import pandas -import pyarrow as pa import bigframes._tools import bigframes._tools.strings @@ -1307,22 +1306,6 @@ def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: return configuration -def _has_json_arrow_type(arrow_type: pa.DataType) -> bool: - """ - Searches recursively for JSON array type within a PyArrow DataType. - """ - if arrow_type == bigframes.dtypes.JSON_ARROW_TYPE: - return True - if pa.types.is_list(arrow_type): - return _has_json_arrow_type(arrow_type.value_type) - if pa.types.is_struct(arrow_type): - for i in range(arrow_type.num_fields): - if _has_json_arrow_type(arrow_type.field(i).type): - return True - return False - return False - - def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): """ Determines whether a datatype is supported by bq load jobs. 
@@ -1339,7 +1322,9 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): if column_type == bigframes.dtypes.JSON_DTYPE: return - if isinstance(column_type, pandas.ArrowDtype) and _has_json_arrow_type( + if isinstance( + column_type, pandas.ArrowDtype + ) and bigframes.dtypes.contains_db_dtypes_json_arrow_type( column_type.pyarrow_dtype ): raise NotImplementedError( diff --git a/bigframes/version.py b/bigframes/version.py index cf7562a306..a129daf092 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.28.0" +__version__ = "2.29.0" # {x-release-please-start-date} -__release_date__ = "2025-11-03" +__release_date__ = "2025-11-10" # {x-release-please-end} diff --git a/docs/conf.py b/docs/conf.py index 23ec7a6b36..9d9e9ebd79 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,9 +24,11 @@ # All configuration values have a default; values that are commented out # serve to show the default. +from __future__ import annotations + import os -import shlex import sys +from typing import Any # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -56,7 +58,7 @@ "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.viewcode", - "recommonmark", + "myst_parser", ] # autodoc/autosummary flags @@ -98,7 +100,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en-US" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -148,19 +150,27 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-html_theme = "alabaster" +html_theme = "pydata_sphinx_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. +# https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/layout.html#references html_theme_options = { - "description": "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", - "github_user": "googleapis", - "github_repo": "python-bigquery-dataframes", - "github_banner": True, - "font_family": "'Roboto', Georgia, sans", - "head_font_family": "'Roboto', Georgia, serif", - "code_font_family": "'Roboto Mono', 'Consolas', monospace", + "github_url": "https://github.com/googleapis/python-bigquery-dataframes", + "logo": { + "text": "BigQuery DataFrames (BigFrames)", + }, + "external_links": [ + { + "name": "Getting started", + "url": "https://docs.cloud.google.com/bigquery/docs/dataframes-quickstart", + }, + { + "name": "User guide", + "url": "https://docs.cloud.google.com/bigquery/docs/bigquery-dataframes-introduction", + }, + ], } # Add any paths that contain custom themes here, relative to this directory. @@ -264,7 +274,7 @@ # -- Options for LaTeX output --------------------------------------------- -latex_elements = { +latex_elements: dict[str, Any] = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). 
@@ -282,7 +292,7 @@ ( root_doc, "bigframes.tex", - "bigframes Documentation", + "BigQuery DataFrames (BigFrames)", author, "manual", ) @@ -317,7 +327,7 @@ ( root_doc, "bigframes", - "bigframes Documentation", + "BigQuery DataFrames (BigFrames)", [author], 1, ) @@ -336,7 +346,7 @@ ( root_doc, "bigframes", - "bigframes Documentation", + "BigQuery DataFrames (BigFrames)", author, "bigframes", "bigframes Library", diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c2af915721..3b99bbeae7 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -35,7 +35,16 @@ "execution_count": 2, "id": "ca22f059", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). 
Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n", + " warnings.warn(message, FutureWarning)\n" + ] + } + ], "source": [ "import bigframes.pandas as bpd" ] @@ -142,9 +151,9 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aafd4f912b5f42e0896aa5f0c2c62620", + "model_id": "47795eaa10f149aeb99574232c0936eb", "version_major": 2, - "version_minor": 0 + "version_minor": 1 }, "text/plain": [ "TableWidget(page_size=10, row_count=5552452, table_html='" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d1794b42579542a8980bd158e521bd3e", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", + " *\n", + " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + " LIMIT 5;\n", + "\"\"\")" + ] } ], "metadata": { "kernelspec": { - "display_name": "3.10.18", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -341,7 +461,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/noxfile.py b/noxfile.py index 8334fcb0e1..b02952f9c2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -515,24 +515,14 @@ def cover(session): session.run("coverage", "erase") -@nox.session(python=DEFAULT_PYTHON_VERSION) +@nox.session(python="3.13") def docs(session): """Build the docs for this library.""" session.install("-e", ".[scikit-learn]") session.install( - # We need to pin to specific versions of the `sphinxcontrib-*` packages - # which still support sphinx 4.x. - # See https://github.com/googleapis/sphinx-docfx-yaml/issues/344 - # and https://github.com/googleapis/sphinx-docfx-yaml/issues/345. - "sphinxcontrib-applehelp==1.0.4", - "sphinxcontrib-devhelp==1.0.2", - "sphinxcontrib-htmlhelp==2.0.1", - "sphinxcontrib-qthelp==1.0.3", - "sphinxcontrib-serializinghtml==1.1.5", - SPHINX_VERSION, - "alabaster", - "recommonmark", - "anywidget", + "sphinx==8.2.3", + "myst-parser==4.0.1", + "pydata-sphinx-theme==0.16.1", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) @@ -562,19 +552,10 @@ def docfx(session): session.install("-e", ".[scikit-learn]") session.install( - # We need to pin to specific versions of the `sphinxcontrib-*` packages - # which still support sphinx 4.x. 
- # See https://github.com/googleapis/sphinx-docfx-yaml/issues/344 - # and https://github.com/googleapis/sphinx-docfx-yaml/issues/345. - "sphinxcontrib-applehelp==1.0.4", - "sphinxcontrib-devhelp==1.0.2", - "sphinxcontrib-htmlhelp==2.0.1", - "sphinxcontrib-qthelp==1.0.3", - "sphinxcontrib-serializinghtml==1.1.5", SPHINX_VERSION, - "alabaster", - "recommonmark", - "gcp-sphinx-docfx-yaml==3.0.1", + "pydata-sphinx-theme==0.13.3", + "myst-parser==0.18.1", + "gcp-sphinx-docfx-yaml==3.2.4", "anywidget", ) @@ -599,7 +580,7 @@ def docfx(session): "sphinx.ext.napoleon," "sphinx.ext.todo," "sphinx.ext.viewcode," - "recommonmark" + "myst_parser" ), "-b", "html", diff --git a/owlbot.py b/owlbot.py index b9145d4367..4a189ff0e2 100644 --- a/owlbot.py +++ b/owlbot.py @@ -44,6 +44,7 @@ excludes=[ # Need a combined LICENSE for all vendored packages. "LICENSE", + "docs/conf.py", # Multi-processing note isn't relevant, as bigframes is responsible for # creating clients, not the end user. "docs/multiprocessing.rst", @@ -57,8 +58,9 @@ ".kokoro/build.sh", ".kokoro/continuous/common.cfg", ".kokoro/presubmit/common.cfg", - # Temporary workaround to update docs job to use python 3.10 ".github/workflows/docs.yml", + ".github/workflows/lint.yml", + ".github/workflows/unittest.yml", ], ) @@ -114,13 +116,6 @@ "recursive-include bigframes *.json *.proto *.js *.css py.typed", ) -# Fixup the documentation. 
-assert 1 == s.replace( # docs/conf.py - ["docs/conf.py"], - re.escape("Google Cloud Client Libraries for bigframes"), - "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", -) - # Don't omit `*/core/*.py` when counting test coverages assert 1 == s.replace( # .coveragerc [".coveragerc"], diff --git a/samples/snippets/st_regionstats_test.py b/samples/snippets/st_regionstats_test.py new file mode 100644 index 0000000000..f0f4963a82 --- /dev/null +++ b/samples/snippets/st_regionstats_test.py @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Code sample for https://docs.cloud.google.com/bigquery/docs/raster-data#analytics-hub-source""" + + +def test_st_regionstats() -> None: + project_id = "bigframes-dev" + + # [START bigquery_dataframes_st_regionstats] + import datetime + from typing import cast + + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # TODO: Set the project_id to your Google Cloud project ID. + # project_id = "your-project-id" + bpd.options.bigquery.project = project_id + + # TODO: Set the dataset_id to the ID of the dataset that contains the + # `climate` table. This is likely a linked dataset to Earth Engine. + # See: https://cloud.google.com/bigquery/docs/link-earth-engine + linked_dataset = "era5_land_daily_aggregated" + + # For the best efficiency, use partial ordering mode. + bpd.options.bigquery.ordering_mode = "partial" + + # Load the table of country boundaries. 
+ countries = bpd.read_gbq("bigquery-public-data.overture_maps.division_area") + + # Filter to just the countries. + countries = countries[countries["subtype"] == "country"].copy() + countries["name"] = countries["names"].struct.field("primary") + countries["simplified_geometry"] = bbq.st_simplify( + countries["geometry"], + tolerance_meters=10_000, + ) + + # Get the reference to the temperature data from a linked dataset. + # Note: This sample assumes you have a linked dataset to Earth Engine. + image_href = ( + bpd.read_gbq(f"{project_id}.{linked_dataset}.climate") + .set_index("start_datetime") + .loc[[datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)], :] + ) + raster_id = image_href["assets"].struct.field("image").struct.field("href") + raster_id = raster_id.item() + stats = bbq.st_regionstats( + countries["simplified_geometry"], + raster_id=cast(str, raster_id), + band="temperature_2m", + ) + + # Extract the mean and convert from Kelvin to Celsius. + countries["mean_temperature"] = stats.struct.field("mean") - 273.15 + + # Sort by the mean temperature to find the warmest countries. 
+ result = countries[["name", "mean_temperature"]].sort_values( + "mean_temperature", ascending=False + ) + print(result.head(10)) + # [END bigquery_dataframes_st_regionstats] + + assert len(result) > 0 + + +if __name__ == "__main__": + test_st_regionstats() diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 43f7df4dd6..1c052504d3 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -205,6 +205,9 @@ def generate_pandas_api_coverage(): def generate_sklearn_api_coverage(): """Explore all SKLearn modules, and for each item contained generate a regex to detect it being imported, and record whether we implement it""" + + import sklearn # noqa + sklearn_modules = [ "sklearn", "sklearn.model_selection", diff --git a/setup.py b/setup.py index abc760b691..fa663f66d5 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,8 @@ "pydata-google-auth >=1.8.2", "requests >=2.27.1", "shapely >=1.8.5", - "sqlglot >=23.6.3", + # 25.20.0 introduces this fix https://github.com/TobikoData/sqlmesh/issues/3095 for rtrim/ltrim. + "sqlglot >=25.20.0", "tabulate >=0.9", "ipywidgets >=7.7.1", "humanize >=4.6.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index eceec07dc4..b8dc8697d6 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -21,7 +21,7 @@ pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 shapely==1.8.5 -sqlglot==23.6.3 +sqlglot==25.20.0 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 2f08a695e9..9c4fcf58b1 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -70,23 +70,6 @@ def _hash_digest_file(hasher, filepath): hasher.update(chunk) -@pytest.fixture(scope="session") -def normalize_connection_id(): - """Normalizes the connection ID by casefolding only the LOCATION component. 
- - Connection format: PROJECT.LOCATION.CONNECTION_NAME - Only LOCATION is case-insensitive; PROJECT and CONNECTION_NAME must be lowercase. - """ - - def normalize(connection_id: str) -> str: - parts = connection_id.split(".") - if len(parts) == 3: - return f"{parts[0]}.{parts[1].casefold()}.{parts[2]}" - return connection_id # Return unchanged if invalid format - - return normalize - - @pytest.fixture(scope="session") def tokyo_location() -> str: return TOKYO_LOCATION @@ -212,7 +195,8 @@ def bq_connection_name() -> str: @pytest.fixture(scope="session") def bq_connection(bigquery_client: bigquery.Client, bq_connection_name: str) -> str: - return f"{bigquery_client.project}.{bigquery_client.location}.{bq_connection_name}" + # TODO(b/458169181): LOCATION casefold is needed for the multimodal backend bug. Remove after the bug is fixed. + return f"{bigquery_client.project}.{bigquery_client.location.casefold()}.{bq_connection_name}" @pytest.fixture(scope="session", autouse=True) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 9ba8126dc6..7963fabd0b 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -52,7 +52,6 @@ def images_output_uris(images_output_folder: str) -> list[str]: ] -@pytest.mark.skip(reason="b/457416070") def test_blob_exif( bq_connection: str, session: bigframes.Session, @@ -104,7 +103,6 @@ def test_blob_exif_verbose( assert content_series.dtype == dtypes.JSON_DTYPE -@pytest.mark.skip(reason="b/457416070") def test_blob_image_blur_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -138,7 +136,6 @@ def test_blob_image_blur_to_series( assert not actual.blob.size().isna().any() -@pytest.mark.skip(reason="b/457416070") def test_blob_image_blur_to_series_verbose( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -166,7 +163,6 @@ def test_blob_image_blur_to_series_verbose( assert not actual.blob.size().isna().any() 
-@pytest.mark.skip(reason="b/457416070") def test_blob_image_blur_to_folder( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -199,7 +195,6 @@ def test_blob_image_blur_to_folder( assert not actual.blob.size().isna().any() -@pytest.mark.skip(reason="b/457416070") def test_blob_image_blur_to_folder_verbose( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -259,7 +254,6 @@ def test_blob_image_blur_to_bq_verbose(images_mm_df: bpd.DataFrame, bq_connectio assert content_series.dtype == dtypes.BYTES_DTYPE -@pytest.mark.skip(reason="b/457416070") def test_blob_image_resize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -297,7 +291,6 @@ def test_blob_image_resize_to_series( assert not actual.blob.size().isna().any() -@pytest.mark.skip(reason="b/457416070") def test_blob_image_resize_to_series_verbose( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -332,7 +325,6 @@ def test_blob_image_resize_to_series_verbose( assert not actual.blob.size().isna().any() -@pytest.mark.skip(reason="b/457416070") def test_blob_image_resize_to_folder( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -366,7 +358,6 @@ def test_blob_image_resize_to_folder( assert not actual.blob.size().isna().any() -@pytest.mark.skip(reason="b/457416070") def test_blob_image_resize_to_folder_verbose( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -429,7 +420,6 @@ def test_blob_image_resize_to_bq_verbose( assert content_series.dtype == dtypes.BYTES_DTYPE -@pytest.mark.skip(reason="b/457416070") def test_blob_image_normalize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -502,7 +492,6 @@ def test_blob_image_normalize_to_series_verbose( assert hasattr(content_series, "blob") -@pytest.mark.skip(reason="b/457416070") def test_blob_image_normalize_to_folder( images_mm_df: bpd.DataFrame, bq_connection: str, @@ -609,7 +598,6 @@ def test_blob_image_normalize_to_bq_verbose( assert content_series.dtype == dtypes.BYTES_DTYPE -@pytest.mark.skip(reason="b/457416070") def 
test_blob_pdf_extract( pdf_mm_df: bpd.DataFrame, bq_connection: str, @@ -645,7 +633,6 @@ def test_blob_pdf_extract( ), f"Item (verbose=False): Expected keyword '{keyword}' not found in extracted text. " -@pytest.mark.skip(reason="b/457416070") def test_blob_pdf_extract_verbose( pdf_mm_df: bpd.DataFrame, bq_connection: str, @@ -683,7 +670,6 @@ def test_blob_pdf_extract_verbose( ), f"Item (verbose=True): Expected keyword '{keyword}' not found in extracted text. " -@pytest.mark.skip(reason="b/457416070") def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, bq_connection: str): actual = ( pdf_mm_df["pdf"] @@ -723,7 +709,6 @@ def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, bq_connection: str): ), f"Item (verbose=False): Expected keyword '{keyword}' not found in extracted text. " -@pytest.mark.skip(reason="b/457416070") def test_blob_pdf_chunk_verbose(pdf_mm_df: bpd.DataFrame, bq_connection: str): actual = ( pdf_mm_df["pdf"] diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index d28146498d..a525defe59 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -13,6 +13,7 @@ # limitations under the License. import datetime +from unittest import mock import google.cloud.bigquery as bigquery import google.cloud.exceptions @@ -138,3 +139,35 @@ def test_clean_up_via_context_manager(session_creator): bqclient.delete_table(full_id_1) with pytest.raises(google.cloud.exceptions.NotFound): bqclient.delete_table(full_id_2) + + +def test_cleanup_old_udfs(session: bigframes.Session): + routine_ref = session._anon_dataset_manager.dataset.routine("test_routine_cleanup") + + # Create a dummy function to be deleted. 
+ create_function_sql = f""" +CREATE OR REPLACE FUNCTION `{routine_ref.project}.{routine_ref.dataset_id}.{routine_ref.routine_id}`(x INT64) +RETURNS INT64 LANGUAGE python +OPTIONS (entry_point='dummy_func', runtime_version='python-3.11') +AS r''' +def dummy_func(x): + return x + 1 +''' + """ + session.bqclient.query(create_function_sql).result() + + assert session.bqclient.get_routine(routine_ref) is not None + + mock_routine = mock.MagicMock(spec=bigquery.Routine) + mock_routine.created = datetime.datetime.now( + datetime.timezone.utc + ) - datetime.timedelta(days=100) + mock_routine.reference = routine_ref + mock_routine._properties = {"routineType": "SCALAR_FUNCTION"} + routines = [mock_routine] + + with mock.patch.object(session.bqclient, "list_routines", return_value=routines): + session._anon_dataset_manager._cleanup_old_udfs() + + with pytest.raises(google.cloud.exceptions.NotFound): + session.bqclient.get_routine(routine_ref) diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py index 6df4a7a528..e5af45ec2b 100644 --- a/tests/system/small/bigquery/test_ai.py +++ b/tests/system/small/bigquery/test_ai.py @@ -273,10 +273,11 @@ def test_ai_if(session): assert result.dtype == dtypes.BOOL_DTYPE -@pytest.mark.skip(reason="b/457416070") -def test_ai_if_multi_model(session): +def test_ai_if_multi_model(session, bq_connection): df = session.from_glob_path( - "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + "gs://bigframes-dev-testing/a_multimodel/images/*", + name="image", + connection=bq_connection, ) result = bbq.ai.if_((df["image"], " contains an animal")) @@ -294,10 +295,11 @@ def test_ai_classify(session): assert result.dtype == dtypes.STRING_DTYPE -@pytest.mark.skip(reason="b/457416070") -def test_ai_classify_multi_model(session): +def test_ai_classify_multi_model(session, bq_connection): df = session.from_glob_path( - "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + 
"gs://bigframes-dev-testing/a_multimodel/images/*", + name="image", + connection=bq_connection, ) result = bbq.ai.classify(df["image"], ["photo", "cartoon"]) diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 5da113a5e1..5ada4fabb0 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -12,36 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable from unittest import mock import IPython.display import pandas as pd -import pytest import bigframes import bigframes.pandas as bpd def test_blob_create_from_uri_str( - bq_connection: str, - session: bigframes.Session, - images_uris, - normalize_connection_id: Callable[[str], str], + bq_connection: str, session: bigframes.Session, images_uris ): uri_series = bpd.Series(images_uris, session=session) blob_series = uri_series.str.to_blob(connection=bq_connection) pd_blob_df = blob_series.struct.explode().to_pandas() - pd_blob_df["authorizer"] = pd_blob_df["authorizer"].apply(normalize_connection_id) expected_pd_df = pd.DataFrame( { "uri": images_uris, "version": [None, None], - "authorizer": [ - normalize_connection_id(bq_connection), - normalize_connection_id(bq_connection), - ], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], "details": [None, None], } ) @@ -52,11 +43,7 @@ def test_blob_create_from_uri_str( def test_blob_create_from_glob_path( - bq_connection: str, - session: bigframes.Session, - images_gcs_path, - images_uris, - normalize_connection_id: Callable[[str], str], + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): blob_df = session.from_glob_path( images_gcs_path, connection=bq_connection, name="blob_col" @@ -68,16 +55,12 @@ def test_blob_create_from_glob_path( .sort_values("uri") .reset_index(drop=True) ) - pd_blob_df["authorizer"] = pd_blob_df["authorizer"].apply(normalize_connection_id) 
expected_df = pd.DataFrame( { "uri": images_uris, "version": [None, None], - "authorizer": [ - normalize_connection_id(bq_connection), - normalize_connection_id(bq_connection), - ], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], "details": [None, None], } ) @@ -88,11 +71,7 @@ def test_blob_create_from_glob_path( def test_blob_create_read_gbq_object_table( - bq_connection: str, - session: bigframes.Session, - images_gcs_path, - images_uris, - normalize_connection_id: Callable[[str], str], + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): obj_table = session._create_object_table(images_gcs_path, bq_connection) @@ -104,15 +83,11 @@ def test_blob_create_read_gbq_object_table( .sort_values("uri") .reset_index(drop=True) ) - pd_blob_df["authorizer"] = pd_blob_df["authorizer"].apply(normalize_connection_id) expected_df = pd.DataFrame( { "uri": images_uris, "version": [None, None], - "authorizer": [ - normalize_connection_id(bq_connection), - normalize_connection_id(bq_connection), - ], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], "details": [None, None], } ) @@ -122,7 +97,6 @@ def test_blob_create_read_gbq_object_table( ) -@pytest.mark.skip(reason="b/457416070") def test_display_images(monkeypatch, images_mm_df: bpd.DataFrame): mock_display = mock.Mock() monkeypatch.setattr(IPython.display, "display", mock_display) diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py index c411c01f13..47d4d2aa04 100644 --- a/tests/system/small/blob/test_properties.py +++ b/tests/system/small/blob/test_properties.py @@ -12,12 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import annotations - -from typing import Callable - import pandas as pd -import pytest import bigframes.dtypes as dtypes import bigframes.pandas as bpd @@ -32,19 +27,10 @@ def test_blob_uri(images_uris: list[str], images_mm_df: bpd.DataFrame): ) -def test_blob_authorizer( - images_mm_df: bpd.DataFrame, - bq_connection: str, - normalize_connection_id: Callable[[str], str], -): +def test_blob_authorizer(images_mm_df: bpd.DataFrame, bq_connection: str): actual = images_mm_df["blob_col"].blob.authorizer().to_pandas() - actual = actual.apply(normalize_connection_id) expected = pd.Series( - [ - normalize_connection_id(bq_connection), - normalize_connection_id(bq_connection), - ], - name="authorizer", + [bq_connection.casefold(), bq_connection.casefold()], name="authorizer" ) pd.testing.assert_series_equal( @@ -52,7 +38,6 @@ def test_blob_authorizer( ) -@pytest.mark.skip(reason="b/457416070") def test_blob_version(images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.version().to_pandas() expected = pd.Series(["1753907851152593", "1753907851111538"], name="version") @@ -62,7 +47,6 @@ def test_blob_version(images_mm_df: bpd.DataFrame): ) -@pytest.mark.skip(reason="b/457416070") def test_blob_metadata(images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.metadata().to_pandas() expected = pd.Series( @@ -87,7 +71,6 @@ def test_blob_metadata(images_mm_df: bpd.DataFrame): pd.testing.assert_series_equal(actual, expected) -@pytest.mark.skip(reason="b/457416070") def test_blob_content_type(images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.content_type().to_pandas() expected = pd.Series(["image/jpeg", "image/jpeg"], name="content_type") @@ -97,7 +80,6 @@ def test_blob_content_type(images_mm_df: bpd.DataFrame): ) -@pytest.mark.skip(reason="b/457416070") def test_blob_md5_hash(images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.md5_hash().to_pandas() expected = pd.Series( @@ -110,7 +92,6 @@ def 
test_blob_md5_hash(images_mm_df: bpd.DataFrame): ) -@pytest.mark.skip(reason="b/457416070") def test_blob_size(images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.size().to_pandas() expected = pd.Series([338390, 43333], name="size") @@ -120,7 +101,6 @@ def test_blob_size(images_mm_df: bpd.DataFrame): ) -@pytest.mark.skip(reason="b/457416070") def test_blob_updated(images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.updated().to_pandas() expected = pd.Series( diff --git a/tests/system/small/core/test_reshape.py b/tests/system/small/core/test_reshape.py new file mode 100644 index 0000000000..0850bf50bb --- /dev/null +++ b/tests/system/small/core/test_reshape.py @@ -0,0 +1,120 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import pandas.testing +import pytest + +from bigframes import session +from bigframes.core.reshape import merge + + +@pytest.mark.parametrize( + ("left_on", "right_on", "left_index", "right_index"), + [ + ("col_a", None, False, True), + (None, "col_d", True, False), + (None, None, True, True), + ], +) +@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) +def test_join_with_index( + session: session.Session, left_on, right_on, left_index, right_index, how +): + df1 = pd.DataFrame({"col_a": [1, 2, 3], "col_b": [2, 3, 4]}, index=[1, 2, 3]) + bf1 = session.read_pandas(df1) + df2 = pd.DataFrame({"col_c": [1, 2, 3], "col_d": [2, 3, 4]}, index=[2, 3, 4]) + bf2 = session.read_pandas(df2) + + bf_result = merge.merge( + bf1, + bf2, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + ).to_pandas() + pd_result = pd.merge( + df1, + df2, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + ) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("on", "left_on", "right_on", "left_index", "right_index"), + [ + (None, "col_a", None, True, False), + (None, None, "col_c", None, True), + ("col_a", None, None, True, True), + ], +) +def test_join_with_index_invalid_index_arg_raise_error( + session: session.Session, on, left_on, right_on, left_index, right_index +): + df1 = pd.DataFrame({"col_a": [1, 2, 3], "col_b": [2, 3, 4]}, index=[1, 2, 3]) + bf1 = session.read_pandas(df1) + df2 = pd.DataFrame({"col_c": [1, 2, 3], "col_d": [2, 3, 4]}, index=[2, 3, 4]) + bf2 = session.read_pandas(df2) + + with pytest.raises(ValueError): + merge.merge( + bf1, + bf2, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ).to_pandas() + + +@pytest.mark.parametrize( + ("left_on", "right_on", "left_index", "right_index"), + [ + 
(["col_a", "col_b"], None, False, True), + (None, ["col_c", "col_d"], True, False), + (None, None, True, True), + ], +) +@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) +def test_join_with_multiindex_raises_error( + session: session.Session, left_on, right_on, left_index, right_index, how +): + multi_idx1 = pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 5)]) + df1 = pd.DataFrame({"col_a": [1, 2, 3], "col_b": [2, 3, 4]}, index=multi_idx1) + bf1 = session.read_pandas(df1) + multi_idx2 = pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 2)]) + df2 = pd.DataFrame({"col_c": [1, 2, 3], "col_d": [2, 3, 4]}, index=multi_idx2) + bf2 = session.read_pandas(df2) + + with pytest.raises(ValueError): + merge.merge( + bf1, + bf2, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + ) diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py index 3e6d4843de..4ed826d2ae 100644 --- a/tests/system/small/engines/test_aggregation.py +++ b/tests/system/small/engines/test_aggregation.py @@ -111,7 +111,7 @@ def test_engines_unary_aggregates( assert_equivalence_execution(node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( "op", [agg_ops.std_op, agg_ops.var_op, agg_ops.PopVarOp()], diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index 5641f91a9a..01d4dad849 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -329,7 +329,7 @@ def test_engines_where_op(scalars_array_value: array_value.ArrayValue, engine): assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", 
"bq-sqlglot"], indirect=True) def test_engines_coalesce_op(scalars_array_value: array_value.ArrayValue, engine): arr, _ = scalars_array_value.compute_values( [ diff --git a/tests/system/small/engines/test_join.py b/tests/system/small/engines/test_join.py index 91c199a437..7ea24a554d 100644 --- a/tests/system/small/engines/test_join.py +++ b/tests/system/small/engines/test_join.py @@ -55,7 +55,7 @@ def test_engines_join_on_coerced_key( assert_equivalence_execution(result.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize("join_type", ["left", "inner", "right", "outer"]) def test_engines_join_multi_key( scalars_array_value: array_value.ArrayValue, @@ -90,7 +90,7 @@ def test_engines_cross_join( assert_equivalence_execution(result.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( ("left_key", "right_key"), [ diff --git a/tests/system/small/engines/test_read_local.py b/tests/system/small/engines/test_read_local.py index abdd29c4ac..257bddd917 100644 --- a/tests/system/small/engines/test_read_local.py +++ b/tests/system/small/engines/test_read_local.py @@ -88,8 +88,9 @@ def test_engines_read_local_w_zero_row_source( assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) -# TODO: Fix sqlglot impl -@pytest.mark.parametrize("engine", ["polars", "bq", "pyarrow"], indirect=True) +@pytest.mark.parametrize( + "engine", ["polars", "bq", "pyarrow", "bq-sqlglot"], indirect=True +) def test_engines_read_local_w_nested_source( fake_session: bigframes.Session, nested_data_source: local_data.ManagedArrowTable, diff --git a/tests/system/small/engines/test_slicing.py b/tests/system/small/engines/test_slicing.py index 7340ff145b..022758893d 100644 --- 
a/tests/system/small/engines/test_slicing.py +++ b/tests/system/small/engines/test_slicing.py @@ -24,7 +24,7 @@ REFERENCE_ENGINE = polars_executor.PolarsExecutor() -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( ("start", "stop", "step"), [ diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 26c4b89b24..805505ecd5 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -1646,7 +1646,7 @@ def func_tuple(x): with pytest.raises( ValueError, - match=r"'typing\.Sequence\[int\]' must be one of the supported types", + match=r"must be one of the supported types", ): bff.remote_function( input_types=int, diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index fe34f9c02b..48a69f522c 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -21,7 +21,6 @@ from bigframes.testing import utils -@pytest.mark.skip(reason="b/457416070") @pytest.mark.flaky(retries=2) def test_multimodal_embedding_generator_predict_default_params_success( images_mm_df, session, bq_connection diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 6d3cf6fe88..65f47fe4e3 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -143,7 +143,7 @@ def test_read_gbq_colab_repr_avoids_requery(maybe_ordered_session): def test_read_gbq_colab_includes_formatted_scalars(session): pyformat_args = { "some_integer": 123, - "some_string": "This could be dangerous, but we escape it", + "some_string": "This could be dangerous.", # This is not a supported type, but ignored if not referenced. 
"some_object": object(), } @@ -153,7 +153,7 @@ def test_read_gbq_colab_includes_formatted_scalars(session): df = session._read_gbq_colab( """ SELECT {some_integer} as some_integer, - {some_string} as some_string, + '{some_string}' as some_string, '{{escaped}}' as escaped """, pyformat_args=pyformat_args, @@ -165,7 +165,7 @@ def test_read_gbq_colab_includes_formatted_scalars(session): { "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()), "some_string": pandas.Series( - ["This could be dangerous, but we escape it"], + ["This could be dangerous."], dtype="string[pyarrow]", ), "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"), diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 79f8efd00f..475f98407b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5915,21 +5915,15 @@ def test_dataframe_explode_xfail(col_names): pytest.param("datetime_col", "5M", "epoch"), pytest.param("datetime_col", "3Q", "start_day"), pytest.param("datetime_col", "3YE", "start"), - pytest.param( - "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) - ), - pytest.param( - "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) - ), ], ) -def test__resample_with_column( +def test_resample_with_column( scalars_df_index, scalars_pandas_df_index, on, rule, origin ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") bf_result = ( - scalars_df_index._resample(rule=rule, on=on, origin=origin)[ + scalars_df_index.resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] ] .max() @@ -5943,30 +5937,54 @@ def test__resample_with_column( ) +@pytest.mark.parametrize("index_col", ["timestamp_col", "datetime_col"]) +@pytest.mark.parametrize( + ("index_append", "level"), + [(True, 1), (False, None), (False, 0)], +) @pytest.mark.parametrize( - ("append", "level", "col", "rule"), + 
"rule", [ - pytest.param(False, None, "timestamp_col", "100d"), - pytest.param(True, 1, "timestamp_col", "1200h"), - pytest.param(False, None, "datetime_col", "100d"), + # TODO(tswast): support timedeltas and dataoffsets. + # TODO(tswast): support bins that default to "right". + "100d", + "1200h", ], ) -def test__resample_with_index( - scalars_df_index, scalars_pandas_df_index, append, level, col, rule +# TODO(tswast): support "right" +@pytest.mark.parametrize("closed", ["left", None]) +# TODO(tswast): support "right" +@pytest.mark.parametrize("label", ["left", None]) +@pytest.mark.parametrize( + "origin", + ["epoch", "start", "start_day"], # TODO(tswast): support end, end_day. +) +def test_resample_with_index( + scalars_df_index, + scalars_pandas_df_index, + index_append, + level, + index_col, + rule, + closed, + origin, + label, ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.set_index(col, append=append) - scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) + scalars_df_index = scalars_df_index.set_index(index_col, append=index_append) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + index_col, append=index_append + ) bf_result = ( scalars_df_index[["int64_col", "int64_too"]] - ._resample(rule=rule, level=level) + .resample(rule=rule, level=level, closed=closed, origin=origin, label=label) .min() .to_pandas() ) pd_result = ( scalars_pandas_df_index[["int64_col", "int64_too"]] - .resample(rule=rule, level=level) + .resample(rule=rule, level=level, closed=closed, origin=origin, label=label) .min() ) assert_pandas_df_equal(bf_result, pd_result) @@ -6010,7 +6028,7 @@ def test__resample_with_index( ), ], ) -def test__resample_start_time(rule, origin, data): +def test_resample_start_time(rule, origin, data): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", 
minversion="2.0.0") col = "timestamp_col" @@ -6018,7 +6036,7 @@ def test__resample_start_time(rule, origin, data): scalars_pandas_df_index = pd.DataFrame(data).set_index(col) scalars_pandas_df_index.index.name = None - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 96d7881d67..4d4a144d0a 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session): pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes) +@pytest.mark.skipif( + bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + reason="Test for pandas 1.x behavior only", +) +def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session): + """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x.""" + sql = """ + SELECT + 0 AS id, + [JSON '{"a":1}', JSON '{"b":2}'] AS json_array, + STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct + """ + df = session.read_gbq(sql, index_col="id") + batches = list(df.to_pandas_batches()) + + assert batches[0].dtypes["json_array"] == "object" + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + + +@pytest.mark.skipif( + not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + reason="Test for pandas 2.x behavior only", +) +def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session): + """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x.""" + sql = """ + SELECT + 0 AS id, + [JSON '{"a":1}', JSON '{"b":2}'] AS json_array, + STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct + """ + df 
= session.read_gbq(sql, index_col="id") + batches = list(df.to_pandas_batches()) + + assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType) + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + + +@pytest.mark.skipif( + bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + reason="Test for pandas 1.x behavior only", +) +def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session): + """Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x.""" + + sql = """ + SELECT + 1 AS id, + [] AS json_array, + STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct + """ + df = session.read_gbq(sql, index_col="id") + + # The main point: this should not raise an error + batches = list(df.to_pandas_batches()) + assert sum(len(b) for b in batches) == 1 + + assert batches[0].dtypes["json_array"] == "object" + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + + +@pytest.mark.skipif( + not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + reason="Test for pandas 2.x behavior only", +) +def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session): + """Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x.""" + + sql = """ + SELECT + 1 AS id, + [] AS json_array, + STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct + """ + df = session.read_gbq(sql, index_col="id") + + # The main point: this should not raise an error + batches = list(df.to_pandas_batches()) + assert sum(len(b) for b in batches) == 1 + + assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType) + + @pytest.mark.parametrize("allow_large_results", (True, False)) def 
test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results): """Verify to_pandas_batches() APIs returns the expected page size. diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index d2cde59729..2f4ddaecff 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -13,7 +13,6 @@ # limitations under the License. from datetime import datetime -import re import typing import pandas as pd @@ -440,10 +439,7 @@ def test_merge_raises_error_when_left_right_on_set(scalars_dfs): left = scalars_df[left_columns] right = scalars_df[right_columns] - with pytest.raises( - ValueError, - match=re.escape("Can not pass both `on` and `left_on` + `right_on` params."), - ): + with pytest.raises(ValueError): bpd.merge( left, right, diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5ace3f54d8..4df257423f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4856,14 +4856,14 @@ def test_series_explode_null(data): pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), ], ) -def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): +def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ "int64_col" ] - bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 
9cfa54146a..07fdb215df 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -248,7 +248,7 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session): ), ], ) -def test__resample_with_index(unordered_session, rule, origin, data): +def test_resample_with_index(unordered_session, rule, origin, data): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" @@ -256,12 +256,16 @@ def test__resample_with_index(unordered_session, rule, origin, data): scalars_pandas_df_index = pd.DataFrame(data).set_index(col) scalars_pandas_df_index.index.name = None - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() - + bf_result = scalars_df_index.resample(rule=rule, origin=origin).min() pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() + assert isinstance(bf_result.index, bpd.DatetimeIndex) + assert isinstance(pd_result.index, pd.DatetimeIndex) pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result.to_pandas(), + pd_result, + check_index_type=False, + check_dtype=False, ) diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_pop_var/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_pop_var/out.sql new file mode 100644 index 0000000000..de422382d1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_pop_var/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + VAR_POP(`bfcol_1`) AS `bfcol_4`, + VAR_POP(CAST(`bfcol_0` AS INT64)) AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `int64_col`, + `bfcol_5` AS `bool_col` +FROM `bfcte_1` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_pop_var/window_out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_pop_var/window_out.sql new file mode 100644 index 0000000000..fa04dad64e --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_pop_var/window_out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN `bfcol_0` IS NULL THEN NULL ELSE VAR_POP(`bfcol_0`) OVER () END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql new file mode 100644 index 0000000000..9bfa6288c3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql @@ -0,0 +1,27 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `duration_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_6`, + `bfcol_0` AS `bfcol_7`, + `bfcol_2` AS `bfcol_8` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + STDDEV(`bfcol_6`) AS `bfcol_12`, + STDDEV(CAST(`bfcol_7` AS INT64)) AS `bfcol_13`, + CAST(FLOOR(STDDEV(`bfcol_8`)) AS INT64) AS `bfcol_14`, + CAST(FLOOR(STDDEV(`bfcol_6`)) AS INT64) AS `bfcol_15` + FROM `bfcte_1` +) +SELECT + `bfcol_12` AS `int64_col`, + `bfcol_13` AS `bool_col`, + `bfcol_14` AS `duration_col`, + `bfcol_15` AS `int64_col_w_floor` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/window_out.sql 
b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/window_out.sql new file mode 100644 index 0000000000..e4e4ff0684 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/window_out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN `bfcol_0` IS NULL THEN NULL ELSE STDDEV(`bfcol_0`) OVER () END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_var/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_var/out.sql new file mode 100644 index 0000000000..59ccd59e8f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_var/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + VARIANCE(`bfcol_1`) AS `bfcol_4`, + VARIANCE(CAST(`bfcol_0` AS INT64)) AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `int64_col`, + `bfcol_5` AS `bool_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_var/window_out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_var/window_out.sql new file mode 100644 index 0000000000..a65104215b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_var/window_out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN `bfcol_0` IS NULL THEN NULL ELSE VARIANCE(`bfcol_0`) OVER () END AS 
`bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index 3d7e4287ac..478368393a 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -370,6 +370,28 @@ def test_min(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql_window_partition, "window_partition_out.sql") +def test_pop_var(scalar_types_df: bpd.DataFrame, snapshot): + col_names = ["int64_col", "bool_col"] + bf_df = scalar_types_df[col_names] + + agg_ops_map = { + "int64_col": agg_ops.PopVarOp().as_expr("int64_col"), + "bool_col": agg_ops.PopVarOp().as_expr("bool_col"), + } + sql = _apply_unary_agg_ops( + bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys()) + ) + snapshot.assert_match(sql, "out.sql") + + # Window tests + col_name = "int64_col" + bf_df_int = scalar_types_df[[col_name]] + agg_expr = agg_ops.PopVarOp().as_expr(col_name) + window = window_spec.WindowSpec(ordering=(ordering.descending_over(col_name),)) + sql_window = _apply_unary_window_op(bf_df_int, agg_expr, window, "agg_int64") + snapshot.assert_match(sql_window, "window_out.sql") + + def test_quantile(scalar_types_df: bpd.DataFrame, snapshot): col_name = "int64_col" bf_df = scalar_types_df[[col_name]] @@ -428,6 +450,40 @@ def test_shift(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(noop_sql, "noop.sql") +def test_std(scalar_types_df: bpd.DataFrame, snapshot): + col_names = ["int64_col", "bool_col", "duration_col"] + bf_df = scalar_types_df[col_names] + bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") + + # The `to_timedelta` creates a new mapping for the column id. 
+ col_names.insert(0, "rowindex") + name2id = { + col_name: col_id + for col_name, col_id in zip(col_names, bf_df._block.expr.column_ids) + } + + agg_ops_map = { + "int64_col": agg_ops.StdOp().as_expr(name2id["int64_col"]), + "bool_col": agg_ops.StdOp().as_expr(name2id["bool_col"]), + "duration_col": agg_ops.StdOp().as_expr(name2id["duration_col"]), + "int64_col_w_floor": agg_ops.StdOp(should_floor_result=True).as_expr( + name2id["int64_col"] + ), + } + sql = _apply_unary_agg_ops( + bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys()) + ) + snapshot.assert_match(sql, "out.sql") + + # Window tests + col_name = "int64_col" + bf_df_int = scalar_types_df[[col_name]] + agg_expr = agg_ops.StdOp().as_expr(col_name) + window = window_spec.WindowSpec(ordering=(ordering.descending_over(col_name),)) + sql_window = _apply_unary_window_op(bf_df_int, agg_expr, window, "agg_int64") + snapshot.assert_match(sql_window, "window_out.sql") + + def test_sum(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] agg_ops_map = { @@ -468,3 +524,25 @@ def test_time_series_diff(scalar_types_df: bpd.DataFrame, snapshot): ) sql = _apply_unary_window_op(bf_df, op, window, "diff_time") snapshot.assert_match(sql, "out.sql") + + +def test_var(scalar_types_df: bpd.DataFrame, snapshot): + col_names = ["int64_col", "bool_col"] + bf_df = scalar_types_df[col_names] + + agg_ops_map = { + "int64_col": agg_ops.VarOp().as_expr("int64_col"), + "bool_col": agg_ops.VarOp().as_expr("bool_col"), + } + sql = _apply_unary_agg_ops( + bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys()) + ) + snapshot.assert_match(sql, "out.sql") + + # Window tests + col_name = "int64_col" + bf_df_int = scalar_types_df[[col_name]] + agg_expr = agg_ops.VarOp().as_expr(col_name) + window = window_spec.WindowSpec(ordering=(ordering.descending_over(col_name),)) + sql_window = _apply_unary_window_op(bf_df_int, agg_expr, window, "agg_int64") + snapshot.assert_match(sql_window, 
"window_out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_maximum_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_maximum_op/out.sql new file mode 100644 index 0000000000..c0c0f5c97f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_maximum_op/out.sql @@ -0,0 +1,14 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `float64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + GREATEST(`bfcol_0`, `bfcol_1`) AS `bfcol_2` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_coalesce/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_coalesce/out.sql new file mode 100644 index 0000000000..5b11a1ddeb --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_coalesce/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `int64_too` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_2`, + COALESCE(`bfcol_1`, `bfcol_0`) AS `bfcol_3` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `int64_col`, + `bfcol_3` AS `int64_too` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_st_difference/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_st_difference/out.sql new file mode 100644 index 0000000000..e57a15443d --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_st_difference/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `geography_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), 
`bfcte_1` AS ( + SELECT + *, + ST_DIFFERENCE(`bfcol_0`, `bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `geography_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_arctan2/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_arctan2/out.sql new file mode 100644 index 0000000000..d131828a98 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_arctan2/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `float64_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ATAN2(`bfcol_1`, `bfcol_2`) AS `bfcol_6`, + ATAN2(CAST(`bfcol_0` AS INT64), `bfcol_2`) AS `bfcol_7` + FROM `bfcte_0` +) +SELECT + `bfcol_6` AS `int64_col`, + `bfcol_7` AS `bool_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_cosine_distance/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_cosine_distance/out.sql new file mode 100644 index 0000000000..eb46a16a83 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_cosine_distance/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int_list_col` AS `bfcol_0`, + `float_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ML.DISTANCE(`bfcol_0`, `bfcol_0`, 'COSINE') AS `bfcol_2`, + ML.DISTANCE(`bfcol_1`, `bfcol_1`, 'COSINE') AS `bfcol_3` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `int_list_col`, + `bfcol_3` AS `float_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py index 
52b57623b3..20dd6c5ca6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py @@ -110,6 +110,13 @@ def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(bf_df.sql, "out.sql") +def test_maximum_op(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "float64_col"]] + sql = utils._apply_binary_op(bf_df, ops.maximum_op, "int64_col", "float64_col") + + snapshot.assert_match(sql, "out.sql") + + def test_minimum_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "float64_col"]] sql = utils._apply_binary_op(bf_df, ops.minimum_op, "int64_col", "float64_col") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index aa40c21fd9..693f8dc34c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -209,6 +209,20 @@ def test_case_when_op(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_coalesce(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "int64_too"]] + + sql = utils._apply_ops_to_sql( + bf_df, + [ + ops.coalesce_op.as_expr("int64_col", "int64_col"), + ops.coalesce_op.as_expr("int64_too", "int64_col"), + ], + ["int64_col", "int64_too"], + ) + snapshot.assert_match(sql, "out.sql") + + def test_clip(scalar_types_df: bpd.DataFrame, snapshot): op_expr = ops.clip_op.as_expr("rowindex", "int64_col", "int64_too") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_geo_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_geo_ops.py index 9b99b37fb6..847671b4b7 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_geo_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_geo_ops.py @@ -81,6 +81,14 @@ def 
test_geo_st_convexhull(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_geo_st_difference(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "geography_col" + bf_df = scalar_types_df[[col_name]] + sql = utils._apply_binary_op(bf_df, ops.geo_st_difference_op, col_name, col_name) + + snapshot.assert_match(sql, "out.sql") + + def test_geo_st_geogfromtext(scalar_types_df: bpd.DataFrame, snapshot): col_name = "string_col" bf_df = scalar_types_df[[col_name]] diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index c66fe15c16..06731bcbfa 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -55,6 +55,20 @@ def test_arcsinh(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_arctan2(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "float64_col", "bool_col"]] + + sql = utils._apply_ops_to_sql( + bf_df, + [ + ops.arctan2_op.as_expr("int64_col", "float64_col"), + ops.arctan2_op.as_expr("bool_col", "float64_col"), + ], + ["int64_col", "bool_col"], + ) + snapshot.assert_match(sql, "out.sql") + + def test_arctan(scalar_types_df: bpd.DataFrame, snapshot): col_name = "float64_col" bf_df = scalar_types_df[[col_name]] @@ -103,6 +117,21 @@ def test_cosh(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_cosine_distance(repeated_types_df: bpd.DataFrame, snapshot): + col_names = ["int_list_col", "float_list_col"] + bf_df = repeated_types_df[col_names] + + sql = utils._apply_ops_to_sql( + bf_df, + [ + ops.cosine_distance_op.as_expr("int_list_col", "int_list_col"), + ops.cosine_distance_op.as_expr("float_list_col", "float_list_col"), + ], + ["int_list_col", "float_list_col"], + ) + snapshot.assert_match(sql, "out.sql") + + def 
test_exp(scalar_types_df: bpd.DataFrame, snapshot): col_name = "float64_col" bf_df = scalar_types_df[[col_name]] diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats/out.sql new file mode 100644 index 0000000000..63076077cf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats/out.sql @@ -0,0 +1,36 @@ +WITH `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT('POINT(1 1)', 0)]) +), `bfcte_1` AS ( + SELECT + *, + ST_REGIONSTATS( + `bfcol_0`, + 'ee://some/raster/uri', + band => 'band1', + include => 'some equation', + options => JSON '{"scale": 100}' + ) AS `bfcol_2` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_2`.`min` AS `bfcol_5`, + `bfcol_2`.`max` AS `bfcol_6`, + `bfcol_2`.`sum` AS `bfcol_7`, + `bfcol_2`.`count` AS `bfcol_8`, + `bfcol_2`.`mean` AS `bfcol_9`, + `bfcol_2`.`area` AS `bfcol_10` + FROM `bfcte_1` +) +SELECT + `bfcol_5` AS `min`, + `bfcol_6` AS `max`, + `bfcol_7` AS `sum`, + `bfcol_8` AS `count`, + `bfcol_9` AS `mean`, + `bfcol_10` AS `area` +FROM `bfcte_2` +ORDER BY + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats_without_optional_args/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats_without_optional_args/out.sql new file mode 100644 index 0000000000..f794711961 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats_without_optional_args/out.sql @@ -0,0 +1,30 @@ +WITH `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT('POINT(1 1)', 0)]) +), `bfcte_1` AS ( + SELECT + *, + ST_REGIONSTATS(`bfcol_0`, 'ee://some/raster/uri') AS `bfcol_2` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_2`.`min` AS `bfcol_5`, + `bfcol_2`.`max` AS `bfcol_6`, + `bfcol_2`.`sum` AS `bfcol_7`, + 
`bfcol_2`.`count` AS `bfcol_8`, + `bfcol_2`.`mean` AS `bfcol_9`, + `bfcol_2`.`area` AS `bfcol_10` + FROM `bfcte_1` +) +SELECT + `bfcol_5` AS `min`, + `bfcol_6` AS `max`, + `bfcol_7` AS `sum`, + `bfcol_8` AS `count`, + `bfcol_9` AS `mean`, + `bfcol_10` AS `area` +FROM `bfcte_2` +ORDER BY + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_simplify/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_simplify/out.sql new file mode 100644 index 0000000000..b8dd1587a8 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_simplify/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT('POINT(1 1)', 0)]) +), `bfcte_1` AS ( + SELECT + *, + ST_SIMPLIFY(`bfcol_0`, 123.125) AS `bfcol_2` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `0` +FROM `bfcte_1` +ORDER BY + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql deleted file mode 100644 index 42b7bc7361..0000000000 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql +++ /dev/null @@ -1,19 +0,0 @@ -SELECT - * -FROM UNNEST(ARRAY>, `bfcol_2` INT64>>[( - 1, - STRUCT( - 'Alice' AS `name`, - 30 AS `age`, - STRUCT('New York' AS `city`, 'USA' AS `country`) AS `address` - ), - 0 -), ( - 2, - STRUCT( - 'Bob' AS `name`, - 25 AS `age`, - STRUCT('London' AS `city`, 'UK' AS `country`) AS `address` - ), - 1 -)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_geo.py b/tests/unit/core/compile/sqlglot/test_compile_geo.py new file mode 100644 index 0000000000..50de1488e6 --- /dev/null +++ 
b/tests/unit/core/compile/sqlglot/test_compile_geo.py @@ -0,0 +1,52 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.bigquery as bbq +import bigframes.geopandas as gpd + +pytest.importorskip("pytest_snapshot") + + +def test_st_regionstats(compiler_session, snapshot): + geos = gpd.GeoSeries(["POINT(1 1)"], session=compiler_session) + result = bbq.st_regionstats( + geos, + "ee://some/raster/uri", + band="band1", + include="some equation", + options={"scale": 100}, + ) + assert "area" in result.struct.dtypes.index + snapshot.assert_match(result.struct.explode().sql, "out.sql") + + +def test_st_regionstats_without_optional_args(compiler_session, snapshot): + geos = gpd.GeoSeries(["POINT(1 1)"], session=compiler_session) + result = bbq.st_regionstats( + geos, + "ee://some/raster/uri", + ) + assert "area" in result.struct.dtypes.index + snapshot.assert_match(result.struct.explode().sql, "out.sql") + + +def test_st_simplify(compiler_session, snapshot): + geos = gpd.GeoSeries(["POINT(1 1)"], session=compiler_session) + result = bbq.st_simplify( + geos, + tolerance_meters=123.125, + ) + snapshot.assert_match(result.to_frame().sql, "out.sql") diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 447ce37766..db7cedba8f 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -444,7 +444,7 @@ def 
test_pyformat_with_pandas_dataframe_not_dry_run_no_session_raises_valueerror def test_pyformat_with_query_string_replaces_variables(session): pyformat_args = { - "my_string": "some string value", + "my_string": "`my_table`", "max_value": 2.25, "year": 2025, "null_value": None, @@ -456,9 +456,8 @@ def test_pyformat_with_query_string_replaces_variables(session): SELECT {year} - year AS age, @myparam AS myparam, '{{my_string}}' AS escaped_string, - {my_string} AS my_string, - {null_value} AS null_value, - FROM my_dataset.my_table + * + FROM {my_string} WHERE height < {max_value} """.strip() @@ -466,9 +465,8 @@ def test_pyformat_with_query_string_replaces_variables(session): SELECT 2025 - year AS age, @myparam AS myparam, '{my_string}' AS escaped_string, - 'some string value' AS my_string, - NULL AS null_value, - FROM my_dataset.my_table + * + FROM `my_table` WHERE height < 2.25 """.strip() diff --git a/tests/unit/functions/test_function_typing.py b/tests/unit/functions/test_function_typing.py new file mode 100644 index 0000000000..46ae19555a --- /dev/null +++ b/tests/unit/functions/test_function_typing.py @@ -0,0 +1,50 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +import decimal + +import pytest + +from bigframes.functions import function_typing + + +def test_unsupported_type_error_init_with_dict(): + err = function_typing.UnsupportedTypeError( + decimal.Decimal, {int: "INT64", float: "FLOAT64"} + ) + + message = str(err) + + assert "Decimal" in message + assert "float, int" in message + + +def test_unsupported_type_error_init_with_set(): + err = function_typing.UnsupportedTypeError(decimal.Decimal, {int, float}) + + message = str(err) + + assert "Decimal" in message + assert "float, int" in message + + +def test_sdk_type_from_python_type_raises_unsupported_type_error(): + with pytest.raises(function_typing.UnsupportedTypeError) as excinfo: + function_typing.sdk_type_from_python_type(datetime.datetime) + + message = str(excinfo.value) + + assert "datetime" in message + assert "bool, bytes, float, int, str" in message diff --git a/tests/unit/pandas/io/test_api.py b/tests/unit/pandas/io/test_api.py index 14419236c9..dbdf427d91 100644 --- a/tests/unit/pandas/io/test_api.py +++ b/tests/unit/pandas/io/test_api.py @@ -108,7 +108,7 @@ def test_read_gbq_colab_calls_set_location( mock_with_default_session.return_value = mock_df query_or_table = "SELECT {param1} AS param1" - sample_pyformat_args = {"param1": "value1"} + sample_pyformat_args = {"param1": "'value1'"} result = bf_io_api._read_gbq_colab( query_or_table, pyformat_args=sample_pyformat_args, dry_run=False ) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 57ac3d88f7..41f3755f13 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -18,12 +18,15 @@ from unittest import mock import google.cloud.bigquery as bigquery +import google.cloud.bigquery.job +import google.cloud.bigquery.table import pytest import bigframes from bigframes.core import log_adapter import bigframes.core.events import bigframes.pandas as bpd +import bigframes.session._io.bigquery import 
bigframes.session._io.bigquery as io_bq from bigframes.testing import mocks @@ -32,7 +35,7 @@ def mock_bq_client(): mock_client = mock.create_autospec(bigquery.Client) mock_query_job = mock.create_autospec(bigquery.QueryJob) - mock_row_iterator = mock.create_autospec(bigquery.table.RowIterator) + mock_row_iterator = mock.create_autospec(google.cloud.bigquery.table.RowIterator) mock_query_job.result.return_value = mock_row_iterator @@ -98,14 +101,12 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): cur_labels = { "source": "bigquery-dataframes-temp", } - df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() - ) - # Test running two methods - df.head() - df.max() - df.columns - api_methods = log_adapter._api_methods + api_methods = [ + "dataframe-columns", + "dataframe-max", + "dataframe-head", + "dataframe-__init__", + ] labels = io_bq.create_job_configs_labels( job_configs_labels=cur_labels, api_methods=api_methods @@ -123,17 +124,13 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): log_adapter.get_and_reset_api_methods() - df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() - ) # Test running methods more than the labels' length limit - for i in range(100): - df.head() - api_methods = log_adapter._api_methods + api_methods = list(["dataframe-head"] * 100) - labels = io_bq.create_job_configs_labels( - job_configs_labels=None, api_methods=api_methods - ) + with bpd.option_context("compute.extra_query_labels", {}): + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) assert labels is not None assert len(labels) == log_adapter.MAX_LABELS_COUNT assert "dataframe-head" in labels.values() @@ -150,17 +147,14 @@ def test_create_job_configs_labels_length_limit_met(): value = f"test{i}" cur_labels[key] = 
value # If cur_labels length is 62, we can only add one label from api_methods - df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() - ) # Test running two methods - df.head() - df.max() - api_methods = log_adapter._api_methods + api_methods = ["dataframe-max", "dataframe-head"] + + with bpd.option_context("compute.extra_query_labels", {}): + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) - labels = io_bq.create_job_configs_labels( - job_configs_labels=cur_labels, api_methods=api_methods - ) assert labels is not None assert len(labels) == 56 assert "dataframe-max" in labels.values() @@ -184,7 +178,7 @@ def test_add_and_trim_labels_length_limit_met(): {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) - job_config = bigquery.job.QueryJobConfig() + job_config = google.cloud.bigquery.job.QueryJobConfig() job_config.labels = cur_labels df.max() @@ -221,7 +215,7 @@ def test_start_query_with_client_labels_length_limit_met( {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) - job_config = bigquery.job.QueryJobConfig() + job_config = google.cloud.bigquery.job.QueryJobConfig() job_config.labels = cur_labels df.max() diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index 52b091c045..b1dc1ec702 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -60,7 +60,7 @@ def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch, dry_ru pyformat_args = { "some_integer": 123, - "some_string": "This could be dangerous, but we escape it", + "some_string": "some_column", "bf_df": bf_df, "pd_df": pd_df, # This is not a supported type, but ignored if not referenced. 
@@ -84,7 +84,7 @@ def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch, dry_ru expected = textwrap.dedent( f""" SELECT 123 as some_integer, - 'This could be dangerous, but we escape it' as some_string, + some_column as some_string, '{{escaped}}' as escaped FROM `proj`.`dset`.`temp_{"table" if dry_run else "view"}` AS bf_df FULL OUTER JOIN `proj`.`dset`.`temp_{"table" if dry_run else "view"}` AS pd_df diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 2326f2595b..015dbd030e 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -42,6 +42,68 @@ def test_dataframe_repr_with_uninitialized_object(): assert "DataFrame" in got +@pytest.mark.parametrize( + "rule", + [ + pd.DateOffset(weeks=1), + pd.Timedelta(hours=8), + # According to + # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html + # these all default to "right" for closed and label, which isn't yet supported. + "ME", + "YE", + "QE", + "BME", + "BA", + "BQE", + "W", + ], +) +def test_dataframe_rule_not_implememented( + monkeypatch: pytest.MonkeyPatch, + rule, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="rule"): + dataframe.resample(rule=rule) + + +def test_dataframe_closed_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="Only closed='left'"): + dataframe.resample(rule="1d", closed="right") + + +def test_dataframe_label_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="Only label='left'"): + dataframe.resample(rule="1d", label="right") + + +@pytest.mark.parametrize( + "origin", + [ + "end", + "end_day", + ], +) +def test_dataframe_origin_not_implememented( + monkeypatch: pytest.MonkeyPatch, + origin, +): + dataframe = 
mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="origin"): + dataframe.resample(rule="1d", origin=origin) + + def test_dataframe_setattr_with_uninitialized_object(): """Ensures DataFrame can be subclassed without trying to set attributes as columns.""" # Avoid calling __init__ since it might be called later in a subclass. diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py index dfd1cd622f..6f23036efb 100644 --- a/tests/unit/test_local_data.py +++ b/tests/unit/test_local_data.py @@ -20,20 +20,21 @@ pd_data = pd.DataFrame( { - "ints": [10, 20, 30, 40], - "nested_ints": [[1, 2], [3, 4, 5], [], [20, 30]], - "structs": [{"a": 100}, {}, {"b": 200}, {"b": 300}], + "ints": [10, 20, 30, 40, 50], + "nested_ints": [[1, 2], [], [3, 4, 5], [], [20, 30]], + "structs": [{"a": 100}, None, {}, {"b": 200}, {"b": 300}], } ) pd_data_normalized = pd.DataFrame( { - "ints": pd.Series([10, 20, 30, 40], dtype=dtypes.INT_DTYPE), + "ints": pd.Series([10, 20, 30, 40, 50], dtype=dtypes.INT_DTYPE), "nested_ints": pd.Series( - [[1, 2], [3, 4, 5], [], [20, 30]], dtype=pd.ArrowDtype(pa.list_(pa.int64())) + [[1, 2], [], [3, 4, 5], [], [20, 30]], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), ), "structs": pd.Series( - [{"a": 100}, {}, {"b": 200}, {"b": 300}], + [{"a": 100}, None, {}, {"b": 200}, {"b": 300}], dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.int64()})), ), } @@ -122,11 +123,11 @@ def test_local_data_well_formed_round_trip_chunked(): def test_local_data_well_formed_round_trip_sliced(): pa_table = pa.Table.from_pandas(pd_data, preserve_index=False) - as_rechunked_pyarrow = pa.Table.from_batches(pa_table.slice(2, 4).to_batches()) + as_rechunked_pyarrow = pa.Table.from_batches(pa_table.slice(0, 4).to_batches()) local_entry = local_data.ManagedArrowTable.from_pyarrow(as_rechunked_pyarrow) result = pd.DataFrame(local_entry.itertuples(), columns=pd_data.columns) pandas.testing.assert_frame_equal( - 
pd_data_normalized[2:4].reset_index(drop=True), + pd_data_normalized[0:4].reset_index(drop=True), result.reset_index(drop=True), check_dtype=False, ) @@ -143,3 +144,25 @@ def test_local_data_not_equal_other(): local_entry2 = local_data.ManagedArrowTable.from_pandas(pd_data[::2]) assert local_entry != local_entry2 assert hash(local_entry) != hash(local_entry2) + + +def test_local_data_itertuples_struct_none(): + pd_data = pd.DataFrame( + { + "structs": [{"a": 100}, None, {"b": 200}, {"b": 300}], + } + ) + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + result = list(local_entry.itertuples()) + assert result[1][0] is None + + +def test_local_data_itertuples_list_none(): + pd_data = pd.DataFrame( + { + "lists": [[1, 2], None, [3, 4]], + } + ) + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + result = list(local_entry.itertuples()) + assert result[1][0] == [] diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 55bc048bcd..6f729b0df0 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -5006,14 +5006,14 @@ def test_series_explode_null(data): pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), ], ) -def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): +def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ "int64_col" ] - bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() pd.testing.assert_series_equal(bf_result, pd_result) diff --git 
a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index cbc51e59d6..c01d87fb28 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -811,7 +811,7 @@ def visit_DefaultLiteral(self, op, *, value, dtype): elif dtype.is_uuid(): return self.cast(str(value), dtype) elif dtype.is_json(): - return sge.ParseJSON(this=sge.convert(str(value))) + return sge.JSON(this=sge.convert(str(value))) elif dtype.is_geospatial(): wkt = value if isinstance(value, str) else value.wkt return self.f.st_geogfromtext(wkt) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index cf205b69d6..95d28991a9 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -261,6 +261,16 @@ def visit_BoundingBox(self, op, *, arg): visit_GeoXMax = visit_GeoXMin = visit_GeoYMax = visit_GeoYMin = visit_BoundingBox + def visit_GeoRegionStats(self, op, *, arg, raster_id, band, include, options): + args = [arg, raster_id] + if op.band: + args.append(sge.Kwarg(this="band", expression=band)) + if op.include: + args.append(sge.Kwarg(this="include", expression=include)) + if op.options: + args.append(sge.Kwarg(this="options", expression=options)) + return sge.func("ST_REGIONSTATS", *args) + def visit_GeoSimplify(self, op, *, arg, tolerance, preserve_collapsed): if ( not isinstance(op.preserve_collapsed, ops.Literal) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py b/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py index 0be832af78..efe038599a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py +++ 
b/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py @@ -343,6 +343,28 @@ class GeoNRings(GeoSpatialUnOp): dtype = dt.int64 +@public +class GeoRegionStats(GeoSpatialUnOp): + """Returns results of ST_REGIONSTATS.""" + + raster_id: Value[dt.String] + band: Value[dt.String] + include: Value[dt.String] + options: Value[dt.JSON] + + dtype = dt.Struct( + fields={ + "count": dt.int64, + "min": dt.float64, + "max": dt.float64, + "stdDev": dt.float64, + "sum": dt.float64, + "mean": dt.float64, + "area": dt.float64, + } + ) + + @public class GeoSRID(GeoSpatialUnOp): """Returns the spatial reference identifier for the ST_Geometry.""" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b434b51fb3..3381f53351 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,6 +11,7 @@ """ from __future__ import annotations +import datetime from typing import Hashable, Iterable, Literal, Optional, Sequence, Union from bigframes_vendored import constants @@ -419,7 +420,7 @@ def to_gbq( >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> destination = df.to_gbq(ordering_id="ordering_id") >>> # The table created can be read outside of the current session. - >>> bpd.close_session() # Optional, to demonstrate a new session. + >>> bpd.close_session() # Optional, to demonstrate a new session. # doctest: +SKIP >>> bpd.read_gbq(destination, index_col="ordering_id") col1 col2 ordering_id @@ -4592,6 +4593,8 @@ def merge( *, left_on: Optional[str] = None, right_on: Optional[str] = None, + left_index: bool = False, + right_index: bool = False, sort: bool = False, suffixes: tuple[str, str] = ("_x", "_y"), ) -> DataFrame: @@ -4704,6 +4707,10 @@ def merge( right_on (label or list of labels): Columns to join on in the right DataFrame. Either on or left_on + right_on must be passed in. 
+ left_index (bool, default False): + Use the index from the left DataFrame as the join key. + right_index (bool, default False): + Use the index from the right DataFrame as the join key. sort: Default False. Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends @@ -4734,6 +4741,86 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def resample( + self, + rule: str, + *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, + on=None, + level=None, + origin: Union[ + Union[pd.Timestamp, datetime.datetime, np.datetime64, int, float, str], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ): + """Resample time-series data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... "int64_too": range(10, 40), + ... } + + Resample on a DataFrame with index: + + >>> df = bpd.DataFrame(data).set_index("timestamp_col") + >>> df.resample(rule="7s").min() + int64_col int64_too + 2021-01-01 12:59:55 0 10 + 2021-01-01 13:00:02 2 12 + 2021-01-01 13:00:09 9 19 + 2021-01-01 13:00:16 16 26 + 2021-01-01 13:00:23 23 33 + + [5 rows x 2 columns] + + Resample with column and origin set to 'start': + + >>> df = bpd.DataFrame(data) + >>> df.resample(rule="7s", on = "timestamp_col", origin="start").min() + int64_col int64_too + 2021-01-01 13:00:00 0 10 + 2021-01-01 13:00:07 7 17 + 2021-01-01 13:00:14 14 24 + 2021-01-01 13:00:21 21 31 + 2021-01-01 13:00:28 28 38 + + [5 rows x 2 columns] + + Args: + rule (str): + The offset string representing target conversion. + Offsets 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', and 'W' are *not* + supported. + closed (Literal['left'] | None): + Which side of bin interval is closed. 
The default is 'left' for + all supported frequency offsets. + label (Literal['right'] | Literal['left'] | None): + Which bin edge label to label bucket with. The default is 'left' + for all supported frequency offsets. + on (str, default None): + For a DataFrame, column to use instead of index for resampling. Column + must be datetime-like. + level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin (str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Origin values 'end' and 'end_day' are *not* supported. + Returns: + DataFrameGroupBy: DataFrameGroupBy object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def round(self, decimals): """ Round a DataFrame to a variable number of decimal places. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index 66fb2c2160..49ff409c9a 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -13,6 +13,8 @@ def merge( *, left_on=None, right_on=None, + left_index: bool = False, + right_index: bool = False, sort=False, suffixes=("_x", "_y"), ): @@ -61,6 +63,10 @@ def merge( right_on (label or list of labels): Columns to join on in the right DataFrame. Either on or left_on + right_on must be passed in. + left_index (bool, default False): + Use the index from the left DataFrame as the join key. + right_index (bool, default False): + Use the index from the right DataFrame as the join key. sort: Default False. Sort the join keys lexicographically in the result DataFrame.
If False, the order of the join keys depends diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8de1c10f93..2c0f493d81 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,6 +3,7 @@ """ from __future__ import annotations +import datetime from typing import ( Hashable, IO, @@ -19,6 +20,7 @@ from bigframes_vendored.pandas.core.generic import NDFrame import numpy import numpy as np +import pandas as pd from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer from pandas.api import extensions as pd_ext @@ -2502,6 +2504,68 @@ def replace( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def resample( + self, + rule: str, + *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, + level=None, + origin: Union[ + Union[pd.Timestamp, datetime.datetime, numpy.datetime64, int, float, str], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ): + """Resample time-series data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... } + >>> s = bpd.DataFrame(data).set_index("timestamp_col") + >>> s.resample(rule="7s", origin="epoch").min() + int64_col + 2021-01-01 12:59:56 0 + 2021-01-01 13:00:03 3 + 2021-01-01 13:00:10 10 + 2021-01-01 13:00:17 17 + 2021-01-01 13:00:24 24 + + [5 rows x 1 columns] + + Args: + rule (str): + The offset string representing target conversion. + Offsets 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', and 'W' are *not* + supported. + closed (Literal['left'] | None): + Which side of bin interval is closed. The default is 'left' for + all supported frequency offsets. 
+ label (Literal['right'] | Literal['left'] | None): + Which bin edge label to label bucket with. The default is 'left' + for all supported frequency offsets. + on (str, default None): + For a DataFrame, column to use instead of index for resampling. Column + must be datetime-like. + level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin (str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Origin values 'end' and 'end_day' are *not* supported. + Returns: + SeriesGroupBy: SeriesGroupBy object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: """ Return a new Series with missing values removed. diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index cf7562a306..a129daf092 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.28.0" +__version__ = "2.29.0" # {x-release-please-start-date} -__release_date__ = "2025-11-03" +__release_date__ = "2025-11-10" # {x-release-please-end}