From eefffbc462fdd00674d0b414bdf02c9ba4d80072 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 03:24:25 +0000 Subject: [PATCH 01/14] chore: support addition between a timestamp and a timedelta --- bigframes/core/compile/ibis_types.py | 9 +- bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/core/rewrite/operators.py | 24 ++++ bigframes/core/utils.py | 10 +- bigframes/dtypes.py | 6 +- bigframes/operations/__init__.py | 5 +- bigframes/operations/numeric_ops.py | 17 ++- bigframes/operations/timedelta_ops.py | 27 ++++- tests/data/scalars.jsonl | 18 +-- tests/data/scalars_schema.json | 5 + tests/system/conftest.py | 4 +- .../small/operations/test_timedeltas.py | 114 ++++++++++++++++++ tests/system/utils.py | 10 ++ 13 files changed, 233 insertions(+), 21 deletions(-) create mode 100644 tests/system/small/operations/test_timedeltas.py diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 78c2259cf0..4d4c1775ef 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import datetime import typing from typing import cast, Dict, Iterable, Optional, Tuple, Union @@ -30,6 +31,7 @@ import pandas as pd import pyarrow as pa +from bigframes.core import utils import bigframes.dtypes # Type hints for Ibis data types supported by BigQuery DataFrame @@ -402,7 +404,12 @@ def literal_to_ibis_scalar( return bigframes_vendored.ibis.null() scalar_expr = bigframes_vendored.ibis.literal(literal) - if ibis_dtype: + if isinstance(literal, datetime.timedelta): + # In BigQuery, a timedelta is represented as an integer value in microseconds + scalar_expr = bigframes_vendored.ibis.literal( + utils.timedelta_to_micros(literal), ibis_dtype + ) + elif ibis_dtype: scalar_expr = bigframes_vendored.ibis.literal(literal, ibis_dtype) elif scalar_expr.type().is_floating(): scalar_expr = bigframes_vendored.ibis.literal(literal, ibis_dtypes.float64) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 4739cc9a99..2fdda77e47 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -742,6 +742,11 @@ def timestamp_diff_op_impl(x: ibis_types.TimestampValue, y: ibis_types.Timestamp return x.delta(y, "microsecond") +@scalar_op_compiler.register_binary_op(ops.timestamp_add_op) +def timestamp_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): + return x + y.to_interval("us") + + @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] diff --git a/bigframes/core/rewrite/operators.py b/bigframes/core/rewrite/operators.py index 3145a9e9ae..1570df148e 100644 --- a/bigframes/core/rewrite/operators.py +++ b/bigframes/core/rewrite/operators.py @@ -67,6 +67,9 @@ def _rewrite_op_expr( if isinstance(expr.op, ops.SubOp): return _rewrite_sub_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.AddOp): + return _rewrite_add_op(inputs[0], inputs[1]) + input_types = tuple(map(lambda x: x.dtype, inputs)) return _TypedExpr(expr, expr.op.output_type(*input_types)) @@ -80,3 +83,24 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result_op.as_expr(left.expr, right.expr), result_op.output_type(left.dtype, right.dtype), ) + + +def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr( + ops.timestamp_add_op.as_expr(left.expr, right.expr), + ops.timestamp_add_op.output_type(left.dtype, right.dtype), + ) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right.dtype): + # Re-arrange operands such that timestamp is always on the left and timedelta is + # always on the right. + return _TypedExpr( + ops.timestamp_add_op.as_expr(right.expr, left.expr), + ops.timestamp_add_op.output_type(right.dtype, left.dtype), + ) + + return _TypedExpr( + ops.add_op.as_expr(left.expr, right.expr), + ops.add_op.output_type(left.dtype, right.dtype), + ) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 7cb2ec7535..9741c8d873 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import datetime import functools import re import typing @@ -187,9 +188,12 @@ def wrapper(*args, **kwargs): return decorator -def timedelta_to_micros(td: pd.Timedelta) -> int: - # td.value returns total nanoseconds. - return td.value // 1000 +def timedelta_to_micros(td: typing.Union[pd.Timedelta, datetime.timedelta]) -> int: + if isinstance(td, pd.Timedelta): + # td.value returns total nanoseconds. + return td.value // 1000 + + return ((td.days * 3600 * 24) + td.seconds) * 1_000_000 + td.microseconds def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d5be2ca584..d1a9b39469 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -105,6 +105,8 @@ pd.Timestamp, datetime.date, datetime.time, + pd.Timedelta, + datetime.timedelta, ] LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE) @@ -420,7 +422,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: return pd.ArrowDtype(arrow_dtype) if pa.types.is_duration(arrow_dtype): - return pd.ArrowDtype(arrow_dtype) + return TIMEDELTA_DTYPE # BigFrames doesn't distinguish between string and large_string because the # largest string (2 GB) is already larger than the largest BigQuery row. @@ -578,6 +580,8 @@ def _infer_dtype_from_python_type(type: type) -> Dtype: return DATE_DTYPE if issubclass(type, datetime.time): return TIME_DTYPE + if issubclass(type, datetime.timedelta): + return TIMEDELTA_DTYPE else: raise TypeError( f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index ba8f3f64d7..b621f483eb 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -98,6 +98,7 @@ from bigframes.operations.numeric_ops import ( abs_op, add_op, + AddOp, arccos_op, arccosh_op, arcsin_op, @@ -172,7 +173,7 @@ ) from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op -from bigframes.operations.timedelta_ops import ToTimedeltaOp +from bigframes.operations.timedelta_ops import timestamp_add_op, ToTimedeltaOp __all__ = [ # Base ops @@ -244,6 +245,7 @@ "second_op", "normalize_op", # Timedelta ops + "timestamp_add_op", "ToTimedeltaOp", # Datetime ops "date_op", @@ -258,6 +260,7 @@ # Numeric ops "abs_op", "add_op", + "AddOp", "arccos_op", "arccosh_op", "arcsin_op", diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 413d8d66e1..2dd5ef4623 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -113,15 +113,24 @@ class AddOp(base_ops.BinaryOp): def output_type(self, *input_types): left_type = input_types[0] right_type = input_types[1] + + if left_type is None or right_type is None: + return None + if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: # String addition return input_types[0] - if (left_type is None or dtypes.is_numeric(left_type)) and ( - right_type is None or dtypes.is_numeric(right_type) - ): + + if dtypes.is_numeric(left_type) and dtypes.is_numeric(right_type): # Numeric addition return dtypes.coerce_to_common(left_type, right_type) - # TODO: Add temporal addition once delta types supported + + # Timestamp addition. + if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + return left_type + if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): + return right_type + raise TypeError(f"Cannot add dtypes {left_type} and {right_type}") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index e212381557..e15870b592 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -25,7 +25,32 @@ class ToTimedeltaOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "to_timedelta" unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType): if input_types[0] is not dtypes.INT_DTYPE: raise TypeError("expected integer input") return dtypes.TIMEDELTA_DTYPE + + +@dataclasses.dataclass(frozen=True) +class TimestampAdd(base_ops.BinaryOp): + name: typing.ClassVar[str] = "timestamp_add" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # timestamp + timedelta => timestamp + if ( + dtypes.is_datetime_like(input_types[0]) + and input_types[1] is dtypes.TIMEDELTA_DTYPE + ): + return input_types[0] + # timedelta + timestamp => timestamp + if input_types[0] is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like( + input_types[1] + ): + return input_types[1] + + raise TypeError( + f"unsupported types for timestamp_add. left: {input_types[0]} right: {input_types[1]}" + ) + + +timestamp_add_op = TimestampAdd() diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 172a55ec11..02379b8b64 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -1,9 +1,9 @@ -{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} -{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} -{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} -{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} +{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z", "timedelta_col": 1} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z", "timedelta_col": 2} +{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z", "timedelta_col": 3} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null, "timedelta_col": 5} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null, "timedelta_col": 5000} +{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z", "timedelta_col": 6} +{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z", "timedelta_col": -2} +{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z", "timedelta_col": 0} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null, "timedelta_col": -8000} diff --git a/tests/data/scalars_schema.json b/tests/data/scalars_schema.json index 1f5d8cdb65..b348c15d2d 100644 --- a/tests/data/scalars_schema.json +++ b/tests/data/scalars_schema.json @@ -71,5 +71,10 @@ "mode": "NULLABLE", "name": "timestamp_col", "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "timedelta_col", + "type": "INTEGER" } ] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 29234bc4ef..10b03775ea 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -536,7 +536,9 @@ def scalars_df_index( scalars_table_id: str, session: bigframes.Session ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" - return session.read_gbq(scalars_table_id, index_col="rowindex") + df = session.read_gbq(scalars_table_id, index_col="rowindex") + df["timedelta_col"] = bpd.Series, bpd.to_timedelta(df["timedelta_col"], unit="us") # type: ignore + return df @pytest.fixture(scope="session") diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py new file mode 100644 index 0000000000..07aa07ac62 --- /dev/null +++ b/tests/system/small/operations/test_timedeltas.py @@ -0,0 +1,114 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy +import pandas as pd +import pandas.testing +import pytest + + +@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) +def test_timestamp_add__ts_series_plus_td_series(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + + actual_result = bf_df[column] + bf_df["timedelta_col"] + + expected_result = pd_df[column] + pd_df["timedelta_col"] + pandas.testing.assert_series_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) + + +def test_timestamp_add__ts_series_plus_td_literal(scalars_dfs): + bf_df, pd_df = scalars_dfs + timedelta = pd.Timedelta(1, unit="s") + + actual_result = bf_df["datetime_col"] + timedelta + + expected_result = (pd_df["datetime_col"] + timedelta).astype( + "timestamp[us][pyarrow]" + ) + pandas.testing.assert_series_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) + + +@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) +def test_timestamp_add__td_series_plus_ts_series(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + + actual_result = bf_df["timedelta_col"] + bf_df[column] + + expected_result = pd_df["timedelta_col"] + pd_df[column] + pandas.testing.assert_series_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) + + +def test_timestamp_add__td_literal_plus_ts_series(scalars_dfs): + bf_df, pd_df = scalars_dfs + timedelta = pd.Timedelta(1, unit="s") + + actual_result = timedelta + bf_df["datetime_col"] + + expected_result = (timedelta + pd_df["datetime_col"]).astype( + "timestamp[us][pyarrow]" + ) + pandas.testing.assert_series_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) + + +def test_timestamp_add__ts_literal_plus_td_series(scalars_dfs): + bf_df, pd_df = scalars_dfs + timestamp = pd.Timestamp("2025-01-01", tz="UTC") + + actual_result = timestamp + bf_df["timedelta_col"] + + expected_result = timestamp + pd_df["timedelta_col"] + pandas.testing.assert_series_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) + + +@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) +def test_timestamp_add_with_numpy_op(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + + actual_result = numpy.add(bf_df[column], bf_df["timedelta_col"]) + + expected_result = numpy.add(pd_df[column], pd_df["timedelta_col"]) + pandas.testing.assert_series_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) + + +def test_timestamp_add_dataframes(scalars_dfs): + columns = ["datetime_col", "timestamp_col"] + timedelta = pd.Timedelta(1, unit="s") + bf_df, pd_df = scalars_dfs + + actual_result = bf_df[columns] + timedelta + + expected_result = pd_df[columns] + timedelta + expected_result["datetime_col"] = expected_result["datetime_col"].astype( + "timestamp[us][pyarrow]" + ) + expected_result["timestamp_col"] = expected_result["timestamp_col"].astype( + "timestamp[us, tz=UTC][pyarrow]" + ) + pandas.testing.assert_frame_equal( + actual_result.to_pandas(), expected_result, check_index_type=False + ) diff --git a/tests/system/utils.py b/tests/system/utils.py index 0772468085..1a0d72630c 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -188,6 +188,16 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): "timestamp_col" ] + if not isinstance(df["timedelta_col"].dtype, pd.ArrowDtype): + df["timedelta_col"] = pd.to_timedelta(df["timedelta_col"], unit="us") + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["timedelta_col"]), + schema=pa.schema([("timedelta_col", pa.duration("us"))]), + ) + df["timedelta_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "timedelta_col" + ] + # Convert geography types columns. if "geography_col" in df.columns: df["geography_col"] = df["geography_col"].astype( From 1f3a6a52824afee8278df4b0f2bef96e3fe81b0b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 03:26:59 +0000 Subject: [PATCH 02/14] test_timestamp_dff --- tests/system/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 10b03775ea..372e318a96 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -537,7 +537,7 @@ def scalars_df_index( ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" df = session.read_gbq(scalars_table_id, index_col="rowindex") - df["timedelta_col"] = bpd.Series, bpd.to_timedelta(df["timedelta_col"], unit="us") # type: ignore + df["timedelta_col"] = bpd.Series, bpd.to_timedelta(df["timedelta_col"], unit="us") # type: ignore return df From d089a357f872c663f39dd7b43a54b425454ca7fb Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 17:07:40 +0000 Subject: [PATCH 03/14] fix conftest.py --- tests/system/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 372e318a96..6b553773f4 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -537,7 +537,7 @@ def scalars_df_index( ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" df = session.read_gbq(scalars_table_id, index_col="rowindex") - df["timedelta_col"] = bpd.Series, bpd.to_timedelta(df["timedelta_col"], unit="us") # type: ignore + df["timedelta_col"] = bpd.to_timedelta(df["timedelta_col"], unit="us") # type: ignore return df From 495ccb2e9281d071a1486aff3fa0375da85042d9 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 00:12:02 +0000 Subject: [PATCH 04/14] support numpy and pyarrow timedelta literals --- bigframes/core/compile/ibis_types.py | 22 +++++++++++---- bigframes/core/utils.py | 27 +++++++++++++++---- bigframes/dtypes.py | 8 ++++-- .../small/operations/test_timedeltas.py | 27 ++++++++++++------- tests/unit/core/test_bf_utils.py | 21 +++++++++++++++ 5 files changed, 84 insertions(+), 21 deletions(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 4d4c1775ef..e83c978754 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -28,6 +28,7 @@ import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery +import numpy import pandas as pd import pyarrow as pa @@ -403,13 +404,24 @@ def literal_to_ibis_scalar( else: return bigframes_vendored.ibis.null() - scalar_expr = bigframes_vendored.ibis.literal(literal) - if isinstance(literal, datetime.timedelta): - # In BigQuery, a timedelta is represented as an integer value in microseconds - scalar_expr = bigframes_vendored.ibis.literal( + if isinstance( + literal, + (datetime.timedelta, pd.Timedelta, numpy.timedelta64, pa.DurationScalar), + ): + # numpy and pyarrow timedeltas are not compatible with Ibis, so we process them separately. + return bigframes_vendored.ibis.literal( utils.timedelta_to_micros(literal), ibis_dtype ) - elif ibis_dtype: + + return _to_ibis_literal(literal, ibis_dtype, validate) + + +def _to_ibis_literal( + literal: typing.Any, ibis_dtype: ibis_dtypes.DataType, validate: bool +): + scalar_expr = bigframes_vendored.ibis.literal(literal) + + if ibis_dtype: scalar_expr = bigframes_vendored.ibis.literal(literal, ibis_dtype) elif scalar_expr.type().is_floating(): scalar_expr = bigframes_vendored.ibis.literal(literal, ibis_dtypes.float64) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 9741c8d873..362e963be3 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -19,8 +19,10 @@ import warnings import bigframes_vendored.pandas.io.common as vendored_pandas_io_common +import numpy as np import pandas as pd import pandas.api.types as pdtypes +import pyarrow as pa import typing_extensions import bigframes.exceptions as bfe @@ -188,12 +190,27 @@ def wrapper(*args, **kwargs): return decorator -def timedelta_to_micros(td: typing.Union[pd.Timedelta, datetime.timedelta]) -> int: - if isinstance(td, pd.Timedelta): - # td.value returns total nanoseconds. - return td.value // 1000 +def timedelta_to_micros( + timedelta: typing.Union[ + pd.Timedelta, datetime.timedelta, np.timedelta64, pa.DurationScalar + ] +) -> int: + if isinstance(timedelta, pd.Timedelta): + # pd.Timedelta.value returns total nanoseconds. + return timedelta.value // 1000 + + if isinstance(timedelta, np.timedelta64): + return timedelta.astype("timedelta64[us]").astype(np.int64) + + if isinstance(timedelta, pa.DurationScalar): + return timedelta_to_micros(timedelta.as_py()) + + if isinstance(timedelta, datetime.timedelta): + return ( + (timedelta.days * 3600 * 24) + timedelta.seconds + ) * 1_000_000 + timedelta.microseconds - return ((td.days * 3600 * 24) + td.seconds) * 1_000_000 + td.microseconds + raise TypeError(f"Unrecognized input type: {type(timedelta)}") def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d1a9b39469..5c107d0168 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -107,6 +107,8 @@ datetime.time, pd.Timedelta, datetime.timedelta, + np.timedelta64, + pa.DurationScalar, ] LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE) @@ -564,6 +566,10 @@ def _is_bigframes_dtype(dtype) -> bool: def _infer_dtype_from_python_type(type: type) -> Dtype: + if type in (datetime.timedelta, pd.Timedelta, np.timedelta64, pa.DurationScalar): + # Must check timedelta type first. Otherwise other branchs will be evaluated to true + # E.g. np.timedelta64 is a sublcass as np.integer + return TIMEDELTA_DTYPE if issubclass(type, (bool, np.bool_)): return BOOL_DTYPE if issubclass(type, (int, np.integer)): @@ -580,8 +586,6 @@ def _infer_dtype_from_python_type(type: type) -> Dtype: return DATE_DTYPE if issubclass(type, datetime.time): return TIME_DTYPE - if issubclass(type, datetime.timedelta): - return TIMEDELTA_DTYPE else: raise TypeError( f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}" diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 07aa07ac62..89ab580ee4 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -13,9 +13,12 @@ # limitations under the License. -import numpy +import datetime + +import numpy as np import pandas as pd import pandas.testing +import pyarrow as pa import pytest @@ -31,15 +34,21 @@ def test_timestamp_add__ts_series_plus_td_series(scalars_dfs, column): ) -def test_timestamp_add__ts_series_plus_td_literal(scalars_dfs): +@pytest.mark.parametrize( + "literal", + [ + pytest.param(pd.Timedelta(1, unit="s"), id="pandas"), + pytest.param(datetime.timedelta(seconds=1), id="python-datetime"), + pytest.param(np.timedelta64(1, "s"), id="numpy"), + pytest.param(pa.scalar(1, type=pa.duration("s")), id="pyarrow"), + ], +) +def test_timestamp_add__ts_series_plus_td_literal(scalars_dfs, literal): bf_df, pd_df = scalars_dfs - timedelta = pd.Timedelta(1, unit="s") - actual_result = bf_df["datetime_col"] + timedelta + actual_result = bf_df["datetime_col"] + literal - expected_result = (pd_df["datetime_col"] + timedelta).astype( - "timestamp[us][pyarrow]" - ) + expected_result = (pd_df["datetime_col"] + literal).astype("timestamp[us][pyarrow]") pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) @@ -87,9 +96,9 @@ def test_timestamp_add__ts_literal_plus_td_series(scalars_dfs): def test_timestamp_add_with_numpy_op(scalars_dfs, column): bf_df, pd_df = scalars_dfs - actual_result = numpy.add(bf_df[column], bf_df["timedelta_col"]) + actual_result = np.add(bf_df[column], bf_df["timedelta_col"]) - expected_result = numpy.add(pd_df[column], pd_df["timedelta_col"]) + expected_result = np.add(pd_df[column], pd_df["timedelta_col"]) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 248b6796e2..72f00cb91c 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + from bigframes.core import utils @@ -54,3 +61,17 @@ def test_get_standardized_ids_tuple(): col_ids, _ = utils.get_standardized_ids(col_labels) assert col_ids == ["('foo', 1)", "('foo', 2)", "('bar', 1)"] + + + +@pytest.mark.parametrize( + "input", + [ + datetime.timedelta(days=2, hours=3, seconds=4, milliseconds=5, microseconds=6), + pd.Timedelta("2d3h4s5ms6us"), + np.timedelta64(pd.Timedelta("2d3h4s5ms6us")), + pa.scalar(pd.Timedelta("2d3h4s5ms6us")), + ], +) +def test_timedelta_to_micros(input): + assert utils.timedelta_to_micros(input) == 183604005006 From e83d936d38f0e67fc32b7fb982c6ed8ad22b25cd Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Fri, 7 Feb 2025 00:14:18 +0000 Subject: [PATCH 05/14] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/unit/core/test_bf_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 72f00cb91c..e9f8cd4112 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -63,7 +63,6 @@ def test_get_standardized_ids_tuple(): assert col_ids == ["('foo', 1)", "('foo', 2)", "('bar', 1)"] - @pytest.mark.parametrize( "input", [ From fd38454a106d0c356b9e5ad08b49f1c1f0a47940 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 00:16:50 +0000 Subject: [PATCH 06/14] fix format --- bigframes/operations/timedelta_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 8bb4707300..69e054fa5c 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -25,7 +25,6 @@ class ToTimedeltaOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "to_timedelta" unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): return dtypes.TIMEDELTA_DTYPE From c680948aaf19861f94f827f779aead3ce2f70f23 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 04:22:47 +0000 Subject: [PATCH 07/14] use local fixture for testing --- tests/data/scalars.jsonl | 18 ++-- tests/data/scalars_schema.json | 5 - tests/system/conftest.py | 4 +- .../small/operations/test_timedeltas.py | 101 +++++++++++++----- tests/system/utils.py | 10 -- 5 files changed, 83 insertions(+), 55 deletions(-) diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 02379b8b64..03755c94b7 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -1,9 +1,9 @@ -{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z", "timedelta_col": 1} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z", "timedelta_col": 2} -{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z", "timedelta_col": 3} -{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null, "timedelta_col": 5} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null, "timedelta_col": 5000} -{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z", "timedelta_col": 6} -{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z", "timedelta_col": -2} -{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z", "timedelta_col": 0} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null, "timedelta_col": -8000} +{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} +{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} +{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file diff --git a/tests/data/scalars_schema.json b/tests/data/scalars_schema.json index b348c15d2d..1f5d8cdb65 100644 --- a/tests/data/scalars_schema.json +++ b/tests/data/scalars_schema.json @@ -71,10 +71,5 @@ "mode": "NULLABLE", "name": "timestamp_col", "type": "TIMESTAMP" - }, - { - "mode": "NULLABLE", - "name": "timedelta_col", - "type": "INTEGER" } ] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 6b553773f4..29234bc4ef 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -536,9 +536,7 @@ def scalars_df_index( scalars_table_id: str, session: bigframes.Session ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" - df = session.read_gbq(scalars_table_id, index_col="rowindex") - df["timedelta_col"] = bpd.to_timedelta(df["timedelta_col"], unit="us") # type: ignore - return df + return session.read_gbq(scalars_table_id, index_col="rowindex") @pytest.fixture(scope="session") diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 89ab580ee4..39d2db180a 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -21,14 +21,43 @@ import pyarrow as pa import pytest +from bigframes import dtypes + + +@pytest.fixture(scope="module") +def temporal_dfs(session): + pandas_df = pd.DataFrame( + { + "datetime_col": [ + pd.Timestamp("2025-02-01 01:00:01"), + pd.Timestamp("2019-01-02 02:00:00"), + ], + "timestamp_col": [ + pd.Timestamp("2023-01-01 01:00:01", tz="UTC"), + pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), + ], + "timedelta_col": [pd.Timedelta(3, "s"), pd.Timedelta(-4, "d")], + } + ) + + bigframes_df = session.read_pandas(pandas_df) + + return bigframes_df, pandas_df -@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) -def test_timestamp_add__ts_series_plus_td_series(scalars_dfs, column): - bf_df, pd_df = scalars_dfs + +@pytest.mark.parametrize( + ("column", "bf_dtype"), + [ + ("datetime_col", dtypes.DATETIME_DTYPE), + ("timestamp_col", dtypes.TIMESTAMP_DTYPE), + ], +) +def test_timestamp_add__ts_series_plus_td_series(temporal_dfs, column, bf_dtype): + bf_df, pd_df = temporal_dfs actual_result = bf_df[column] + bf_df["timedelta_col"] - expected_result = pd_df[column] + pd_df["timedelta_col"] + expected_result = (pd_df[column] + pd_df["timedelta_col"]).astype(bf_dtype) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) @@ -43,80 +72,96 @@ def test_timestamp_add__ts_series_plus_td_series(scalars_dfs, column): pytest.param(pa.scalar(1, type=pa.duration("s")), id="pyarrow"), ], ) -def test_timestamp_add__ts_series_plus_td_literal(scalars_dfs, literal): - bf_df, pd_df = scalars_dfs +def test_timestamp_add__ts_series_plus_td_literal(temporal_dfs, literal): + bf_df, pd_df = temporal_dfs - actual_result = bf_df["datetime_col"] + literal + actual_result = bf_df["timestamp_col"] + literal - expected_result = (pd_df["datetime_col"] + literal).astype("timestamp[us][pyarrow]") + # We don't use the literal value here + # because pandas does not support: series + pyarrow.DurationScalar + expected_result = (pd_df["timestamp_col"] + pd.Timedelta("1s")).astype( + dtypes.TIMESTAMP_DTYPE + ) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) -@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) -def test_timestamp_add__td_series_plus_ts_series(scalars_dfs, column): - bf_df, pd_df = scalars_dfs +@pytest.mark.parametrize( + ("column", "bf_dtype"), + [ + ("datetime_col", dtypes.DATETIME_DTYPE), + ("timestamp_col", dtypes.TIMESTAMP_DTYPE), + ], +) +def test_timestamp_add__td_series_plus_ts_series(temporal_dfs, column, bf_dtype): + bf_df, pd_df = temporal_dfs actual_result = bf_df["timedelta_col"] + bf_df[column] - expected_result = pd_df["timedelta_col"] + pd_df[column] + expected_result = (pd_df["timedelta_col"] + pd_df[column]).astype(bf_dtype) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) -def test_timestamp_add__td_literal_plus_ts_series(scalars_dfs): - bf_df, pd_df = scalars_dfs +def test_timestamp_add__td_literal_plus_ts_series(temporal_dfs): + bf_df, pd_df = temporal_dfs timedelta = pd.Timedelta(1, unit="s") actual_result = timedelta + bf_df["datetime_col"] - expected_result = (timedelta + pd_df["datetime_col"]).astype( - "timestamp[us][pyarrow]" - ) + expected_result = (timedelta + pd_df["datetime_col"]).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) -def test_timestamp_add__ts_literal_plus_td_series(scalars_dfs): - bf_df, pd_df = scalars_dfs +def test_timestamp_add__ts_literal_plus_td_series(temporal_dfs): + bf_df, pd_df = temporal_dfs timestamp = pd.Timestamp("2025-01-01", tz="UTC") actual_result = timestamp + bf_df["timedelta_col"] - expected_result = timestamp + pd_df["timedelta_col"] + expected_result = (timestamp + pd_df["timedelta_col"]).astype( + dtypes.TIMESTAMP_DTYPE + ) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) -@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) -def test_timestamp_add_with_numpy_op(scalars_dfs, column): - bf_df, pd_df = scalars_dfs +@pytest.mark.parametrize( + ("column", "bf_dtype"), + [ + ("datetime_col", dtypes.DATETIME_DTYPE), + ("timestamp_col", dtypes.TIMESTAMP_DTYPE), + ], +) +def test_timestamp_add_with_numpy_op(temporal_dfs, column, bf_dtype): + bf_df, pd_df = temporal_dfs actual_result = np.add(bf_df[column], bf_df["timedelta_col"]) - expected_result = np.add(pd_df[column], pd_df["timedelta_col"]) + expected_result = np.add(pd_df[column], pd_df["timedelta_col"]).astype(bf_dtype) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) -def test_timestamp_add_dataframes(scalars_dfs): +def test_timestamp_add_dataframes(temporal_dfs): columns = ["datetime_col", "timestamp_col"] timedelta = pd.Timedelta(1, unit="s") - bf_df, pd_df = scalars_dfs + bf_df, pd_df = temporal_dfs actual_result = bf_df[columns] + timedelta expected_result = pd_df[columns] + timedelta expected_result["datetime_col"] = expected_result["datetime_col"].astype( - "timestamp[us][pyarrow]" + dtypes.DATETIME_DTYPE ) expected_result["timestamp_col"] = expected_result["timestamp_col"].astype( - "timestamp[us, tz=UTC][pyarrow]" + dtypes.TIMESTAMP_DTYPE ) pandas.testing.assert_frame_equal( actual_result.to_pandas(), expected_result, check_index_type=False diff --git a/tests/system/utils.py b/tests/system/utils.py index 1a0d72630c..0772468085 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -188,16 +188,6 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): "timestamp_col" ] - if not isinstance(df["timedelta_col"].dtype, pd.ArrowDtype): - df["timedelta_col"] = pd.to_timedelta(df["timedelta_col"], unit="us") - arrow_table = pa.Table.from_pandas( - pd.DataFrame(df, columns=["timedelta_col"]), - schema=pa.schema([("timedelta_col", pa.duration("us"))]), - ) - df["timedelta_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ - "timedelta_col" - ] - # Convert geography types columns. if "geography_col" in df.columns: df["geography_col"] = df["geography_col"].astype( From fc760fd4bb0ce6fbfb3728cdad4bcafe6308ef77 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 17:16:03 +0000 Subject: [PATCH 08/14] Remove pyarrow duration scalar support. --- bigframes/core/compile/ibis_types.py | 4 ++-- bigframes/core/utils.py | 5 +---- bigframes/dtypes.py | 3 +-- tests/system/small/operations/test_timedeltas.py | 5 +---- tests/unit/core/test_bf_utils.py | 1 - 5 files changed, 5 insertions(+), 13 deletions(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index e4f60b67ec..03ee46a9b0 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -419,9 +419,9 @@ def literal_to_ibis_scalar( if isinstance( literal, - (datetime.timedelta, pd.Timedelta, numpy.timedelta64, pa.DurationScalar), + (datetime.timedelta, pd.Timedelta, numpy.timedelta64), ): - # numpy and pyarrow timedeltas are not compatible with Ibis, so we process them separately. + # numpy timedelta is compatible with Ibis, so we process them separately. return bigframes_vendored.ibis.literal( utils.timedelta_to_micros(literal), ibis_dtype ) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 362e963be3..2fae4e8b5b 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -192,7 +192,7 @@ def wrapper(*args, **kwargs): def timedelta_to_micros( timedelta: typing.Union[ - pd.Timedelta, datetime.timedelta, np.timedelta64, pa.DurationScalar + pd.Timedelta, datetime.timedelta, np.timedelta64 ] ) -> int: if isinstance(timedelta, pd.Timedelta): @@ -202,9 +202,6 @@ def timedelta_to_micros( if isinstance(timedelta, np.timedelta64): return timedelta.astype("timedelta64[us]").astype(np.int64) - if isinstance(timedelta, pa.DurationScalar): - return timedelta_to_micros(timedelta.as_py()) - if isinstance(timedelta, datetime.timedelta): return ( (timedelta.days * 3600 * 24) + timedelta.seconds diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5c107d0168..eed45e1dde 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -108,7 +108,6 @@ pd.Timedelta, datetime.timedelta, np.timedelta64, - pa.DurationScalar, ] LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE) @@ -566,7 +565,7 @@ def _is_bigframes_dtype(dtype) -> bool: def _infer_dtype_from_python_type(type: type) -> Dtype: - if type in (datetime.timedelta, pd.Timedelta, np.timedelta64, pa.DurationScalar): + if type in (datetime.timedelta, pd.Timedelta, np.timedelta64): # Must check timedelta type first. Otherwise other branchs will be evaluated to true # E.g. np.timedelta64 is a sublcass as np.integer return TIMEDELTA_DTYPE diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 39d2db180a..b1be80c42f 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -69,7 +69,6 @@ def test_timestamp_add__ts_series_plus_td_series(temporal_dfs, column, bf_dtype) pytest.param(pd.Timedelta(1, unit="s"), id="pandas"), pytest.param(datetime.timedelta(seconds=1), id="python-datetime"), pytest.param(np.timedelta64(1, "s"), id="numpy"), - pytest.param(pa.scalar(1, type=pa.duration("s")), id="pyarrow"), ], ) def test_timestamp_add__ts_series_plus_td_literal(temporal_dfs, literal): @@ -77,9 +76,7 @@ def test_timestamp_add__ts_series_plus_td_literal(temporal_dfs, literal): actual_result = bf_df["timestamp_col"] + literal - # We don't use the literal value here - # because pandas does not support: series + pyarrow.DurationScalar - expected_result = (pd_df["timestamp_col"] + pd.Timedelta("1s")).astype( + expected_result = (pd_df["timestamp_col"] + literal).astype( dtypes.TIMESTAMP_DTYPE ) pandas.testing.assert_series_equal( diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index e9f8cd4112..5ac92ba5d3 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -69,7 +69,6 @@ def test_get_standardized_ids_tuple(): datetime.timedelta(days=2, hours=3, seconds=4, milliseconds=5, microseconds=6), pd.Timedelta("2d3h4s5ms6us"), np.timedelta64(pd.Timedelta("2d3h4s5ms6us")), - pa.scalar(pd.Timedelta("2d3h4s5ms6us")), ], ) def test_timedelta_to_micros(input): From 3003398b96dd16fc3568c674e4948fd11103ff30 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 17:17:59 +0000 Subject: [PATCH 09/14] fix format --- bigframes/core/utils.py | 4 +--- tests/system/small/operations/test_timedeltas.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 2fae4e8b5b..82d21515ea 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -191,9 +191,7 @@ def wrapper(*args, **kwargs): def timedelta_to_micros( - timedelta: typing.Union[ - pd.Timedelta, datetime.timedelta, np.timedelta64 - ] + timedelta: typing.Union[pd.Timedelta, datetime.timedelta, np.timedelta64] ) -> int: if isinstance(timedelta, pd.Timedelta): # pd.Timedelta.value returns total nanoseconds. diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index b1be80c42f..e0b99235c0 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -76,9 +76,7 @@ def test_timestamp_add__ts_series_plus_td_literal(temporal_dfs, literal): actual_result = bf_df["timestamp_col"] + literal - expected_result = (pd_df["timestamp_col"] + literal).astype( - dtypes.TIMESTAMP_DTYPE - ) + expected_result = (pd_df["timestamp_col"] + literal).astype(dtypes.TIMESTAMP_DTYPE) pandas.testing.assert_series_equal( actual_result.to_pandas(), expected_result, check_index_type=False ) From b5b69cc47387af3f595c2bed314a58a7b0d8f714 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 17:20:40 +0000 Subject: [PATCH 10/14] remove redundant imports --- bigframes/core/utils.py | 1 - tests/system/small/operations/test_timedeltas.py | 1 - tests/unit/core/test_bf_utils.py | 1 - 3 files changed, 3 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 82d21515ea..0198f12537 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -22,7 +22,6 @@ import numpy as np import pandas as pd import pandas.api.types as pdtypes -import pyarrow as pa import typing_extensions import bigframes.exceptions as bfe diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index e0b99235c0..1284e27f17 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd import pandas.testing -import pyarrow as pa import pytest from bigframes import dtypes diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 5ac92ba5d3..cb3b03d988 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -16,7 +16,6 @@ import numpy as np import pandas as pd -import pyarrow as pa import pytest from bigframes.core import utils From 00e15db6c961d14af25f4393a9d9a5cbe7c8432e Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 17:31:34 +0000 Subject: [PATCH 11/14] fix mypy --- bigframes/core/compile/ibis_types.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 03ee46a9b0..67593350d3 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -430,7 +430,9 @@ def literal_to_ibis_scalar( def _to_ibis_literal( - literal: typing.Any, ibis_dtype: ibis_dtypes.DataType, validate: bool + literal: typing.Any, + ibis_dtype: typing.Optional[ibis_dtypes.DataType], + validate: bool, ): scalar_expr = bigframes_vendored.ibis.literal(literal) From 4efc943bcfde0f9997b9aa8777b603fe17bd8893 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 18:23:30 +0000 Subject: [PATCH 12/14] update timedelta literals during tree rewrites --- bigframes/core/compile/ibis_types.py | 21 --------------------- bigframes/core/rewrite/operators.py | 12 ++++++++++-- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 67593350d3..af2b7908ad 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -13,7 +13,6 @@ # limitations under the License. from __future__ import annotations -import datetime import typing from typing import cast, Dict, Iterable, Optional, Tuple, Union @@ -28,11 +27,9 @@ import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy import pandas as pd import pyarrow as pa -from bigframes.core import utils import bigframes.dtypes # Type hints for Ibis data types supported by BigQuery DataFrame @@ -417,25 +414,7 @@ def literal_to_ibis_scalar( else: return bigframes_vendored.ibis.null() - if isinstance( - literal, - (datetime.timedelta, pd.Timedelta, numpy.timedelta64), - ): - # numpy timedelta is compatible with Ibis, so we process them separately. - return bigframes_vendored.ibis.literal( - utils.timedelta_to_micros(literal), ibis_dtype - ) - - return _to_ibis_literal(literal, ibis_dtype, validate) - - -def _to_ibis_literal( - literal: typing.Any, - ibis_dtype: typing.Optional[ibis_dtypes.DataType], - validate: bool, -): scalar_expr = bigframes_vendored.ibis.literal(literal) - if ibis_dtype: scalar_expr = bigframes_vendored.ibis.literal(literal, ibis_dtype) elif scalar_expr.type().is_floating(): diff --git a/bigframes/core/rewrite/operators.py b/bigframes/core/rewrite/operators.py index 1570df148e..136e9cc220 100644 --- a/bigframes/core/rewrite/operators.py +++ b/bigframes/core/rewrite/operators.py @@ -19,7 +19,7 @@ from bigframes import dtypes from bigframes import operations as ops from bigframes.core import expression as ex -from bigframes.core import nodes, schema +from bigframes.core import nodes, schema, utils @dataclasses.dataclass @@ -50,7 +50,7 @@ def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _Ty return _TypedExpr(expr, schema.get_type(expr.id.sql)) if isinstance(expr, ex.ScalarConstantExpression): - return _TypedExpr(expr, expr.dtype) + return _rewrite_scalar_constant_expr(expr) if isinstance(expr, ex.OpExpression): updated_inputs = tuple( @@ -61,6 +61,14 @@ def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _Ty raise AssertionError(f"Unexpected expression type: {type(expr)}") +def _rewrite_scalar_constant_expr(expr: ex.ScalarConstantExpression) -> _TypedExpr: + if expr.dtype is dtypes.TIMEDELTA_DTYPE: + int_repr = utils.timedelta_to_micros(expr.value) # type: ignore + return _TypedExpr(ex.const(int_repr, expr.dtype), expr.dtype) + + return _TypedExpr(expr, expr.dtype) + + def _rewrite_op_expr( expr: ex.OpExpression, inputs: typing.Tuple[_TypedExpr, ...] ) -> _TypedExpr: From 2b506b51757dbee63fdd8b9955022231062da332 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 7 Feb 2025 19:41:58 +0000 Subject: [PATCH 13/14] update type conversions in tests to make py 3.9 happy --- .../small/operations/test_timedeltas.py | 88 ++++++++++--------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 1284e27f17..6c44a62686 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -20,8 +20,6 @@ import pandas.testing import pytest -from bigframes import dtypes - @pytest.fixture(scope="module") def temporal_dfs(session): @@ -45,20 +43,22 @@ def temporal_dfs(session): @pytest.mark.parametrize( - ("column", "bf_dtype"), + ("column", "pd_dtype"), [ - ("datetime_col", dtypes.DATETIME_DTYPE), - ("timestamp_col", dtypes.TIMESTAMP_DTYPE), + ("datetime_col", " Date: Fri, 7 Feb 2025 21:22:43 +0000 Subject: [PATCH 14/14] fix add operator for integers --- bigframes/operations/numeric_ops.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 2dd5ef4623..5183e5c4c5 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -113,24 +113,21 @@ class AddOp(base_ops.BinaryOp): def output_type(self, *input_types): left_type = input_types[0] right_type = input_types[1] - - if left_type is None or right_type is None: - return None - if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: # String addition return input_types[0] - if dtypes.is_numeric(left_type) and dtypes.is_numeric(right_type): - # Numeric addition - return dtypes.coerce_to_common(left_type, right_type) - # Timestamp addition. if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: return left_type if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): return right_type + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + # Numeric addition + return dtypes.coerce_to_common(left_type, right_type) raise TypeError(f"Cannot add dtypes {left_type} and {right_type}")