diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b29a6a41..e4574aa7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,34 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +## [3.35.0](https://github.com/googleapis/python-bigquery/compare/v3.34.0...v3.35.0) (2025-07-15) + + +### Features + +* Add null_markers property to LoadJobConfig and CSVOptions ([#2239](https://github.com/googleapis/python-bigquery/issues/2239)) ([289446d](https://github.com/googleapis/python-bigquery/commit/289446dd8c356d11a0b63b8e6275629b1ae5dc08)) +* Add total slot ms to RowIterator ([#2233](https://github.com/googleapis/python-bigquery/issues/2233)) ([d44bf02](https://github.com/googleapis/python-bigquery/commit/d44bf0231e6e96369e4e03667a3f96618fb664e2)) +* Add UpdateMode to update_dataset ([#2204](https://github.com/googleapis/python-bigquery/issues/2204)) ([eb9c2af](https://github.com/googleapis/python-bigquery/commit/eb9c2aff242c5107f968bbd8b6a9d30cecc877f6)) +* Adds dataset_view parameter to get_dataset method ([#2198](https://github.com/googleapis/python-bigquery/issues/2198)) ([28a5750](https://github.com/googleapis/python-bigquery/commit/28a5750d455f0381548df6f9b1f7661823837d81)) +* Adds date_format to load job and external config ([#2231](https://github.com/googleapis/python-bigquery/issues/2231)) ([7d31828](https://github.com/googleapis/python-bigquery/commit/7d3182802deccfceb0646b87fc8d12275d0a569b)) +* Adds datetime_format as an option ([#2236](https://github.com/googleapis/python-bigquery/issues/2236)) ([54d3dc6](https://github.com/googleapis/python-bigquery/commit/54d3dc66244d50a031e3c80d43d372d2743ecbc3)) +* Adds source_column_match and associated tests ([#2227](https://github.com/googleapis/python-bigquery/issues/2227)) ([6d5d236](https://github.com/googleapis/python-bigquery/commit/6d5d23685cd457d85955356705c1101e9ec3cdcd)) +* Adds time_format and timestamp_format and associated tests ([#2238](https://github.com/googleapis/python-bigquery/issues/2238)) ([371ad29](https://github.com/googleapis/python-bigquery/commit/371ad292df537278767dba71d81822ed57dd8e7d)) +* Adds time_zone to external config and load job ([#2229](https://github.com/googleapis/python-bigquery/issues/2229)) ([b2300d0](https://github.com/googleapis/python-bigquery/commit/b2300d032843512b7e4a5703377632fe60ef3f8d)) + + +### Bug Fixes + +* Adds magics.context.project to eliminate issues with unit tests … ([#2228](https://github.com/googleapis/python-bigquery/issues/2228)) ([27ff3a8](https://github.com/googleapis/python-bigquery/commit/27ff3a89a5f97305fa3ff673aa9183baa7df200f)) +* Fix rows returned when both start_index and page_size are provided ([#2181](https://github.com/googleapis/python-bigquery/issues/2181)) ([45643a2](https://github.com/googleapis/python-bigquery/commit/45643a2e20ce5d503118522dd195aeca00dec3bc)) +* Make AccessEntry equality consistent with from_api_repr ([#2218](https://github.com/googleapis/python-bigquery/issues/2218)) ([4941de4](https://github.com/googleapis/python-bigquery/commit/4941de441cb32cabeb55ec0320f305fb62551155)) +* Update type hints for various BigQuery files ([#2206](https://github.com/googleapis/python-bigquery/issues/2206)) ([b863291](https://github.com/googleapis/python-bigquery/commit/b86329188ba35e61871db82ae1d95d2a576eed1b)) + + +### Documentation + +* Improve clarity of "Output Only" fields in Dataset class ([#2201](https://github.com/googleapis/python-bigquery/issues/2201)) 
([bd5aba8](https://github.com/googleapis/python-bigquery/commit/bd5aba8ba40c2f35fb672a68eed11d6baedb304f)) + ## [3.34.0](https://github.com/googleapis/python-bigquery/compare/v3.33.0...v3.34.0) (2025-05-27) diff --git a/google/cloud/bigquery/_job_helpers.py b/google/cloud/bigquery/_job_helpers.py index 888dc1e73..73d4f6e7b 100644 --- a/google/cloud/bigquery/_job_helpers.py +++ b/google/cloud/bigquery/_job_helpers.py @@ -560,6 +560,7 @@ def do_query(): num_dml_affected_rows=query_results.num_dml_affected_rows, query=query, total_bytes_processed=query_results.total_bytes_processed, + slot_millis=query_results.slot_millis, ) if job_retry is not None: diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 10a5c59bb..2dab03a06 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -337,13 +337,8 @@ def types_mapper(arrow_data_type): ): return range_date_dtype - # TODO: this section does not have a test yet OR at least not one that is - # recognized by coverage, hence the pragma. See Issue: #2132 - elif ( - range_timestamp_dtype is not None - and arrow_data_type.equals( # pragma: NO COVER - range_timestamp_dtype.pyarrow_dtype - ) + elif range_timestamp_dtype is not None and arrow_data_type.equals( + range_timestamp_dtype.pyarrow_dtype ): return range_timestamp_dtype diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index c6873545b..804f77ea2 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -90,7 +90,8 @@ from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference -from google.cloud.bigquery.enums import AutoRowIDs + +from google.cloud.bigquery.enums import AutoRowIDs, DatasetView, UpdateMode from google.cloud.bigquery.format_options import ParquetOptions from google.cloud.bigquery.job import ( CopyJob, @@ -864,6 +865,7 @@ def get_dataset( dataset_ref: Union[DatasetReference, str], retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, + dataset_view: Optional[DatasetView] = None, ) -> Dataset: """Fetch the dataset referenced by ``dataset_ref`` @@ -881,7 +883,21 @@ def get_dataset( timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. - + dataset_view (Optional[google.cloud.bigquery.enums.DatasetView]): + Specifies the view that determines which dataset information is + returned. By default, dataset metadata (e.g. friendlyName, description, + labels, etc) and ACL information are returned. This argument can + take on the following possible enum values. + + * :attr:`~google.cloud.bigquery.enums.DatasetView.ACL`: + Includes dataset metadata and the ACL. + * :attr:`~google.cloud.bigquery.enums.DatasetView.FULL`: + Includes all dataset metadata, including the ACL and table metadata. + This view is not supported by the `datasets.list` API method. + * :attr:`~google.cloud.bigquery.enums.DatasetView.METADATA`: + Includes basic dataset metadata, but not the ACL. + * :attr:`~google.cloud.bigquery.enums.DatasetView.DATASET_VIEW_UNSPECIFIED`: + The server will decide which view to use. Currently defaults to FULL. Returns: google.cloud.bigquery.dataset.Dataset: A ``Dataset`` instance. 
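For illustration only (not part of the patch; the project and dataset IDs below are placeholders), the new argument maps directly onto the `datasetView` query parameter added in the hunk that follows:

```python
from google.cloud import bigquery
from google.cloud.bigquery.enums import DatasetView

client = bigquery.Client()

# Fetch only basic dataset metadata, skipping the ACL. Omitting
# dataset_view keeps the previous behavior (server decides the view).
dataset = client.get_dataset(
    "my-project.my_dataset",  # placeholder dataset ID
    dataset_view=DatasetView.METADATA,
)
print(dataset.friendly_name)
```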
@@ -891,6 +907,12 @@ def get_dataset(
             dataset_ref, default_project=self.project
         )
         path = dataset_ref.path
+
+        if dataset_view:
+            query_params = {"datasetView": dataset_view.value}
+        else:
+            query_params = {}
+
         span_attributes = {"path": path}
         api_response = self._call_api(
             retry,
@@ -899,6 +921,7 @@ def get_dataset(
             method="GET",
             path=path,
             timeout=timeout,
+            query_params=query_params,
         )
         return Dataset.from_api_repr(api_response)

@@ -1198,6 +1221,7 @@ def update_dataset(
         fields: Sequence[str],
         retry: retries.Retry = DEFAULT_RETRY,
         timeout: TimeoutType = DEFAULT_TIMEOUT,
+        update_mode: Optional[UpdateMode] = None,
     ) -> Dataset:
         """Change some fields of a dataset.

@@ -1237,6 +1261,20 @@ def update_dataset(
             timeout (Optional[float]):
                 The number of seconds to wait for the underlying HTTP transport
                 before using ``retry``.
+            update_mode (Optional[google.cloud.bigquery.enums.UpdateMode]):
+                Specifies the kind of information to update in a dataset.
+                By default, dataset metadata (e.g. friendlyName, description,
+                labels, etc) and ACL information are updated. This argument can
+                take on the following possible enum values.
+
+                * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_MODE_UNSPECIFIED`:
+                    The default value. Behavior defaults to UPDATE_FULL.
+                * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_METADATA`:
+                    Includes metadata information for the dataset, such as friendlyName, description, labels, etc.
+                * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_ACL`:
+                    Includes ACL information for the dataset, which defines dataset access for one or more entities.
+                * :attr:`~google.cloud.bigquery.enums.UpdateMode.UPDATE_FULL`:
+                    Includes both dataset metadata and ACL information.

         Returns:
             google.cloud.bigquery.dataset.Dataset:
@@ -1250,6 +1288,11 @@ def update_dataset(
         path = dataset.path
         span_attributes = {"path": path, "fields": fields}

+        if update_mode:
+            query_params = {"updateMode": update_mode.value}
+        else:
+            query_params = {}
+
         api_response = self._call_api(
             retry,
             span_name="BigQuery.updateDataset",
@@ -1259,6 +1302,7 @@ def update_dataset(
             data=partial,
             headers=headers,
             timeout=timeout,
+            query_params=query_params,
         )
         return Dataset.from_api_repr(api_response)

@@ -2001,6 +2045,7 @@ def _get_query_results(
         location: Optional[str] = None,
         timeout: TimeoutType = DEFAULT_TIMEOUT,
         page_size: int = 0,
+        start_index: Optional[int] = None,
     ) -> _QueryResults:
         """Get the query results object for a query job.

@@ -2019,9 +2064,12 @@ def _get_query_results(
             before using ``retry``. If set, this connection timeout may be
             increased to a minimum value. This prevents retries on what
             would otherwise be a successful response.
-            page_size (int):
+            page_size (Optional[int]):
                 Maximum number of rows in a single response. See maxResults in
                 the jobs.getQueryResults REST API.
+            start_index (Optional[int]):
+                Zero-based index of the starting row. See startIndex in the
+                jobs.getQueryResults REST API.
Returns: google.cloud.bigquery.query._QueryResults: @@ -2051,6 +2099,9 @@ def _get_query_results( if location is not None: extra_params["location"] = location + if start_index is not None: + extra_params["startIndex"] = start_index + path = "/projects/{}/queries/{}".format(project, job_id) # This call is typically made in a polling loop that checks whether the @@ -4093,6 +4144,7 @@ def _list_rows_from_query_results( num_dml_affected_rows: Optional[int] = None, query: Optional[str] = None, total_bytes_processed: Optional[int] = None, + slot_millis: Optional[int] = None, ) -> RowIterator: """List the rows of a completed query. See @@ -4144,6 +4196,8 @@ def _list_rows_from_query_results( The query text used. total_bytes_processed (Optional[int]): total bytes processed from job statistics, if present. + slot_millis (Optional[int]): + Number of slot ms the user is actually billed for. Returns: google.cloud.bigquery.table.RowIterator: @@ -4183,6 +4237,7 @@ def _list_rows_from_query_results( num_dml_affected_rows=num_dml_affected_rows, query=query, total_bytes_processed=total_bytes_processed, + slot_millis=slot_millis, ) return row_iterator diff --git a/google/cloud/bigquery/dataset.py b/google/cloud/bigquery/dataset.py index f788275cd..878b77d41 100644 --- a/google/cloud/bigquery/dataset.py +++ b/google/cloud/bigquery/dataset.py @@ -17,6 +17,7 @@ from __future__ import absolute_import import copy +import json import typing from typing import Optional, List, Dict, Any, Union @@ -506,7 +507,20 @@ def entity_id(self) -> Optional[Union[Dict[str, Any], str]]: def __eq__(self, other): if not isinstance(other, AccessEntry): return NotImplemented - return self._key() == other._key() + return ( + self.role == other.role + and self.entity_type == other.entity_type + and self._normalize_entity_id(self.entity_id) + == self._normalize_entity_id(other.entity_id) + and self.condition == other.condition + ) + + @staticmethod + def _normalize_entity_id(value): + """Ensure consistent equality for dicts like 'view'.""" + if isinstance(value, dict): + return json.dumps(value, sort_keys=True) + return value def __ne__(self, other): return not self == other @@ -557,7 +571,6 @@ def from_api_repr(cls, resource: dict) -> "AccessEntry": google.cloud.bigquery.dataset.AccessEntry: Access entry parsed from ``resource``. """ - access_entry = cls() access_entry._properties = resource.copy() return access_entry @@ -574,6 +587,10 @@ class Dataset(object): A pointer to a dataset. If ``dataset_ref`` is a string, it must include both the project ID and the dataset ID, separated by ``.``. + + Note: + Fields marked as "Output Only" are populated by the server and will only be + available after calling :meth:`google.cloud.bigquery.client.Client.get_dataset`. """ _PROPERTY_TO_API_FIELD = { @@ -692,7 +709,7 @@ def access_entries(self, value): @property def created(self): - """Union[datetime.datetime, None]: Datetime at which the dataset was + """Union[datetime.datetime, None]: Output only. Datetime at which the dataset was created (:data:`None` until set from the server). """ creation_time = self._properties.get("creationTime") @@ -709,8 +726,8 @@ def dataset_id(self): @property def full_dataset_id(self): - """Union[str, None]: ID for the dataset resource (:data:`None` until - set from the server) + """Union[str, None]: Output only. ID for the dataset resource + (:data:`None` until set from the server). In the format ``project_id:dataset_id``. 
""" @@ -725,14 +742,14 @@ def reference(self): @property def etag(self): - """Union[str, None]: ETag for the dataset resource (:data:`None` until - set from the server). + """Union[str, None]: Output only. ETag for the dataset resource + (:data:`None` until set from the server). """ return self._properties.get("etag") @property def modified(self): - """Union[datetime.datetime, None]: Datetime at which the dataset was + """Union[datetime.datetime, None]: Output only. Datetime at which the dataset was last modified (:data:`None` until set from the server). """ modified_time = self._properties.get("lastModifiedTime") @@ -744,8 +761,8 @@ def modified(self): @property def self_link(self): - """Union[str, None]: URL for the dataset resource (:data:`None` until - set from the server). + """Union[str, None]: Output only. URL for the dataset resource + (:data:`None` until set from the server). """ return self._properties.get("selfLink") diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index 4cb7a056d..1b1eb241a 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -80,6 +80,24 @@ class CreateDisposition(object): returned in the job result.""" +class DatasetView(enum.Enum): + """DatasetView specifies which dataset information is returned.""" + + DATASET_VIEW_UNSPECIFIED = "DATASET_VIEW_UNSPECIFIED" + """The default value. Currently maps to the FULL view.""" + + METADATA = "METADATA" + """View metadata information for the dataset, such as friendlyName, + description, labels, etc.""" + + ACL = "ACL" + """View ACL information for the dataset, which defines dataset access + for one or more entities.""" + + FULL = "FULL" + """View both dataset metadata and ACL information.""" + + class DefaultPandasDTypes(enum.Enum): """Default Pandas DataFrem DTypes to convert BigQuery data. These Sentinel values are used instead of None to maintain backward compatibility, @@ -409,6 +427,24 @@ class BigLakeTableFormat(object): """Apache Iceberg format.""" +class UpdateMode(enum.Enum): + """Specifies the kind of information to update in a dataset.""" + + UPDATE_MODE_UNSPECIFIED = "UPDATE_MODE_UNSPECIFIED" + """The default value. Behavior defaults to UPDATE_FULL.""" + + UPDATE_METADATA = "UPDATE_METADATA" + """Includes metadata information for the dataset, such as friendlyName, + description, labels, etc.""" + + UPDATE_ACL = "UPDATE_ACL" + """Includes ACL information for the dataset, which defines dataset access + for one or more entities.""" + + UPDATE_FULL = "UPDATE_FULL" + """Includes both dataset metadata and ACL information.""" + + class JobCreationMode(object): """Documented values for Job Creation Mode.""" @@ -426,3 +462,21 @@ class JobCreationMode(object): The conditions under which BigQuery can decide to not create a Job are subject to change. """ + + +class SourceColumnMatch(str, enum.Enum): + """Uses sensible defaults based on how the schema is provided. + If autodetect is used, then columns are matched by name. Otherwise, columns + are matched by position. This is done to keep the behavior backward-compatible. + """ + + SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED" + """Unspecified column name match option.""" + + POSITION = "POSITION" + """Matches by position. This assumes that the columns are ordered the same + way as the schema.""" + + NAME = "NAME" + """Matches by name. 
This reads the header row as column names and reorders + columns to match the field names in the schema.""" diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py index 6e943adf3..dc7a33e6a 100644 --- a/google/cloud/bigquery/external_config.py +++ b/google/cloud/bigquery/external_config.py @@ -22,6 +22,7 @@ import base64 import copy +import typing from typing import Any, Dict, FrozenSet, Iterable, Optional, Union from google.cloud.bigquery._helpers import _to_bytes @@ -29,6 +30,7 @@ from google.cloud.bigquery._helpers import _int_or_none from google.cloud.bigquery._helpers import _str_or_none from google.cloud.bigquery import _helpers +from google.cloud.bigquery.enums import SourceColumnMatch from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions from google.cloud.bigquery import schema from google.cloud.bigquery.schema import SchemaField @@ -473,6 +475,60 @@ def skip_leading_rows(self): def skip_leading_rows(self, value): self._properties["skipLeadingRows"] = str(value) + @property + def source_column_match(self) -> Optional[SourceColumnMatch]: + """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the + strategy used to match loaded columns to the schema. If not set, a sensible + default is chosen based on how the schema is provided. If autodetect is + used, then columns are matched by name. Otherwise, columns are matched by + position. This is done to keep the behavior backward-compatible. + + Acceptable values are: + + SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option. + POSITION: matches by position. This assumes that the columns are ordered + the same way as the schema. + NAME: matches by name. This reads the header row as column names and + reorders columns to match the field names in the schema. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.source_column_match + """ + + value = self._properties.get("sourceColumnMatch") + return SourceColumnMatch(value) if value is not None else None + + @source_column_match.setter + def source_column_match(self, value: Union[SourceColumnMatch, str, None]): + if value is not None and not isinstance(value, (SourceColumnMatch, str)): + raise TypeError( + "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None" + ) + if isinstance(value, SourceColumnMatch): + value = value.value + self._properties["sourceColumnMatch"] = value if value else None + + @property + def null_markers(self) -> Optional[Iterable[str]]: + """Optional[Iterable[str]]: A list of strings represented as SQL NULL values in a CSV file. + + .. note:: + null_marker and null_markers can't be set at the same time. + If null_marker is set, null_markers has to be not set. + If null_markers is set, null_marker has to be not set. + If both null_marker and null_markers are set at the same time, a user error would be thrown. + Any strings listed in null_markers, including empty string would be interpreted as SQL NULL. + This applies to all column types. + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_markers + """ + return self._properties.get("nullMarkers") + + @null_markers.setter + def null_markers(self, value: Optional[Iterable[str]]): + self._properties["nullMarkers"] = value + def to_api_repr(self) -> dict: """Build an API representation of this object. 
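As a usage sketch for the two new CSVOptions fields above (illustrative only, not part of the patch; the Cloud Storage URI is a placeholder):

```python
from google.cloud import bigquery
from google.cloud.bigquery.enums import SourceColumnMatch

external_config = bigquery.ExternalConfig("CSV")
external_config.source_uris = ["gs://my-bucket/data.csv"]  # placeholder URI

csv_options = external_config.options  # CSVOptions for a CSV source format
csv_options.skip_leading_rows = 1  # the first row is a header
# Empty strings and "NA" load as SQL NULL; null_markers is mutually
# exclusive with the older null_marker option.
csv_options.null_markers = ["", "NA"]
# Match CSV columns to schema fields by header name instead of position.
csv_options.source_column_match = SourceColumnMatch.NAME
```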
@@ -835,10 +891,10 @@ def schema(self): See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.schema """ - # TODO: The typehinting for this needs work. Setting this pragma to temporarily - # manage a pytype issue that came up in another PR. See Issue: #2132 - prop = self._properties.get("schema", {}) # type: ignore - return [SchemaField.from_api_repr(field) for field in prop.get("fields", [])] # type: ignore + prop: Dict[str, Any] = typing.cast( + Dict[str, Any], self._properties.get("schema", {}) + ) + return [SchemaField.from_api_repr(field) for field in prop.get("fields", [])] @schema.setter def schema(self, value): @@ -847,6 +903,80 @@ def schema(self, value): prop = {"fields": [field.to_api_repr() for field in value]} self._properties["schema"] = prop + @property + def date_format(self) -> Optional[str]: + """Optional[str]: Format used to parse DATE values. Supports C-style and SQL-style values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.date_format + """ + result = self._properties.get("dateFormat") + return typing.cast(str, result) + + @date_format.setter + def date_format(self, value: Optional[str]): + self._properties["dateFormat"] = value + + @property + def datetime_format(self) -> Optional[str]: + """Optional[str]: Format used to parse DATETIME values. Supports C-style + and SQL-style values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.datetime_format + """ + result = self._properties.get("datetimeFormat") + return typing.cast(str, result) + + @datetime_format.setter + def datetime_format(self, value: Optional[str]): + self._properties["datetimeFormat"] = value + + @property + def time_zone(self) -> Optional[str]: + """Optional[str]: Time zone used when parsing timestamp values that do not + have specific time zone information (e.g. 2024-04-20 12:34:56). The expected + format is an IANA timezone string (e.g. America/Los_Angeles). + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.time_zone + """ + + result = self._properties.get("timeZone") + return typing.cast(str, result) + + @time_zone.setter + def time_zone(self, value: Optional[str]): + self._properties["timeZone"] = value + + @property + def time_format(self) -> Optional[str]: + """Optional[str]: Format used to parse TIME values. Supports C-style and SQL-style values. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.time_format + """ + result = self._properties.get("timeFormat") + return typing.cast(str, result) + + @time_format.setter + def time_format(self, value: Optional[str]): + self._properties["timeFormat"] = value + + @property + def timestamp_format(self) -> Optional[str]: + """Optional[str]: Format used to parse TIMESTAMP values. Supports C-style and SQL-style values. 
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.timestamp_format
+        """
+        result = self._properties.get("timestampFormat")
+        return typing.cast(str, result)
+
+    @timestamp_format.setter
+    def timestamp_format(self, value: Optional[str]):
+        self._properties["timestampFormat"] = value
+
     @property
     def connection_id(self):
         """Optional[str]: [Experimental] ID of a BigQuery Connection API
diff --git a/google/cloud/bigquery/job/base.py b/google/cloud/bigquery/job/base.py
index 5eb700ce7..f007b9341 100644
--- a/google/cloud/bigquery/job/base.py
+++ b/google/cloud/bigquery/job/base.py
@@ -435,9 +435,7 @@ def __init__(self, job_id, client):
     @property
     def configuration(self) -> _JobConfig:
         """Job-type specific configuration."""
-        # TODO: The typehinting for this needs work. Setting this pragma to temporarily
-        # manage a pytype issue that came up in another PR. See Issue: #2132
-        configuration = self._CONFIG_CLASS()  # pytype: disable=not-callable
+        configuration: _JobConfig = self._CONFIG_CLASS()  # pytype: disable=not-callable
         configuration._properties = self._properties.setdefault("configuration", {})
         return configuration
diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py
index e56ce16f0..8cdb779ac 100644
--- a/google/cloud/bigquery/job/load.py
+++ b/google/cloud/bigquery/job/load.py
@@ -15,9 +15,10 @@
 """Classes for load jobs."""

 import typing
-from typing import FrozenSet, List, Iterable, Optional
+from typing import FrozenSet, List, Iterable, Optional, Union

 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
+from google.cloud.bigquery.enums import SourceColumnMatch
 from google.cloud.bigquery.external_config import HivePartitioningOptions
 from google.cloud.bigquery.format_options import ParquetOptions
 from google.cloud.bigquery import _helpers
@@ -386,6 +387,27 @@ def null_marker(self):
     def null_marker(self, value):
         self._set_sub_prop("nullMarker", value)

+    @property
+    def null_markers(self) -> Optional[List[str]]:
+        """Optional[List[str]]: A list of strings that are interpreted as SQL NULL values in a CSV file.
+
+        .. note::
+            ``null_marker`` and ``null_markers`` are mutually exclusive;
+            setting both at the same time results in a user error. Any
+            string listed in ``null_markers``, including the empty
+            string, is interpreted as SQL NULL. This applies to all
+            column types.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_markers
+        """
+        return self._get_sub_prop("nullMarkers")
+
+    @null_markers.setter
+    def null_markers(self, value: Optional[List[str]]):
+        self._set_sub_prop("nullMarkers", value)
+
     @property
     def preserve_ascii_control_characters(self):
         """Optional[bool]: Preserves the embedded ASCII control characters when sourceFormat is set to CSV.
@@ -548,6 +570,105 @@ def source_format(self):
     def source_format(self, value):
         self._set_sub_prop("sourceFormat", value)

+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
+        strategy used to match loaded columns to the schema. If not set, a sensible
+        default is chosen based on how the schema is provided. If autodetect is
+        used, then columns are matched by name. Otherwise, columns are matched by
+        position. This is done to keep the behavior backward-compatible.
+
+        Acceptable values are:
+
+        SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
+        POSITION: matches by position. This assumes that the columns are ordered
+        the same way as the schema.
+        NAME: matches by name. This reads the header row as column names and
+        reorders columns to match the field names in the schema.
+
+        See:
+
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
+        """
+        value = self._get_sub_prop("sourceColumnMatch")
+        return SourceColumnMatch(value) if value is not None else None
+
+    @source_column_match.setter
+    def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
+        if value is not None and not isinstance(value, (SourceColumnMatch, str)):
+            raise TypeError(
+                "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
+            )
+        if isinstance(value, SourceColumnMatch):
+            value = value.value
+        self._set_sub_prop("sourceColumnMatch", value if value else None)
+
+    @property
+    def date_format(self) -> Optional[str]:
+        """Optional[str]: Date format used for parsing DATE values.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.date_format
+        """
+        return self._get_sub_prop("dateFormat")
+
+    @date_format.setter
+    def date_format(self, value: Optional[str]):
+        self._set_sub_prop("dateFormat", value)
+
+    @property
+    def datetime_format(self) -> Optional[str]:
+        """Optional[str]: Format used for parsing DATETIME values.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.datetime_format
+        """
+        return self._get_sub_prop("datetimeFormat")
+
+    @datetime_format.setter
+    def datetime_format(self, value: Optional[str]):
+        self._set_sub_prop("datetimeFormat", value)
+
+    @property
+    def time_zone(self) -> Optional[str]:
+        """Optional[str]: Default time zone that will apply when parsing timestamp
+        values that have no specific time zone.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_zone
+        """
+        return self._get_sub_prop("timeZone")
+
+    @time_zone.setter
+    def time_zone(self, value: Optional[str]):
+        self._set_sub_prop("timeZone", value)
+
+    @property
+    def time_format(self) -> Optional[str]:
+        """Optional[str]: Format used for parsing TIME values.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_format
+        """
+        return self._get_sub_prop("timeFormat")
+
+    @time_format.setter
+    def time_format(self, value: Optional[str]):
+        self._set_sub_prop("timeFormat", value)
+
+    @property
+    def timestamp_format(self) -> Optional[str]:
+        """Optional[str]: Format used for parsing TIMESTAMP values.
+
+        See:
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.timestamp_format
+        """
+        return self._get_sub_prop("timestampFormat")
+
+    @timestamp_format.setter
+    def timestamp_format(self, value: Optional[str]):
+        self._set_sub_prop("timestampFormat", value)
+
     @property
     def time_partitioning(self):
         """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
@@ -788,6 +909,13 @@ def null_marker(self):
         """
         return self.configuration.null_marker

+    @property
+    def null_markers(self):
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.null_markers`.
+ """ + return self.configuration.null_markers + @property def quote_character(self): """See @@ -889,6 +1017,48 @@ def clustering_fields(self): """ return self.configuration.clustering_fields + @property + def source_column_match(self) -> Optional[SourceColumnMatch]: + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`. + """ + return self.configuration.source_column_match + + @property + def date_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.date_format`. + """ + return self.configuration.date_format + + @property + def datetime_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.datetime_format`. + """ + return self.configuration.datetime_format + + @property + def time_zone(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.time_zone`. + """ + return self.configuration.time_zone + + @property + def time_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.time_format`. + """ + return self.configuration.time_format + + @property + def timestamp_format(self): + """See + :attr:`google.cloud.bigquery.job.LoadJobConfig.timestamp_format`. + """ + return self.configuration.timestamp_format + @property def schema_update_options(self): """See diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 954a46963..ec9379ea9 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1409,6 +1409,7 @@ def _reload_query_results( retry: "retries.Retry" = DEFAULT_RETRY, timeout: Optional[float] = None, page_size: int = 0, + start_index: Optional[int] = None, ): """Refresh the cached query results unless already cached and complete. @@ -1421,6 +1422,9 @@ def _reload_query_results( page_size (int): Maximum number of rows in a single response. See maxResults in the jobs.getQueryResults REST API. + start_index (Optional[int]): + Zero-based index of the starting row. See startIndex in the + jobs.getQueryResults REST API. """ # Optimization: avoid a call to jobs.getQueryResults if it's already # been fetched, e.g. from jobs.query first page of results. @@ -1468,6 +1472,7 @@ def _reload_query_results( location=self.location, timeout=transport_timeout, page_size=page_size, + start_index=start_index, ) def result( # type: ignore # (incompatible with supertype) @@ -1570,6 +1575,9 @@ def result( # type: ignore # (incompatible with supertype) if page_size is not None: reload_query_results_kwargs["page_size"] = page_size + if start_index is not None: + reload_query_results_kwargs["start_index"] = start_index + try: retry_do_query = getattr(self, "_retry_do_query", None) if retry_do_query is not None: @@ -1758,6 +1766,7 @@ def is_job_done(): num_dml_affected_rows=self._query_results.num_dml_affected_rows, query=self.query, total_bytes_processed=self.total_bytes_processed, + slot_millis=self.slot_millis, **list_rows_kwargs, ) rows._preserve_order = _contains_order_by(self.query) diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index 8745c09f5..4a006d621 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -1282,6 +1282,20 @@ def total_bytes_processed(self): if total_bytes_processed is not None: return int(total_bytes_processed) + @property + def slot_millis(self): + """Total number of slot ms the user is actually billed for. 
+ + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.slot_millis + + Returns: + Optional[int]: Count generated on the server (None until set by the server). + """ + slot_millis = self._properties.get("totalSlotMs") + if slot_millis is not None: + return int(slot_millis) + @property def num_dml_affected_rows(self): """Total number of rows affected by a DML query. diff --git a/google/cloud/bigquery/routine/routine.py b/google/cloud/bigquery/routine/routine.py index 7e079781d..e933fa137 100644 --- a/google/cloud/bigquery/routine/routine.py +++ b/google/cloud/bigquery/routine/routine.py @@ -518,23 +518,17 @@ def __init__(self): @property def project(self): """str: ID of the project containing the routine.""" - # TODO: The typehinting for this needs work. Setting this pragma to temporarily - # manage a pytype issue that came up in another PR. See Issue: #2132 - return self._properties["projectId"] # pytype: disable=typed-dict-error + return self._properties.get("projectId", "") @property def dataset_id(self): """str: ID of dataset containing the routine.""" - # TODO: The typehinting for this needs work. Setting this pragma to temporarily - # manage a pytype issue that came up in another PR. See Issue: #2132 - return self._properties["datasetId"] # pytype: disable=typed-dict-error + return self._properties.get("datasetId", "") @property def routine_id(self): """str: The routine ID.""" - # TODO: The typehinting for this needs work. Setting this pragma to temporarily - # manage a pytype issue that came up in another PR. See Issue: #2132 - return self._properties["routineId"] # pytype: disable=typed-dict-error + return self._properties.get("routineId", "") @property def path(self): diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index 1f1aab7a4..456730b00 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -232,11 +232,9 @@ def __init__( if max_length is not _DEFAULT_VALUE: self._properties["maxLength"] = max_length if policy_tags is not _DEFAULT_VALUE: - # TODO: The typehinting for this needs work. Setting this pragma to temporarily - # manage a pytype issue that came up in another PR. See Issue: #2132 self._properties["policyTags"] = ( - policy_tags.to_api_repr() # pytype: disable=attribute-error - if policy_tags is not None + policy_tags.to_api_repr() + if isinstance(policy_tags, PolicyTagList) else None ) if isinstance(range_element_type, str): diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 3b1334bd3..dbdde36d1 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -137,9 +137,9 @@ def _reference_getter(table): return TableReference(dataset_ref, table.table_id) -# TODO: The typehinting for this needs work. Setting this pragma to temporarily -# manage a pytype issue that came up in another PR. See Issue: #2132 -def _view_use_legacy_sql_getter(table): +def _view_use_legacy_sql_getter( + table: Union["Table", "TableListItem"] +) -> Optional[bool]: """bool: Specifies whether to execute the view with Legacy or Standard SQL. This boolean specifies whether to execute the view with Legacy SQL @@ -151,15 +151,16 @@ def _view_use_legacy_sql_getter(table): ValueError: For invalid value types. """ - view = table._properties.get("view") # type: ignore + view: Optional[Dict[str, Any]] = table._properties.get("view") if view is not None: # The server-side default for useLegacySql is True. 
-        return view.get("useLegacySql", True)  # type: ignore
+        return view.get("useLegacySql", True)
     # In some cases, such as in a table list no view object is present, but the
     # resource still represents a view. Use the type as a fallback.
     if table.table_type == "VIEW":
         # The server-side default for useLegacySql is True.
         return True
+    return None  # explicit return statement to appease mypy


 class _TableBase:
@@ -1811,6 +1812,7 @@ def __init__(
         num_dml_affected_rows: Optional[int] = None,
         query: Optional[str] = None,
         total_bytes_processed: Optional[int] = None,
+        slot_millis: Optional[int] = None,
     ):
         super(RowIterator, self).__init__(
             client,
@@ -1840,6 +1842,7 @@ def __init__(
         self._num_dml_affected_rows = num_dml_affected_rows
         self._query = query
         self._total_bytes_processed = total_bytes_processed
+        self._slot_millis = slot_millis

     @property
     def _billing_project(self) -> Optional[str]:
@@ -1897,6 +1900,11 @@ def total_bytes_processed(self) -> Optional[int]:
         """total bytes processed from job statistics, if present."""
         return self._total_bytes_processed

+    @property
+    def slot_millis(self) -> Optional[int]:
+        """Number of slot ms the user is actually billed for."""
+        return self._slot_millis
+
     def _is_almost_completely_cached(self):
         """Check if all results are completely cached.
@@ -1986,12 +1994,19 @@ def _get_next_page_response(self):
             return response

         params = self._get_query_params()
+
+        # If the user has provided page_size and start_index, we need to pass
+        # start_index for the first page, but for all subsequent pages, we
+        # should not pass start_index. We make a shallow copy of params and do
+        # not alter the original, so if the user iterates the results again,
+        # start_index is preserved.
+        params_copy = copy.copy(params)
         if self._page_size is not None:
             if self.page_number and "startIndex" in params:
-                del params["startIndex"]
+                del params_copy["startIndex"]

         return self.api_request(
-            method=self._HTTP_METHOD, path=self.path, query_params=params
+            method=self._HTTP_METHOD, path=self.path, query_params=params_copy
         )

     @property
diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py
index 9e1393854..0107ae309 100644
--- a/google/cloud/bigquery/version.py
+++ b/google/cloud/bigquery/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "3.34.0"
+__version__ = "3.35.0"
diff --git a/noxfile.py b/noxfile.py
index 6807b7ee4..eb79c238d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -408,10 +408,10 @@ def prerelease_deps(session):
     )

     # PyArrow prerelease packages are published to an alternative PyPI host.
- # https://arrow.apache.org/docs/python/install.html#installing-nightly-packages + # https://arrow.apache.org/docs/developers/python.html#installing-nightly-packages session.install( "--extra-index-url", - "https://pypi.fury.io/arrow-nightlies/", + "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple", "--prefer-binary", "--pre", "--upgrade", diff --git a/samples/desktopapp/requirements-test.txt b/samples/desktopapp/requirements-test.txt index 2ad35b418..3bf52c85d 100644 --- a/samples/desktopapp/requirements-test.txt +++ b/samples/desktopapp/requirements-test.txt @@ -1,4 +1,4 @@ google-cloud-testutils==1.6.4 -pytest==8.3.5 +pytest==8.4.1 mock==5.2.0 -pytest-xdist==3.7.0 +pytest-xdist==3.8.0 diff --git a/samples/desktopapp/requirements.txt b/samples/desktopapp/requirements.txt index 4a5b75346..a512dbd3a 100644 --- a/samples/desktopapp/requirements.txt +++ b/samples/desktopapp/requirements.txt @@ -1,2 +1,2 @@ -google-cloud-bigquery==3.33.0 +google-cloud-bigquery==3.34.0 google-auth-oauthlib==1.2.2 diff --git a/samples/geography/requirements-test.txt b/samples/geography/requirements-test.txt index 3ca365401..d449b373b 100644 --- a/samples/geography/requirements-test.txt +++ b/samples/geography/requirements-test.txt @@ -1,3 +1,3 @@ -pytest==8.3.5 +pytest==8.4.1 mock==5.2.0 -pytest-xdist==3.7.0 +pytest-xdist==3.8.0 diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 7a0946fae..e932625b8 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,29 +1,30 @@ attrs==25.3.0 -certifi==2025.4.26 +certifi==2025.7.14 cffi==1.17.1 charset-normalizer==3.4.2 click===8.1.8; python_version == '3.9' click==8.2.1; python_version >= '3.10' -click-plugins==1.1.1 +click-plugins==1.1.1.2 cligj==0.7.2 db-dtypes==1.4.3 Fiona==1.10.1 geojson==3.2.0 -geopandas==1.0.1 -google-api-core==2.24.2 -google-auth==2.40.2 -google-cloud-bigquery==3.33.0 -google-cloud-bigquery-storage==2.31.0 +geopandas===1.0.1; python_version <= '3.9' +geopandas==1.1.1; python_version >= '3.10' +google-api-core==2.25.1 +google-auth==2.40.3 +google-cloud-bigquery==3.34.0 +google-cloud-bigquery-storage==2.32.0 google-cloud-core==2.4.3 google-crc32c==1.7.1 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 -grpcio==1.71.0 +grpcio==1.73.1 idna==3.10 munch==4.0.0 mypy-extensions==1.1.0 packaging==25.0 -pandas==2.2.3 +pandas==2.3.1 proto-plus==1.26.1 pyarrow==20.0.0 pyasn1==0.6.1 @@ -33,11 +34,11 @@ pyparsing==3.2.3 python-dateutil==2.9.0.post0 pytz==2025.2 PyYAML==6.0.2 -requests==2.32.3 +requests==2.32.4 rsa==4.9.1 Shapely===2.0.7; python_version == '3.9' Shapely==2.1.1; python_version >= '3.10' six==1.17.0 -typing-extensions==4.13.2 +typing-extensions==4.14.1 typing-inspect==0.9.0 -urllib3==2.4.0 +urllib3==2.5.0 diff --git a/samples/magics/requirements-test.txt b/samples/magics/requirements-test.txt index 2ad35b418..3bf52c85d 100644 --- a/samples/magics/requirements-test.txt +++ b/samples/magics/requirements-test.txt @@ -1,4 +1,4 @@ google-cloud-testutils==1.6.4 -pytest==8.3.5 +pytest==8.4.1 mock==5.2.0 -pytest-xdist==3.7.0 +pytest-xdist==3.8.0 diff --git a/samples/magics/requirements.txt b/samples/magics/requirements.txt index 7d0c91e3d..b53a35982 100644 --- a/samples/magics/requirements.txt +++ b/samples/magics/requirements.txt @@ -1,6 +1,6 @@ -bigquery_magics==0.10.0 +bigquery_magics==0.10.1 db-dtypes==1.4.3 -google.cloud.bigquery==3.33.0 -google-cloud-bigquery-storage==2.31.0 +google.cloud.bigquery==3.34.0 
+google-cloud-bigquery-storage==2.32.0 ipython===8.18.1 -pandas==2.2.3 +pandas==2.3.1 diff --git a/samples/notebooks/requirements-test.txt b/samples/notebooks/requirements-test.txt index 2ad35b418..3bf52c85d 100644 --- a/samples/notebooks/requirements-test.txt +++ b/samples/notebooks/requirements-test.txt @@ -1,4 +1,4 @@ google-cloud-testutils==1.6.4 -pytest==8.3.5 +pytest==8.4.1 mock==5.2.0 -pytest-xdist==3.7.0 +pytest-xdist==3.8.0 diff --git a/samples/notebooks/requirements.txt b/samples/notebooks/requirements.txt index 9f131e5b8..4b134ac9d 100644 --- a/samples/notebooks/requirements.txt +++ b/samples/notebooks/requirements.txt @@ -1,9 +1,9 @@ -bigquery-magics==0.10.0 +bigquery-magics==0.10.1 db-dtypes==1.4.3 -google-cloud-bigquery==3.33.0 -google-cloud-bigquery-storage==2.31.0 +google-cloud-bigquery==3.34.0 +google-cloud-bigquery-storage==2.32.0 ipython===8.18.1; python_version == '3.9' -ipython==9.2.0; python_version >= '3.10' +ipython==9.4.0; python_version >= '3.10' matplotlib===3.9.2; python_version == '3.9' matplotlib==3.10.3; python_version >= '3.10' -pandas==2.2.3 +pandas==2.3.1 diff --git a/samples/snippets/label_job.py b/samples/snippets/label_job.py new file mode 100644 index 000000000..cfd06d189 --- /dev/null +++ b/samples/snippets/label_job.py @@ -0,0 +1,36 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def label_job() -> None: + # [START bigquery_label_job] + from google.cloud import bigquery + + client = bigquery.Client() + + sql = """ + SELECT corpus + FROM `bigquery-public-data.samples.shakespeare` + GROUP BY corpus; + """ + labels = {"color": "green"} + + config = bigquery.QueryJobConfig() + config.labels = labels + location = "us" + job = client.query(sql, location=location, job_config=config) + job_id = job.job_id + + print(f"Added {job.labels} to {job_id}.") + # [END bigquery_label_job] diff --git a/samples/snippets/label_job_test.py b/samples/snippets/label_job_test.py new file mode 100644 index 000000000..0780db61a --- /dev/null +++ b/samples/snippets/label_job_test.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import typing + +import label_job # type: ignore + + +if typing.TYPE_CHECKING: + import pytest + + +def test_label_job( + capsys: "pytest.CaptureFixture[str]", +) -> None: + label_job.label_job() + + out, _ = capsys.readouterr() + assert "color" in out + assert "green" in out diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index 767f71fb1..cef3450e1 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,5 +1,5 @@ # samples/snippets should be runnable with no "extras" google-cloud-testutils==1.6.4 -pytest==8.3.5 +pytest==8.4.1 mock==5.2.0 -pytest-xdist==3.7.0 +pytest-xdist==3.8.0 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index dae43eff3..fd8bd672b 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -google-cloud-bigquery==3.33.0 +google-cloud-bigquery==3.34.0 diff --git a/tests/unit/job/test_load.py b/tests/unit/job/test_load.py index 10df46fb3..b551d52dd 100644 --- a/tests/unit/job/test_load.py +++ b/tests/unit/job/test_load.py @@ -19,6 +19,7 @@ from .helpers import _Base from .helpers import _make_client +from google.cloud.bigquery.enums import SourceColumnMatch class TestLoadJob(_Base): @@ -37,11 +38,26 @@ def _setUpConstants(self): self.OUTPUT_BYTES = 23456 self.OUTPUT_ROWS = 345 self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference" + self.SOURCE_COLUMN_MATCH = "NAME" + self.DATE_FORMAT = "%Y-%m-%d" + self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S" + self.TIME_ZONE = "UTC" + self.TIME_FORMAT = "%H:%M:%S" + self.TIMESTAMP_FORMAT = "YYYY-MM-DD HH:MM:SS.SSSSSSZ" + self.NULL_MARKERS = ["", "NA"] def _make_resource(self, started=False, ended=False): resource = super(TestLoadJob, self)._make_resource(started, ended) config = resource["configuration"]["load"] config["sourceUris"] = [self.SOURCE1] + config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH + config["dateFormat"] = self.DATE_FORMAT + config["datetimeFormat"] = self.DATETIME_FORMAT + config["timeZone"] = self.TIME_ZONE + config["timeFormat"] = self.TIME_FORMAT + config["timestampFormat"] = self.TIMESTAMP_FORMAT + config["nullMarkers"] = self.NULL_MARKERS + config["destinationTable"] = { "projectId": self.PROJECT, "datasetId": self.DS_ID, @@ -129,6 +145,10 @@ def _verifyResourceProperties(self, job, resource): self.assertEqual(job.null_marker, config["nullMarker"]) else: self.assertIsNone(job.null_marker) + if "nullMarkers" in config: + self.assertEqual(job.null_markers, config["nullMarkers"]) + else: + self.assertIsNone(job.null_markers) if "quote" in config: self.assertEqual(job.quote_character, config["quote"]) else: @@ -143,7 +163,6 @@ def _verifyResourceProperties(self, job, resource): ) else: self.assertIsNone(job.reference_file_schema_uri) - if "destinationEncryptionConfiguration" in config: self.assertIsNotNone(job.destination_encryption_configuration) self.assertEqual( @@ -152,6 +171,35 @@ def _verifyResourceProperties(self, job, resource): ) else: self.assertIsNone(job.destination_encryption_configuration) + if "dateFormat" in config: + self.assertEqual(job.date_format, config["dateFormat"]) + else: + self.assertIsNone(job.date_format) + if "datetimeFormat" in config: + self.assertEqual(job.datetime_format, config["datetimeFormat"]) + else: + self.assertIsNone(job.datetime_format) + if "timeZone" in config: + self.assertEqual(job.time_zone, config["timeZone"]) + else: + 
self.assertIsNone(job.time_zone) + if "timeFormat" in config: + self.assertEqual(job.time_format, config["timeFormat"]) + else: + self.assertIsNone(job.time_format) + if "timestampFormat" in config: + self.assertEqual(job.timestamp_format, config["timestampFormat"]) + else: + self.assertIsNone(job.timestamp_format) + + if "sourceColumnMatch" in config: + # job.source_column_match will be an Enum, config[...] is a string + self.assertEqual( + job.source_column_match.value, + config["sourceColumnMatch"], + ) + else: + self.assertIsNone(job.source_column_match) def test_ctor(self): client = _make_client(project=self.PROJECT) @@ -181,6 +229,7 @@ def test_ctor(self): self.assertIsNone(job.ignore_unknown_values) self.assertIsNone(job.max_bad_records) self.assertIsNone(job.null_marker) + self.assertIsNone(job.null_markers) self.assertIsNone(job.quote_character) self.assertIsNone(job.skip_leading_rows) self.assertIsNone(job.source_format) @@ -194,6 +243,12 @@ def test_ctor(self): self.assertIsNone(job.clustering_fields) self.assertIsNone(job.schema_update_options) self.assertIsNone(job.reference_file_schema_uri) + self.assertIsNone(job.source_column_match) + self.assertIsNone(job.date_format) + self.assertIsNone(job.datetime_format) + self.assertIsNone(job.time_zone) + self.assertIsNone(job.time_format) + self.assertIsNone(job.timestamp_format) def test_ctor_w_config(self): from google.cloud.bigquery.schema import SchemaField @@ -431,6 +486,24 @@ def test_from_api_repr_w_properties(self): self.assertIs(job._client, client) self._verifyResourceProperties(job, RESOURCE) + def test_to_api_repr(self): + self._setUpConstants() + client = _make_client(project=self.PROJECT) + RESOURCE = self._make_resource(ended=False) + + klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client) + api_repr = job.to_api_repr() + + # as per the documentation in load.py -> LoadJob.to_api_repr(), + # the return value from to_api_repr should not include statistics + expected = { + "jobReference": RESOURCE["jobReference"], + "configuration": RESOURCE["configuration"], + } + + self.assertEqual(api_repr, expected) + def test_begin_w_already_running(self): conn = make_connection() client = _make_client(project=self.PROJECT, connection=conn) @@ -571,7 +644,14 @@ def test_begin_w_alternate_client(self): ] }, "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION], + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, + "dateFormat": self.DATE_FORMAT, + "datetimeFormat": self.DATETIME_FORMAT, + "timeZone": self.TIME_ZONE, + "timeFormat": self.TIME_FORMAT, + "timestampFormat": self.TIMESTAMP_FORMAT, } + RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION conn1 = make_connection() client1 = _make_client(project=self.PROJECT, connection=conn1) @@ -599,6 +679,13 @@ def test_begin_w_alternate_client(self): config.write_disposition = WriteDisposition.WRITE_TRUNCATE config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION] config.reference_file_schema_uri = "gs://path/to/reference" + config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH) + config.date_format = self.DATE_FORMAT + config.datetime_format = self.DATETIME_FORMAT + config.time_zone = self.TIME_ZONE + config.time_format = self.TIME_FORMAT + config.timestamp_format = self.TIMESTAMP_FORMAT + with mock.patch( "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" ) as final_attributes: diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py index 3a681c476..27d3cead1 100644 
--- a/tests/unit/job/test_load_config.py +++ b/tests/unit/job/test_load_config.py @@ -469,6 +469,22 @@ def test_null_marker_setter(self): config.null_marker = null_marker self.assertEqual(config._properties["load"]["nullMarker"], null_marker) + def test_null_markers_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.null_markers) + + def test_null_markers_hit(self): + null_markers = ["", "NA"] + config = self._get_target_class()() + config._properties["load"]["nullMarkers"] = null_markers + self.assertEqual(config.null_markers, null_markers) + + def test_null_markers_setter(self): + null_markers = ["", "NA"] + config = self._get_target_class()() + config.null_markers = null_markers + self.assertEqual(config._properties["load"]["nullMarkers"], null_markers) + def test_preserve_ascii_control_characters_missing(self): config = self._get_target_class()() self.assertIsNone(config.preserve_ascii_control_characters) @@ -828,6 +844,120 @@ def test_write_disposition_setter(self): config._properties["load"]["writeDisposition"], write_disposition ) + def test_source_column_match_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.source_column_match) + + def test_source_column_match_hit(self): + from google.cloud.bigquery.enums import SourceColumnMatch + + option_enum = SourceColumnMatch.NAME + config = self._get_target_class()() + # Assume API stores the string value of the enum + config._properties["load"]["sourceColumnMatch"] = option_enum.value + self.assertEqual(config.source_column_match, option_enum) + + def test_source_column_match_setter(self): + from google.cloud.bigquery.enums import SourceColumnMatch + + option_enum = SourceColumnMatch.POSITION + config = self._get_target_class()() + config.source_column_match = option_enum + # Assert that the string value of the enum is stored + self.assertEqual( + config._properties["load"]["sourceColumnMatch"], option_enum.value + ) + option_str = "NAME" + config.source_column_match = option_str + self.assertEqual(config._properties["load"]["sourceColumnMatch"], option_str) + + def test_source_column_match_setter_invalid_type(self): + config = self._get_target_class()() + with self.assertRaises(TypeError): + config.source_column_match = 3.14 + + def test_date_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.date_format) + + def test_date_format_hit(self): + date_format = "%Y-%m-%d" + config = self._get_target_class()() + config._properties["load"]["dateFormat"] = date_format + self.assertEqual(config.date_format, date_format) + + def test_date_format_setter(self): + date_format = "YYYY/MM/DD" + config = self._get_target_class()() + config.date_format = date_format + self.assertEqual(config._properties["load"]["dateFormat"], date_format) + + def test_datetime_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.datetime_format) + + def test_datetime_format_hit(self): + datetime_format = "%Y-%m-%dT%H:%M:%S" + config = self._get_target_class()() + config._properties["load"]["datetimeFormat"] = datetime_format + self.assertEqual(config.datetime_format, datetime_format) + + def test_datetime_format_setter(self): + datetime_format = "YYYY/MM/DD HH24:MI:SS" + config = self._get_target_class()() + config.datetime_format = datetime_format + self.assertEqual(config._properties["load"]["datetimeFormat"], datetime_format) + + def test_time_zone_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.time_zone) + + 
def test_time_zone_hit(self): + time_zone = "UTC" + config = self._get_target_class()() + config._properties["load"]["timeZone"] = time_zone + self.assertEqual(config.time_zone, time_zone) + + def test_time_zone_setter(self): + time_zone = "America/New_York" + config = self._get_target_class()() + config.time_zone = time_zone + self.assertEqual(config._properties["load"]["timeZone"], time_zone) + + def test_time_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.time_format) + + def test_time_format_hit(self): + time_format = "%H:%M:%S" + config = self._get_target_class()() + config._properties["load"]["timeFormat"] = time_format + self.assertEqual(config.time_format, time_format) + + def test_time_format_setter(self): + time_format = "HH24:MI:SS" + config = self._get_target_class()() + config.time_format = time_format + self.assertEqual(config._properties["load"]["timeFormat"], time_format) + + def test_timestamp_format_missing(self): + config = self._get_target_class()() + self.assertIsNone(config.timestamp_format) + + def test_timestamp_format_hit(self): + timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ" + config = self._get_target_class()() + config._properties["load"]["timestampFormat"] = timestamp_format + self.assertEqual(config.timestamp_format, timestamp_format) + + def test_timestamp_format_setter(self): + timestamp_format = "YYYY/MM/DD HH24:MI:SS.FF6 TZR" + config = self._get_target_class()() + config.timestamp_format = timestamp_format + self.assertEqual( + config._properties["load"]["timestampFormat"], timestamp_format + ) + def test_parquet_options_missing(self): config = self._get_target_class()() self.assertIsNone(config.parquet_options) @@ -901,3 +1031,117 @@ def test_column_name_character_map_none(self): config._properties["load"]["columnNameCharacterMap"], ColumnNameCharacterMap.COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED, ) + + RESOURCE = { + "load": { + "allowJaggedRows": True, + "createDisposition": "CREATE_NEVER", + "encoding": "UTF-8", + "fieldDelimiter": ",", + "ignoreUnknownValues": True, + "maxBadRecords": 10, + "nullMarker": "\\N", + "quote": '"', + "schema": { + "fields": [ + {"name": "name", "type": "STRING", "mode": "NULLABLE"}, + {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, + ] + }, + "skipLeadingRows": "1", + "sourceFormat": "CSV", + "timePartitioning": { + "type": "DAY", + "field": "transaction_date", + }, + "useAvroLogicalTypes": True, + "writeDisposition": "WRITE_TRUNCATE", + "dateFormat": "%Y-%m-%d", + "timeZone": "America/New_York", + "parquetOptions": {"enableListInference": True}, + "columnNameCharacterMap": "V2", + "someNewField": "some-value", + } + } + + def test_from_api_repr(self): + from google.cloud.bigquery.job import ( + CreateDisposition, + LoadJobConfig, + SourceFormat, + WriteDisposition, + ) + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType + + from google.cloud.bigquery.job.load import ColumnNameCharacterMap + + config = LoadJobConfig.from_api_repr(self.RESOURCE) + + self.assertTrue(config.allow_jagged_rows) + self.assertEqual(config.create_disposition, CreateDisposition.CREATE_NEVER) + self.assertEqual(config.encoding, "UTF-8") + self.assertEqual(config.field_delimiter, ",") + self.assertTrue(config.ignore_unknown_values) + self.assertEqual(config.max_bad_records, 10) + self.assertEqual(config.null_marker, "\\N") + self.assertEqual(config.quote_character, '"') + self.assertEqual( + config.schema, + 
[SchemaField("name", "STRING"), SchemaField("age", "INTEGER")], + ) + self.assertEqual(config.skip_leading_rows, 1) + self.assertEqual(config.source_format, SourceFormat.CSV) + self.assertEqual( + config.time_partitioning, + TimePartitioning(type_=TimePartitioningType.DAY, field="transaction_date"), + ) + self.assertTrue(config.use_avro_logical_types) + self.assertEqual(config.write_disposition, WriteDisposition.WRITE_TRUNCATE) + self.assertEqual(config.date_format, "%Y-%m-%d") + self.assertEqual(config.time_zone, "America/New_York") + self.assertTrue(config.parquet_options.enable_list_inference) + self.assertEqual(config.column_name_character_map, ColumnNameCharacterMap.V2) + self.assertEqual(config._properties["load"]["someNewField"], "some-value") + + def test_to_api_repr(self): + from google.cloud.bigquery.job import ( + CreateDisposition, + LoadJobConfig, + SourceFormat, + WriteDisposition, + ) + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType + from google.cloud.bigquery.format_options import ParquetOptions + from google.cloud.bigquery.job.load import ColumnNameCharacterMap + + config = LoadJobConfig() + config.allow_jagged_rows = True + config.create_disposition = CreateDisposition.CREATE_NEVER + config.encoding = "UTF-8" + config.field_delimiter = "," + config.ignore_unknown_values = True + config.max_bad_records = 10 + config.null_marker = r"\N" + config.quote_character = '"' + config.schema = [SchemaField("name", "STRING"), SchemaField("age", "INTEGER")] + config.skip_leading_rows = 1 + config.source_format = SourceFormat.CSV + config.time_partitioning = TimePartitioning( + type_=TimePartitioningType.DAY, field="transaction_date" + ) + config.use_avro_logical_types = True + config.write_disposition = WriteDisposition.WRITE_TRUNCATE + config.date_format = "%Y-%m-%d" + config.time_zone = "America/New_York" + parquet_options = ParquetOptions() + parquet_options.enable_list_inference = True + config.parquet_options = parquet_options + config.column_name_character_map = ColumnNameCharacterMap.V2 + config._properties["load"]["someNewField"] = "some-value" + + api_repr = config.to_api_repr() + + expected = self.RESOURCE + self.assertEqual(api_repr, expected) diff --git a/tests/unit/job/test_query.py b/tests/unit/job/test_query.py index 1df65279d..7201adb55 100644 --- a/tests/unit/job/test_query.py +++ b/tests/unit/job/test_query.py @@ -888,6 +888,7 @@ def test_result_reloads_job_state_until_done(self): job_resource = self._make_resource(started=True, location="EU") job_resource_done = self._make_resource(started=True, ended=True, location="EU") job_resource_done["statistics"]["query"]["totalBytesProcessed"] = str(1234) + job_resource_done["statistics"]["query"]["totalSlotMs"] = str(5678) job_resource_done["configuration"]["query"]["destinationTable"] = { "projectId": "dest-project", "datasetId": "dest_dataset", @@ -969,6 +970,7 @@ def test_result_reloads_job_state_until_done(self): self.assertEqual(result.total_rows, 1) self.assertEqual(result.query, job.query) self.assertEqual(result.total_bytes_processed, 1234) + self.assertEqual(result.slot_millis, 5678) query_results_path = f"/projects/{self.PROJECT}/queries/{self.JOB_ID}" query_results_call = mock.call( @@ -1682,6 +1684,78 @@ def test_result_with_start_index(self): tabledata_list_request[1]["query_params"]["maxResults"], page_size ) + def test_result_with_start_index_multi_page(self): + # When there are multiple pages of response and the user has 
set + # start_index, we should supply start_index to the server in the first + # request. However, in the subsequent requests, we will pass only + # page_token but not start_index, because the server only allows one + # of them. + from google.cloud.bigquery.table import RowIterator + + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "7", + } + + # Although the result has 7 rows, the response only returns 6, because + # start_index is 1. + tabledata_resource_1 = { + "totalRows": "7", + "pageToken": "page_token_1", + "rows": [ + {"f": [{"v": "abc"}]}, + {"f": [{"v": "def"}]}, + {"f": [{"v": "ghi"}]}, + ], + } + tabledata_resource_2 = { + "totalRows": "7", + "pageToken": None, + "rows": [ + {"f": [{"v": "jkl"}]}, + {"f": [{"v": "mno"}]}, + {"f": [{"v": "pqe"}]}, + ], + } + + connection = make_connection( + query_resource, tabledata_resource_1, tabledata_resource_2 + ) + client = _make_client(self.PROJECT, connection=connection) + resource = self._make_resource(ended=True) + job = self._get_target_class().from_api_repr(resource, client) + + start_index = 1 + page_size = 3 + + result = job.result(page_size=page_size, start_index=start_index) + + self.assertIsInstance(result, RowIterator) + self.assertEqual(result.total_rows, 7) + + rows = list(result) + + self.assertEqual(len(rows), 6) + self.assertEqual(len(connection.api_request.call_args_list), 3) + + # First call has both startIndex and maxResults. + tabledata_list_request_1 = connection.api_request.call_args_list[1] + self.assertEqual( + tabledata_list_request_1[1]["query_params"]["startIndex"], start_index + ) + self.assertEqual( + tabledata_list_request_1[1]["query_params"]["maxResults"], page_size + ) + + # Second call only has maxResults. + tabledata_list_request_2 = connection.api_request.call_args_list[2] + self.assertFalse("startIndex" in tabledata_list_request_2[1]["query_params"]) + self.assertEqual( + tabledata_list_request_2[1]["query_params"]["maxResults"], page_size + ) + def test_result_error(self): from google.cloud import exceptions diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index d82f0dfe3..a6c59b158 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -647,12 +647,6 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression(): ) -# TODO: The test needs work to account for pandas 2.0+. See Issue: #2132 -# pragma added due to issues with coverage. 
-@pytest.mark.skipif( - pandas.__version__.startswith("2."), - reason="pandas 2.0 changes some default dtypes and we haven't update the test to account for those", -) @pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") def test_to_dataframe_column_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -704,7 +698,6 @@ def test_to_dataframe_column_dtypes(): exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] assert list(df) == exp_columns # verify the column names - assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" assert df.seconds.dtype.name == "Int64" assert df.miles.dtype.name == "float64" assert df.km.dtype.name == "float16" @@ -712,6 +705,11 @@ def test_to_dataframe_column_dtypes(): assert df.complete.dtype.name == "boolean" assert df.date.dtype.name == "dbdate" + if pandas.__version__.startswith("2."): + assert df.start_timestamp.dtype.name == "datetime64[us, UTC]" + else: + assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" + def test_to_dataframe_column_date_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 8ce8d2cbd..bb86ccc3c 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -60,7 +60,8 @@ from google.cloud.bigquery import job as bqjob import google.cloud.bigquery._job_helpers -from google.cloud.bigquery.dataset import DatasetReference +from google.cloud.bigquery.dataset import DatasetReference, Dataset +from google.cloud.bigquery.enums import UpdateMode, DatasetView from google.cloud.bigquery import exceptions from google.cloud.bigquery import ParquetOptions import google.cloud.bigquery.retry @@ -752,7 +753,7 @@ def test_get_dataset(self): final_attributes.assert_called_once_with({"path": "/%s" % path}, client, None) conn.api_request.assert_called_once_with( - method="GET", path="/%s" % path, timeout=7.5 + method="GET", path="/%s" % path, timeout=7.5, query_params={} ) self.assertEqual(dataset.dataset_id, self.DS_ID) @@ -818,6 +819,72 @@ def test_get_dataset(self): self.assertEqual(dataset.dataset_id, self.DS_ID) + def test_get_dataset_with_dataset_view(self): + path = "projects/%s/datasets/%s" % (self.PROJECT, self.DS_ID) + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + resource = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + } + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + + test_cases = [ + (None, None), + (DatasetView.DATASET_VIEW_UNSPECIFIED, "DATASET_VIEW_UNSPECIFIED"), + (DatasetView.METADATA, "METADATA"), + (DatasetView.ACL, "ACL"), + (DatasetView.FULL, "FULL"), + ] + + for dataset_view_arg, expected_param_value in test_cases: + with self.subTest( + dataset_view_arg=dataset_view_arg, + expected_param_value=expected_param_value, + ): + # Re-initialize the connection mock for each sub-test to reset side_effect + conn = client._connection = make_connection(resource) + + dataset = client.get_dataset(dataset_ref, dataset_view=dataset_view_arg) + + self.assertEqual(dataset.dataset_id, self.DS_ID) + + if expected_param_value: + expected_query_params = {"datasetView": expected_param_value} + else: + expected_query_params = {} + + conn.api_request.assert_called_once_with( + method="GET", + path="/%s" % path, + timeout=DEFAULT_TIMEOUT, + query_params=expected_query_params if expected_query_params else {}, + ) 
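
The subtests above pin down the wire format for each `DatasetView` value. As a quick orientation, here is a minimal usage sketch of the new `dataset_view` argument; the project and dataset IDs are illustrative placeholders, not values taken from this diff:

```python
from google.cloud import bigquery
from google.cloud.bigquery.enums import DatasetView

client = bigquery.Client()  # assumes application-default credentials

# Request only basic metadata; the ACL is neither requested nor populated.
dataset = client.get_dataset(
    "my-project.my_dataset",  # placeholder dataset ID
    dataset_view=DatasetView.METADATA,
)
print(dataset.description)
```

Passing a value that is not a `DatasetView` member raises `AttributeError`, as the next test asserts.
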
+ + def test_get_dataset_with_invalid_dataset_view(self): + invalid_view_values = [ + "INVALID_STRING", + 123, + 123.45, + object(), + ] + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + resource = { + "id": "%s:%s" % (self.PROJECT, self.DS_ID), + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + } + conn = client._connection = make_connection(resource) + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + + for invalid_view_value in invalid_view_values: + with self.subTest(invalid_view_value=invalid_view_value): + conn.api_request.reset_mock() # Reset mock for each sub-test + with self.assertRaises(AttributeError): + client.get_dataset(dataset_ref, dataset_view=invalid_view_value) + def test_ensure_bqstorage_client_creating_new_instance(self): bigquery_storage = pytest.importorskip("google.cloud.bigquery_storage") @@ -2101,6 +2168,7 @@ def test_update_dataset(self): }, path="/" + PATH, timeout=7.5, + query_params={}, ) self.assertEqual(ds2.description, ds.description) self.assertEqual(ds2.friendly_name, ds.friendly_name) @@ -2114,6 +2182,94 @@ def test_update_dataset(self): client.update_dataset(ds, []) req = conn.api_request.call_args self.assertEqual(req[1]["headers"]["If-Match"], "etag") + self.assertEqual(req[1].get("query_params"), {}) + + def test_update_dataset_w_update_mode(self): + PATH = f"projects/{self.PROJECT}/datasets/{self.DS_ID}" + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + + DESCRIPTION = "DESCRIPTION" + RESOURCE = { + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + "etag": "etag", + "description": DESCRIPTION, + } + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + orig_dataset = Dataset(dataset_ref) + orig_dataset.description = DESCRIPTION + filter_fields = ["description"] + + test_cases = [ + (None, None), + (UpdateMode.UPDATE_MODE_UNSPECIFIED, "UPDATE_MODE_UNSPECIFIED"), + (UpdateMode.UPDATE_METADATA, "UPDATE_METADATA"), + (UpdateMode.UPDATE_ACL, "UPDATE_ACL"), + (UpdateMode.UPDATE_FULL, "UPDATE_FULL"), + ] + + for update_mode_arg, expected_param_value in test_cases: + with self.subTest( + update_mode_arg=update_mode_arg, + expected_param_value=expected_param_value, + ): + conn = client._connection = make_connection(RESOURCE, RESOURCE) + + new_dataset = client.update_dataset( + orig_dataset, + fields=filter_fields, + update_mode=update_mode_arg, + ) + self.assertEqual(orig_dataset.description, new_dataset.description) + + if expected_param_value: + expected_query_params = {"updateMode": expected_param_value} + else: + expected_query_params = {} + + conn.api_request.assert_called_once_with( + method="PATCH", + path="/" + PATH, + data={"description": DESCRIPTION}, + timeout=DEFAULT_TIMEOUT, + query_params=expected_query_params if expected_query_params else {}, + ) + + def test_update_dataset_w_invalid_update_mode(self): + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + + DESCRIPTION = "DESCRIPTION" + resource = { + "datasetReference": {"projectId": self.PROJECT, "datasetId": self.DS_ID}, + "etag": "etag", + } + + dataset_ref = DatasetReference(self.PROJECT, self.DS_ID) + orig_dataset = Dataset(dataset_ref) + orig_dataset.description = DESCRIPTION + filter_fields = ["description"] # A non-empty list of fields is required + + # Mock the connection to prevent actual API calls + # and to provide a minimal valid response if the 
call were to proceed. + conn = client._connection = make_connection(resource) + + test_cases = [ + "INVALID_STRING", + 123, + 123.45, + object(), + ] + + for invalid_update_mode in test_cases: + with self.subTest(invalid_update_mode=invalid_update_mode): + conn.api_request.reset_mock() # Reset mock for each sub-test + with self.assertRaises(AttributeError): + client.update_dataset( + orig_dataset, + fields=filter_fields, + update_mode=invalid_update_mode, + ) def test_update_dataset_w_custom_property(self): # The library should handle sending properties to the API that are not @@ -2145,6 +2301,7 @@ def test_update_dataset_w_custom_property(self): data={"newAlphaProperty": "unreleased property"}, path=path, timeout=DEFAULT_TIMEOUT, + query_params={}, ) self.assertEqual(dataset.dataset_id, self.DS_ID) @@ -5561,6 +5718,7 @@ def test_query_and_wait_defaults(self): "rows": [{"f": [{"v": "5552452"}]}], "queryId": "job_abcDEF_", "totalBytesProcessed": 1234, + "totalSlotMs": 5678, } creds = _make_credentials() http = object() @@ -5578,6 +5736,7 @@ def test_query_and_wait_defaults(self): self.assertIsNone(rows.location) self.assertEqual(rows.query, query) self.assertEqual(rows.total_bytes_processed, 1234) + self.assertEqual(rows.slot_millis, 5678) # Verify the request we send is to jobs.query. conn.api_request.assert_called_once() diff --git a/tests/unit/test_create_dataset.py b/tests/unit/test_create_dataset.py index bd7c6a8f8..b144471ca 100644 --- a/tests/unit/test_create_dataset.py +++ b/tests/unit/test_create_dataset.py @@ -372,7 +372,12 @@ def test_create_dataset_alreadyexists_w_exists_ok_true(PROJECT, DS_ID, LOCATION) }, timeout=DEFAULT_TIMEOUT, ), - mock.call(method="GET", path=get_path, timeout=DEFAULT_TIMEOUT), + mock.call( + method="GET", + path=get_path, + timeout=DEFAULT_TIMEOUT, + query_params={}, + ), ] ) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 3fd2579af..604e5ed2e 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1767,3 +1767,70 @@ def test__hash__with_minimal_inputs(self): description=None, ) assert hash(cond1) is not None + + def test_access_entry_view_equality(self): + from google.cloud import bigquery + + entry1 = bigquery.dataset.AccessEntry( + entity_type="view", + entity_id={ + "projectId": "my_project", + "datasetId": "my_dataset", + "tableId": "my_table", + }, + ) + entry2 = bigquery.dataset.AccessEntry.from_api_repr( + { + "view": { + "projectId": "my_project", + "datasetId": "my_dataset", + "tableId": "my_table", + } + } + ) + + entry3 = bigquery.dataset.AccessEntry( + entity_type="routine", + entity_id={ + "projectId": "my_project", + "datasetId": "my_dataset", + "routineId": "my_routine", + }, + ) + + entry4 = bigquery.dataset.AccessEntry.from_api_repr( + { + "routine": { + "projectId": "my_project", + "datasetId": "my_dataset", + "routineId": "my_routine", + } + } + ) + + entry5 = bigquery.dataset.AccessEntry( + entity_type="dataset", + entity_id={ + "dataset": { + "projectId": "my_project", + "datasetId": "my_dataset", + }, + "target_types": "VIEWS", + }, + ) + + entry6 = bigquery.dataset.AccessEntry.from_api_repr( + { + "dataset": { + "dataset": { + "projectId": "my_project", + "datasetId": "my_dataset", + }, + "target_types": "VIEWS", + } + } + ) + + assert entry1 == entry2 + assert entry3 == entry4 + assert entry5 == entry6 diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py index 7f84a9f5b..ea827a560 100644 --- a/tests/unit/test_external_config.py +++ 
b/tests/unit/test_external_config.py @@ -19,12 +19,19 @@ from google.cloud.bigquery import external_config from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import SourceColumnMatch import pytest class TestExternalConfig(unittest.TestCase): SOURCE_URIS = ["gs://foo", "gs://bar"] + SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME + DATE_FORMAT = "MM/DD/YYYY" + DATETIME_FORMAT = "MM/DD/YYYY HH24:MI:SS" + TIME_ZONE = "America/Los_Angeles" + TIME_FORMAT = "HH24:MI:SS" + TIMESTAMP_FORMAT = "MM/DD/YYYY HH24:MI:SS.FF6 TZR" BASE_RESOURCE = { "sourceFormat": "", @@ -33,6 +40,11 @@ class TestExternalConfig(unittest.TestCase): "autodetect": True, "ignoreUnknownValues": False, "compression": "compression", + "dateFormat": DATE_FORMAT, + "datetimeFormat": DATETIME_FORMAT, + "timeZone": TIME_ZONE, + "timeFormat": TIME_FORMAT, + "timestampFormat": TIMESTAMP_FORMAT, } def test_from_api_repr_base(self): @@ -79,6 +91,12 @@ def test_to_api_repr_base(self): ec.connection_id = "path/to/connection" ec.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")] + ec.date_format = self.DATE_FORMAT + ec.datetime_format = self.DATETIME_FORMAT + ec.time_zone = self.TIME_ZONE + ec.time_format = self.TIME_FORMAT + ec.timestamp_format = self.TIMESTAMP_FORMAT + exp_schema = { "fields": [{"name": "full_name", "type": "STRING", "mode": "REQUIRED"}] } @@ -92,6 +110,11 @@ def test_to_api_repr_base(self): "compression": "compression", "connectionId": "path/to/connection", "schema": exp_schema, + "dateFormat": self.DATE_FORMAT, + "datetimeFormat": self.DATETIME_FORMAT, + "timeZone": self.TIME_ZONE, + "timeFormat": self.TIME_FORMAT, + "timestampFormat": self.TIMESTAMP_FORMAT, } self.assertEqual(got_resource, exp_resource) @@ -127,6 +150,11 @@ def _verify_base(self, ec): self.assertEqual(ec.ignore_unknown_values, False) self.assertEqual(ec.max_bad_records, 17) self.assertEqual(ec.source_uris, self.SOURCE_URIS) + self.assertEqual(ec.date_format, self.DATE_FORMAT) + self.assertEqual(ec.datetime_format, self.DATETIME_FORMAT) + self.assertEqual(ec.time_zone, self.TIME_ZONE) + self.assertEqual(ec.time_format, self.TIME_FORMAT) + self.assertEqual(ec.timestamp_format, self.TIMESTAMP_FORMAT) def test_to_api_repr_source_format(self): ec = external_config.ExternalConfig("CSV") @@ -251,6 +279,8 @@ def test_from_api_repr_csv(self): "allowJaggedRows": False, "encoding": "encoding", "preserveAsciiControlCharacters": False, + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, + "nullMarkers": ["", "NA"], }, }, ) @@ -267,6 +297,11 @@ def test_from_api_repr_csv(self): self.assertEqual(ec.options.allow_jagged_rows, False) self.assertEqual(ec.options.encoding, "encoding") self.assertEqual(ec.options.preserve_ascii_control_characters, False) + self.assertEqual( + ec.options.source_column_match, + self.SOURCE_COLUMN_MATCH, + ) + self.assertEqual(ec.options.null_markers, ["", "NA"]) got_resource = ec.to_api_repr() @@ -288,6 +323,9 @@ def test_to_api_repr_csv(self): options.skip_leading_rows = 123 options.allow_jagged_rows = False options.preserve_ascii_control_characters = False + options.source_column_match = self.SOURCE_COLUMN_MATCH + options.null_markers = ["", "NA"] + ec.csv_options = options exp_resource = { @@ -300,6 +338,8 @@ def test_to_api_repr_csv(self): "allowJaggedRows": False, "encoding": "encoding", "preserveAsciiControlCharacters": False, + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, + "nullMarkers": ["", "NA"], }, } @@ -851,7 +891,9 @@ def test_to_api_repr(self): ) -class CSVOptions(unittest.TestCase): 
+class TestCSVOptions(unittest.TestCase): + SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME + def test_to_api_repr(self): options = external_config.CSVOptions() options.field_delimiter = "\t" @@ -861,6 +903,7 @@ def test_to_api_repr(self): options.allow_jagged_rows = False options.encoding = "UTF-8" options.preserve_ascii_control_characters = False + options.source_column_match = self.SOURCE_COLUMN_MATCH resource = options.to_api_repr() @@ -874,9 +917,37 @@ def test_to_api_repr(self): "allowJaggedRows": False, "encoding": "UTF-8", "preserveAsciiControlCharacters": False, + "sourceColumnMatch": self.SOURCE_COLUMN_MATCH, }, ) + def test_source_column_match_None(self): + ec = external_config.CSVOptions() + ec.source_column_match = None + expected = None + result = ec.source_column_match + self.assertEqual(expected, result) + + def test_source_column_match_valid_input(self): + ec = external_config.CSVOptions() + ec.source_column_match = SourceColumnMatch.NAME + expected = "NAME" + result = ec.source_column_match + self.assertEqual(expected, result) + + ec.source_column_match = "POSITION" + expected = "POSITION" + result = ec.source_column_match + self.assertEqual(expected, result) + + def test_source_column_match_invalid_input(self): + ec = external_config.CSVOptions() + with self.assertRaisesRegex( + TypeError, + "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None", + ): + ec.source_column_match = 3.14 + class TestGoogleSheetsOptions(unittest.TestCase): def test_to_api_repr(self): diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index 814150693..c79e923f8 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -986,6 +986,7 @@ def test_bigquery_magic_dryrun_option_sets_job_config(monkeypatch): google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) @@ -1007,6 +1008,7 @@ def test_bigquery_magic_dryrun_option_returns_query_job(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1035,6 +1037,7 @@ def test_bigquery_magic_dryrun_option_variable_error_message( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" ipython_ns_cleanup.append((ip, "q_job")) run_query_patch = mock.patch( @@ -1064,6 +1067,7 @@ def test_bigquery_magic_dryrun_option_saves_query_job_to_variable( magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1098,6 +1102,7 @@ def test_bigquery_magic_saves_query_job_to_variable_on_error( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "project-from-context" ipython_ns_cleanup.append((ip, "result")) client_query_patch = mock.patch( diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 0d967bdb8..2b704d3c9 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -2000,6 +2000,22 @@ def test_total_bytes_processed_present_string(self): query = self._make_one(resource) self.assertEqual(query.total_bytes_processed, 123456) + def test_slot_millis_missing(self): + query = 
self._make_one(self._make_resource()) + self.assertIsNone(query.slot_millis) + + def test_slot_millis_present_integer(self): + resource = self._make_resource() + resource["totalSlotMs"] = 123456 + query = self._make_one(resource) + self.assertEqual(query.slot_millis, 123456) + + def test_slot_millis_present_string(self): + resource = self._make_resource() + resource["totalSlotMs"] = "123456" + query = self._make_one(resource) + self.assertEqual(query.slot_millis, 123456) + def test_num_dml_affected_rows_missing(self): query = self._make_one(self._make_resource()) self.assertIsNone(query.num_dml_affected_rows) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 43d64d77d..a4fa3fa39 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -34,12 +34,6 @@ def class_under_test(): return RowIterator -# TODO: The test needs work to account for pandas 2.0+. See Issue: #2132 -# pragma added due to issues with coverage. -@pytest.mark.skipif( - pandas.__version__.startswith("2."), - reason="pandas 2.0 changes some default dtypes and we haven't update the test to account for those", -) def test_to_dataframe_nullable_scalars( monkeypatch, class_under_test ): # pragma: NO COVER @@ -113,14 +107,18 @@ def test_to_dataframe_nullable_scalars( assert df.dtypes["bool_col"].name == "boolean" assert df.dtypes["bytes_col"].name == "object" assert df.dtypes["date_col"].name == "dbdate" - assert df.dtypes["datetime_col"].name == "datetime64[ns]" assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" assert df.dtypes["numeric_col"].name == "object" assert df.dtypes["string_col"].name == "object" assert df.dtypes["time_col"].name == "dbtime" - assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" assert df.dtypes["json_col"].name == "object" + if pandas.__version__.startswith("2."): + assert df.dtypes["datetime_col"].name == "datetime64[us]" + assert df.dtypes["timestamp_col"].name == "datetime64[us, UTC]" + else: + assert df.dtypes["datetime_col"].name == "datetime64[ns]" + assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" # Check for expected values. assert df["bignumeric_col"][0] == decimal.Decimal("123.456789101112131415")
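
Taken together, the `totalSlotMs` plumbing above surfaces as a `slot_millis` property on query results. A hedged sketch of how a caller might read it (the query text is illustrative; both statistics can be `None` when the backend omits them):

```python
from google.cloud import bigquery

client = bigquery.Client()  # assumes application-default credentials

rows = client.query_and_wait("SELECT 1 AS n")  # returns a RowIterator

print(rows.total_bytes_processed)  # bytes scanned, e.g. 1234 in the test above
print(rows.slot_millis)            # slot-milliseconds consumed, new in this release
```

The pandas assertions above, meanwhile, encode that pandas 2.x preserves microsecond precision for DATETIME and TIMESTAMP columns (`datetime64[us]` / `datetime64[us, UTC]`), where pandas 1.x coerced to nanoseconds; the explicit version branch replaces the old blanket `skipif` on pandas 2.
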