# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import datetime
from datetime import timezone

import numpy as np
import pyarrow as pa
import pyarrow.csv

# used to write parquet files
import pyarrow.parquet as pq


def data() -> pa.Array:
    """Return 100 seeded floats: 50 drawn around 0 followed by 50 around 50.

    Both halves use a standard deviation of 0.01 and a generator seeded
    with 1, so the result is reproducible across calls.
    """
    rng = np.random.default_rng(1)
    low_cluster = rng.normal(0, 0.01, size=50)
    high_cluster = rng.normal(50, 0.01, size=50)
    values = np.concatenate([low_cluster, high_cluster])
    return pa.array(values)


def data_with_nans() -> np.ndarray:
    """Return 50 seeded floats around 0 with a random subset set to NaN.

    The values are drawn first (generator seeded with 0), then a 0/1 mask
    is drawn and every position whose mask entry is 0 becomes ``np.nan``.
    """
    rng = np.random.default_rng(0)
    values = rng.normal(0, 0.01, size=50)
    # The mask must be discrete: comparing a continuous normal draw with 0
    # (as the previous implementation did) practically never matches, which
    # left the result without a single NaN.
    mask = rng.integers(0, 2, size=50)
    values[mask == 0] = np.nan
    return values


def data_datetime(f: str) -> pa.Array:
    """Return three UTC timestamps (now, yesterday, tomorrow) as an Arrow array.

    Args:
        f: Unit string handed to ``pa.timestamp`` to build the array type.

    Returns:
        A timestamp array of length 3 whose middle entry is masked out.
    """
    # Capture the reference instant once so the three entries are exactly
    # one day apart instead of drifting between successive now() calls.
    now = datetime.datetime.now(tz=timezone.utc)
    one_day = datetime.timedelta(days=1)
    values = [now, now - one_day, now + one_day]
    # A True mask entry marks that position as null.
    null_mask = np.array([False, True, False])
    return pa.array(values, type=pa.timestamp(f), mask=null_mask)


def data_date32() -> pa.Array:
    """Return three fixed calendar dates as a date32 array.

    The middle entry is masked out, i.e. null in the resulting array.
    """
    values = [
        datetime.date(2000, 1, 1),
        datetime.date(1980, 1, 1),
        datetime.date(2030, 1, 1),
    ]
    # A True mask entry marks that position as null.
    null_mask = np.array([False, True, False])
    return pa.array(values, type=pa.date32(), mask=null_mask)


def data_timedelta(f: str) -> pa.Array:
    """Return three fixed durations (100 days, 1 day, 1 second) as an Arrow array.

    Args:
        f: Unit string handed to ``pa.duration`` to build the array type.

    Returns:
        A duration array of length 3 whose middle entry is masked out.
    """
    values = [
        datetime.timedelta(days=100),
        datetime.timedelta(days=1),
        datetime.timedelta(seconds=1),
    ]
    # A True mask entry marks that position as null.
    null_mask = np.array([False, True, False])
    return pa.array(values, type=pa.duration(f), mask=null_mask)


def data_binary_other() -> np.ndarray:
    """Return the fixed NumPy array ``[1, 0, 0]`` with dtype ``"u4"``."""
    return np.array([1, 0, 0], dtype="u4")


def write_parquet(path, data) -> str:
    """Write *data* as a single-column parquet file and return its location.

    Args:
        path: Destination passed to ``pq.write_table``.
        data: Array stored as the only column, which is named ``"a"``.

    Returns:
        ``str(path)``.
    """
    table = pa.Table.from_arrays([data], names=["a"])
    pq.write_table(table, path)
    return str(path)