forked from lance-format/lance
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbasic.py
More file actions
82 lines (71 loc) · 2.44 KB
/
basic.py
File metadata and controls
82 lines (71 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
# Creates a dataset containing some basic patterns of synthetic data.
#
# Unlike the image EDA dataset (which has similar patterns) this dataset
# is much smaller and easier to run locally.
import lance
import pyarrow as pa
from lance.log import LOGGER
from ci_benchmarks.datasets import get_dataset_uri
# Total number of rows the finished dataset contains.
NUM_ROWS = 10_000_000
# The rows are generated as this many record batches of equal size.
NUM_BATCHES = 100
ROWS_PER_BATCH = NUM_ROWS // NUM_BATCHES

# Arrow schema of the dataset; the field order here is the column order.
SCHEMA = pa.schema(
    [
        ("row_number", pa.uint64()),
        ("row_number_bitmap", pa.uint64()),
        ("integers", pa.int64()),
        ("small_strings", pa.string()),
    ]
)
def _gen_data():
    """Yield the synthetic dataset as a stream of Arrow record batches.

    Produces ``NUM_BATCHES`` batches of ``ROWS_PER_BATCH`` rows each. The
    ``row_number``, ``row_number_bitmap`` and ``integers`` columns all carry
    the global row index, which keeps counting across batches. The
    ``small_strings`` column carries ``"payload_<i>"`` where ``<i>`` is the
    row's position *within its batch*, so the same strings recur in every
    batch.

    Arrays are built with the types declared in ``SCHEMA`` instead of the
    type pyarrow infers from Python ints, so each batch already conforms to
    the schema that is handed to the writer.

    Yields:
        pa.RecordBatch: one batch of ``ROWS_PER_BATCH`` rows matching
        ``SCHEMA``.
    """
    LOGGER.info("Generating %d rows of data", NUM_ROWS)

    # The string column does not depend on the batch index, so build it once
    # and reuse the (immutable) Arrow array in every batch.
    payloads = pa.array(
        [f"payload_{i}" for i in range(ROWS_PER_BATCH)],
        type=SCHEMA.field("small_strings").type,
    )

    for batch_idx in range(NUM_BATCHES):
        first_row = batch_idx * ROWS_PER_BATCH
        # One Python list of global row indices feeds all three integer
        # columns; only the Arrow type differs between them.
        row_ids = list(range(first_row, first_row + ROWS_PER_BATCH))
        yield pa.record_batch(
            [
                pa.array(row_ids, type=SCHEMA.field("row_number").type),
                pa.array(row_ids, type=SCHEMA.field("row_number_bitmap").type),
                pa.array(row_ids, type=SCHEMA.field("integers").type),
                payloads,
            ],
            schema=SCHEMA,
        )
def _create(dataset_uri: str):
    """Ensure the basic dataset exists at ``dataset_uri`` and is indexed.

    Behaviour, in order:

    * If opening the dataset raises ``ValueError`` (which this module treats
      as "no dataset at this URI yet"), the data is generated and written
      with ``mode="create"``.
    * If a dataset exists and already has ``NUM_ROWS`` rows, it is reused.
    * If a dataset exists, is empty and has exactly ``SCHEMA``, the data is
      generated and appended to it.
    * Any other existing dataset is considered foreign and is left alone.

    Finally, when the dataset reports no indices at all, a BTREE scalar index
    is created on ``row_number`` and a BITMAP scalar index on
    ``row_number_bitmap``.

    Args:
        dataset_uri: Location of the dataset to open or create.

    Raises:
        RuntimeError: If a dataset already exists at ``dataset_uri`` but is
            neither complete nor an empty dataset with the expected schema.
    """
    # Keep the try body down to the open call so that a ValueError raised
    # later (while counting rows or appending) is not mistaken for
    # "dataset missing" and answered with a create-mode write.
    try:
        ds = lance.dataset(dataset_uri)
    except ValueError:
        ds = lance.write_dataset(
            _gen_data(),
            dataset_uri,
            schema=SCHEMA,
            mode="create",
        )
    else:
        # Count once; the value is used for logging and for both checks.
        num_rows = ds.count_rows()
        LOGGER.info(
            "Found existing dataset at %s with %d rows", dataset_uri, num_rows
        )
        if num_rows != NUM_ROWS:
            if num_rows != 0 or ds.schema != SCHEMA:
                raise RuntimeError(
                    "Cannot generate basic dataset because a dataset with the URI "
                    f"{dataset_uri} already exists and doesn't appear to be the "
                    "same dataset"
                )
            # Empty dataset with the right schema: fill it in place.
            ds = lance.write_dataset(
                _gen_data(),
                dataset_uri,
                schema=SCHEMA,
                mode="append",
            )

    # Only build indices when none exist, so re-running is cheap.
    if not ds.list_indices():
        ds.create_scalar_index("row_number", "BTREE")
        ds.create_scalar_index("row_number_bitmap", "BITMAP")
def gen_basic():
    """Generate the "basic" benchmark dataset.

    Looks up the URI registered for the dataset named ``"basic"`` via
    ``get_dataset_uri`` and passes it to ``_create``.
    """
    _create(get_dataset_uri("basic"))