forked from lance-format/lance
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutil.py
More file actions
70 lines (60 loc) · 2.09 KB
/
util.py
File metadata and controls
70 lines (60 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
# Utilities shared by datagen.py
#
# Everything here must be runnable by older versions of Lance.
import pyarrow as pa
def build_basic_types():
    """Build a 1000-row table exercising a handful of common Arrow types.

    Columns, in order:

    * ``int`` (int64): the row index.
    * ``float`` (float32): the row index as a float.
    * ``str`` (string): the row index rendered as text.
    * ``list_int`` (list<int64>): row ``i`` holds ``0 .. i-1`` (row 0 is empty).
    * ``list_str`` (list<string>): a single-element list with the row index
      as text.
    * ``struct`` (struct<a: int64>): ``{"a": i}``.
    * ``dict`` (dictionary<int16, string>): ``str(i % 10)``, i.e. ten
      distinct values.
    * ``str_as_dict`` (string): ``"a"`` for the first 500 rows, ``"b"`` for
      the remaining 500.

    Returns:
        The table produced by ``pa.table`` from the arrays and schema above.
    """
    row_ids = range(1000)
    dict_type = pa.dictionary(pa.int16(), pa.string())

    # Keep every field right next to the data that populates it; the schema
    # and the array list are both derived from these pairs below.
    columns = [
        (pa.field("int", pa.int64()), pa.array(row_ids)),
        (pa.field("float", pa.float32()), pa.array(row_ids, pa.float32())),
        (pa.field("str", pa.string()), pa.array([str(i) for i in row_ids])),
        (
            pa.field("list_int", pa.list_(pa.int64())),
            pa.array([list(range(i)) for i in row_ids]),
        ),
        (
            pa.field("list_str", pa.list_(pa.string())),
            pa.array([[str(i)] for i in row_ids]),
        ),
        (
            pa.field("struct", pa.struct([pa.field("a", pa.int64())])),
            pa.array([{"a": i} for i in row_ids]),
        ),
        (
            pa.field("dict", dict_type),
            pa.array([str(i % 10) for i in row_ids], dict_type),
        ),
        (
            # Declared as a plain string column even though it only ever
            # holds two distinct values.
            pa.field("str_as_dict", pa.string()),
            pa.array(["a" if i < 500 else "b" for i in row_ids]),
        ),
    ]

    schema = pa.schema([field for field, _ in columns])
    arrays = [values for _, values in columns]
    return pa.table(arrays, schema=schema)
def build_large():
    """Build a 10,000-row table with two bulky columns.

    Columns, in order:

    * ``int`` (int32): the row index.
    * ``fsl``: 1024 float32 values per row, built as a fixed-size-list array
      over one flat, monotonically increasing float32 array.
    * ``bin`` (binary): one 4 KiB chunk per row, all sliced out of a single
      allocated buffer.

    NOTE(review): the schema declares ``fsl`` as a variable-size
    ``list<float32>`` while the data is a ``FixedSizeListArray``; presumably
    ``pa.table`` converts the column to the schema type -- confirm before
    changing either side.

    Returns:
        The table produced by ``pa.table`` from the arrays and schema above.
    """
    num_rows = 10000
    vector_dim = 1024
    chunk_bytes = 4 * 1024
    total_bin_bytes = num_rows * chunk_bytes

    # ~40MB of vector embedding data (10K rows x 1024 float32 values).
    flat_floats = pa.array(range(num_rows * vector_dim), pa.float32())
    vectors = pa.FixedSizeListArray.from_arrays(flat_floats, vector_dim)

    # ~40 MiB of binary data (10K chunks of 4 KiB each). The buffer is only
    # allocated here, never explicitly filled.
    payload = pa.allocate_buffer(total_bin_bytes)
    # One offset per chunk boundary, including the end of the final chunk,
    # hence the extra ``chunk_bytes`` on the (exclusive) upper bound.
    boundaries = pa.array(
        range(0, total_bin_bytes + chunk_bytes, chunk_bytes), pa.int32()
    )
    # buffers()[0] is the validity bitmap slot; buffers()[1] holds the raw
    # int32 values, which is what the binary array uses as its offsets.
    offsets_buf = boundaries.buffers()[1]
    blobs = pa.BinaryArray.from_buffers(
        pa.binary(), num_rows, [None, offsets_buf, payload]
    )

    schema = pa.schema(
        [
            pa.field("int", pa.int32()),
            pa.field("fsl", pa.list_(pa.float32())),
            pa.field("bin", pa.binary()),
        ]
    )
    row_ids = pa.array(range(num_rows), pa.int32())
    return pa.table([row_ids, vectors, blobs], schema=schema)