forked from lance-format/lance
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_vector.py
More file actions
149 lines (119 loc) · 3.84 KB
/
test_vector.py
File metadata and controls
149 lines (119 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
import lance
import numpy as np
import pyarrow as pa
import pytest
from lance.vector import vec_to_table
def test_dict():
    """Check vec_to_table on a dict mapping ids to vectors.

    Covers three things: the default conversion matches the expected id and
    vector arrays, explicit ``names`` become the column names, and vectors of
    inconsistent length are rejected with ``ValueError``.
    """
    ids, vectors = _create_data()
    dd = dict(zip(ids, vectors))
    tbl = vec_to_table(dd)
    expected = [pa.array(ids), _to_vec(vectors)]
    assert_table(tbl, expected)

    new_tbl = vec_to_table(dd, names=["foo", "bar"])
    assert new_tbl.column_names == ["foo", "bar"]

    # Build the malformed input outside the raises block so that only the
    # vec_to_table call itself can satisfy the expected ValueError.
    bad_ids, bad_vectors = _create_bad_dims()
    bad_dd = dict(zip(bad_ids, bad_vectors))
    with pytest.raises(ValueError):
        vec_to_table(bad_dd)
def test_list():
    """Check vec_to_table on a plain sequence of vectors.

    The converted table must match the expected vector array, and input
    whose vectors differ in length must raise ``ValueError``.
    """
    _, vectors = _create_data()
    tbl = vec_to_table(vectors)
    expected = [_to_vec(vectors)]
    assert_table(tbl, expected)

    # Create the ragged input before entering the raises block so a failure
    # in the helper cannot be mistaken for the error under test.
    _, bad_vectors = _create_bad_dims()
    with pytest.raises(ValueError):
        vec_to_table(bad_vectors)
def test_ndarray():
    """Check vec_to_table on a NumPy ndarray of vectors.

    The converted table must match the expected vector array, and ragged
    input must raise ``ValueError``.
    """
    _, vectors = _create_data()
    tbl = vec_to_table(np.array(vectors))
    expected = [_to_vec(vectors)]
    assert_table(tbl, expected)

    # Create the ragged input before entering the raises block so a failure
    # in the helper cannot be mistaken for the error under test.
    _, bad_vectors = _create_bad_dims()
    # NOTE(review): np.array() is kept inside the raises block, as in the
    # original. Recent NumPy releases may raise ValueError themselves when
    # given ragged input, in which case vec_to_table is never reached here --
    # confirm which call this test is meant to exercise.
    with pytest.raises(ValueError):
        vec_to_table(np.array(bad_vectors))
def assert_table(tbl, expected_arrays, names=None):
    """Compare columns of ``tbl`` against ``expected_arrays``, by position.

    When ``names`` is omitted, a single expected array is matched against the
    ``"vector"`` column; otherwise the columns ``"id"`` and ``"vector"`` are
    used. Each named column is checked with ``assert_array_eq``.
    """
    if names is None:
        names = ["vector"] if len(expected_arrays) == 1 else ["id", "vector"]
    for position, column in enumerate(names):
        assert_array_eq(tbl[column], expected_arrays[position])
def assert_array_eq(left: pa.Array, right: pa.Array):
    """Assert that two Arrow arrays hold matching values.

    Chunked arrays are combined into a single array first. The comparison
    strategy is chosen from ``left``'s type:

    * float32 arrays are compared with an absolute tolerance of 1e-6;
    * fixed-size-list arrays are compared through their child ``values``;
    * every other type must match exactly.

    Parameters
    ----------
    left, right : pa.Array or pa.ChunkedArray
        The arrays to compare.

    Raises
    ------
    AssertionError
        If the arrays differ in length or in any element.
    """
    if isinstance(left, pa.ChunkedArray):
        left = left.combine_chunks()
    if isinstance(right, pa.ChunkedArray):
        right = right.combine_chunks()

    # Guard against NumPy broadcasting hiding a length mismatch.
    assert len(left) == len(right)

    if pa.types.is_float32(left.type):
        diff = np.abs(
            left.to_numpy(zero_copy_only=False)
            - right.to_numpy(zero_copy_only=False)
        )
        assert np.all(diff < 1e-6)
    # ``elif`` (not a second ``if``): float32 arrays must not fall through to
    # the exact-equality branch below, which would defeat the tolerance.
    elif pa.types.is_fixed_size_list(left.type):
        assert_array_eq(left.values, right.values)
    else:
        assert np.all(left.to_numpy(False) == right.to_numpy(False))
def _create_data():
ids = list(range(10))
vectors = np.random.randn(10, 8)
return ids, vectors
def _create_bad_dims():
ids = list(range(10))
vectors = [np.random.randn(8) for _ in ids]
vectors[5] = np.random.randn(5)
return ids, vectors
def _to_vec(lst):
    """Flatten ``lst`` and wrap it as a float32 fixed-size-list array (size 8)."""
    flat = np.array(lst).ravel()
    values = pa.array(flat, type=pa.float32())
    return pa.FixedSizeListArray.from_arrays(values, list_size=8)
def _binary_vectors_table():
    """Build a three-row table of 4-byte uint8 vectors with int32 ids 0-2.

    The rows have 4, 2 and 0 bits set respectively (0x0F, 0x03, 0x00 in the
    first byte; all remaining bytes are zero).
    """
    rows = [
        [0x0F, 0, 0, 0],
        [0x03, 0, 0, 0],
        [0, 0, 0, 0],
    ]
    flat_bytes = [byte for row in rows for byte in row]
    vector_col = pa.FixedSizeListArray.from_arrays(
        pa.array(flat_bytes, type=pa.uint8()),
        list_size=4,
    )
    id_col = pa.array([0, 1, 2], type=pa.int32())
    return pa.Table.from_arrays([id_col, vector_col], names=["id", "vector"])
def test_binary_vectors_default_hamming(tmp_path):
    """A nearest query on uint8 vectors with no metric given uses hamming.

    Checks that the analyzed plan mentions ``metric=hamming`` and that the
    three rows come back with ids 0, 1, 2 and distances 0, 2 and 4.
    """
    ds = lance.write_dataset(_binary_vectors_table(), tmp_path / "bin")
    query = {"column": "vector", "q": [0x0F, 0, 0, 0], "k": 3}
    scanner = ds.scanner(nearest=query)

    plan_text = scanner.analyze_plan()
    assert "metric=hamming" in plan_text

    result = scanner.to_table()
    assert result["id"].to_pylist() == [0, 1, 2]
    assert result["_distance"].to_pylist() == [0.0, 2.0, 4.0]
def test_binary_vectors_invalid_metric(tmp_path):
    """Requesting the l2 metric on uint8 vectors must raise ``ValueError``."""
    ds = lance.write_dataset(_binary_vectors_table(), tmp_path / "bin")
    l2_query = {
        "column": "vector",
        "q": [0x0F, 0, 0, 0],
        "k": 1,
        "metric": "l2",
    }
    expected_message = "Distance type l2 does not support .*UInt8 vectors"
    with pytest.raises(ValueError, match=expected_message):
        ds.scanner(nearest=l2_query).to_table()