forked from lance-format/lance
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_integration.py
More file actions
69 lines (60 loc) · 2.71 KB
/
test_integration.py
File metadata and controls
69 lines (60 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
import duckdb
import lance
import pyarrow as pa
import pytest
@pytest.mark.skip(reason="Unsigned integers not available for pushdown yet")
def test_duckdb_filter_on_rowid(tmp_path):
tab = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
ds = lance.write_dataset(tab, str(tmp_path))
ds = lance.dataset(str(tmp_path), default_scan_options={"with_row_id": True})
row_ids = ds.scanner(columns=[], with_row_id=True).to_table().column(0).to_pylist()
expected = tab.slice(1, 1)
actual = duckdb.query(
f"SELECT * FROM ds WHERE _rowid = {row_ids[1]}"
).fetch_arrow_table()
assert actual.to_pydict() == expected.to_pydict()
def test_duckdb_pushdown_extension_types(tmp_path):
# large_binary is reported by pyarrow as a substrait extension type. Datafusion
# does not currently handle these extension types. This should be ok as long
# as the filter isn't accessing the column with the extension type.
#
# Lance works around this by removing any columns with extension types from the
# schema it gives to duckdb.
tab = pa.table(
{
"filterme": [1, 2, 3],
"largebin": pa.array([b"123", b"456", b"789"], pa.large_binary()),
"othercol": [4, 5, 6],
}
)
ds = lance.write_dataset(tab, str(tmp_path)) # noqa: F841
expected = tab.slice(1, 1)
actual = duckdb.query("SELECT * FROM ds WHERE filterme = 2").fetch_arrow_table()
assert actual.to_pydict() == expected.to_pydict()
expected = tab.slice(0, 1)
actual = duckdb.query("SELECT * FROM ds WHERE othercol = 4").fetch_arrow_table()
assert actual.to_pydict() == expected.to_pydict()
# Not the best error message but hopefully this is short lived until datafusion
# supports substrait extension types.
with pytest.raises(
# Older versions of duckdb use duckdb.InvalidInputException and newer versions
# use duckdb.duckdb.Error but that part isn't really important so just check for
# Exception
Exception,
match="referenced a field that is not yet supported by Substrait conversion",
):
duckdb.query("SELECT * FROM ds WHERE largebin = '456'").fetchall()
# Unclear if all of these result in pushdown or not but they shouldn't error if
# they do.
for filt in [
"filterme != 2 AND filterme != 1",
"filterme in (1)",
"filterme IS NULL",
"filterme IS NOT NULL",
"filterme < 2",
]:
expected = duckdb.query(f"SELECT * FROM tab WHERE {filt}").fetch_arrow_table()
actual = duckdb.query(f"SELECT * FROM ds WHERE {filt}").fetch_arrow_table()
assert actual == expected