# Source: lance/python/python/tests/test_optimize.py at main · lance-format/lance
#
# 537 lines (444 loc) · 18.6 KB
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
import pickle
import random
import re
import threading
from pathlib import Path

import lance
import numpy as np
import pyarrow as pa
import pytest
from lance.lance import Compaction
from lance.optimize import RewriteResult
from lance.vector import vec_to_table
def test_dataset_optimize(tmp_path: Path):
    """Compact ten 100-row fragments into one and check the reported metrics."""
    base_dir = tmp_path / "dataset"
    data = pa.table({"a": range(1000), "b": range(1000)})

    # 1000 rows at 100 rows per file gives ten single-file fragments.
    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=100)
    assert dataset.version == 1
    assert len(dataset.get_fragments()) == 10

    metrics = dataset.optimize.compact_files(
        target_rows_per_fragment=1000,
        materialize_deletions=False,
        num_threads=1,
    )

    assert metrics.fragments_removed == 10
    assert metrics.fragments_added == 1
    assert metrics.files_removed == 10
    assert metrics.files_added == 1

    # The compaction moves the dataset from version 1 to version 3.
    assert dataset.version == 3
def test_blob_compaction(tmp_path: Path):
    """Blob column contents must still be readable after compaction."""
    base_dir = tmp_path / "blob_dataset"
    blob_field = pa.field(
        "blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
    )
    schema = pa.schema([pa.field("id", pa.int32()), blob_field])
    blobs = [b"\x01\x02", b"\x03\x04\x05"]
    table = pa.table(
        {
            "id": pa.array([0, 1], type=pa.int32()),
            "blob": pa.array(blobs, type=pa.large_binary()),
        },
        schema=schema,
    )

    # One row per file so that the two rows land in separate fragments.
    dataset = lance.write_dataset(
        table,
        base_dir,
        schema=schema,
        max_rows_per_file=1,
        data_storage_version="stable",
    )
    assert len(dataset.get_fragments()) == 2

    dataset.optimize.compact_files(num_threads=1)
    assert len(dataset.get_fragments()) == 1

    # Read both blobs back and compare them with what was written.
    blob_files = dataset.take_blobs("blob", indices=[0, 1])
    contents = [blob_files[0].readall(), blob_files[1].readall()]
    assert contents == blobs
def test_optimize_max_bytes(tmp_path: Path):
    """Exercise ``max_bytes_per_file`` through compact_files and Compaction.plan."""
    base_dir = tmp_path / "dataset"
    arr = pa.array(range(4 * 1024 * 1024))
    arr = pa.FixedSizeListArray.from_arrays(arr, 1024)
    data = pa.table({"a": arr})

    # Write out 4K rows and 32MB of data
    dataset = lance.write_dataset(
        data, base_dir, max_rows_per_file=2 * 1024, data_storage_version="stable"
    )

    # We get 2 fragments
    assert len(dataset.get_fragments()) == 2

    # Now run compaction with a small max_bytes_per_file (1000 bytes) to get more
    # fragments.  The exact number is a bit tricky to calculate because we don't
    # split into a new fragment until we've actually written data and that depends
    # on how much the file format chooses to accumulate, but it should be more than 2
    metrics = dataset.optimize.compact_files(
        target_rows_per_fragment=100 * 1024,
        materialize_deletions=False,
        max_bytes_per_file=1000,
        batch_size=128,
    )

    assert metrics.fragments_removed == 2
    assert metrics.fragments_added > 2
    assert metrics.files_removed == 2
    assert metrics.files_added > 2

    num_frags = len(dataset.get_fragments())
    assert num_frags == metrics.fragments_added

    # Reset the dataset to its original two-fragment state.
    dataset = lance.write_dataset(
        data,
        base_dir,
        max_rows_per_file=2 * 1024,
        data_storage_version="stable",
        mode="overwrite",
    )

    # Same test but use Compaction.plan
    plan = Compaction.plan(
        dataset,
        options=dict(
            target_rows_per_fragment=100 * 1024, max_bytes_per_file=1000, batch_size=128
        ),
    )
    results = [task.execute(dataset) for task in plan.tasks]
    metrics = Compaction.commit(dataset, results)

    assert metrics.fragments_removed == 2
    assert metrics.fragments_added > 2
    assert metrics.files_removed == 2
    assert metrics.files_added > 2

    # Reset again before the large-batch variant.
    dataset = lance.write_dataset(
        data,
        base_dir,
        max_rows_per_file=2 * 1024,
        data_storage_version="stable",
        mode="overwrite",
    )

    # In this test max_bytes_per_file is still too small but the batch size
    # is so large we read the entire input in a single batch
    metrics = dataset.optimize.compact_files(
        target_rows_per_fragment=100 * 1024,
        materialize_deletions=False,
        max_bytes_per_file=1000,
        batch_size=2 * 1024,
    )

    assert metrics.fragments_removed == 2
    assert metrics.fragments_added == 2
    assert metrics.files_removed == 2
    assert metrics.files_added == 2

    num_frags = len(dataset.get_fragments())
    assert num_frags == 2
def create_table(min, max, nvec, ndim=8):
    """Build a table with a random ``vector`` column and a sequential ``id`` column.

    ``min`` and ``max`` shadow the builtins, but callers in this file pass them
    by keyword, so the parameter names have to stay as they are.

    Parameters
    ----------
    min, max
        Bounds passed to ``np.random.uniform`` for the vector values.
    nvec
        Number of rows (vectors) to generate.
    ndim
        Number of components per vector.
    """
    mat = np.random.uniform(min, max, (nvec, ndim))
    tbl = vec_to_table(data=mat)
    # Add id column for filtering
    tbl = pa.Table.from_pydict(
        {
            "vector": tbl.column(0).chunk(0),
            "id": np.arange(0, nvec),
        }
    )
    return tbl
def test_compact_with_write(tmp_path: Path):
    """Compaction that races with appends must leave a compactable dataset.

    Running compaction concurrently with appends yields a manifest whose
    fragments are not sorted by id: compaction reserves a fragment id first,
    the concurrent writes then take later ids and commit, and finally the
    compaction commits with its earlier id.  The follow-up compaction should
    detect this and reorder the fragments when writing the compacted file.
    """
    base_dir = tmp_path / "dataset"
    frag_count = 5
    rows_per_frag = 300

    def append_batches(batch):
        # Each append commits one more copy of ``batch`` to the dataset.
        for _ in range(frag_count):
            lance.write_dataset(batch, base_dir, mode="append")

    def compact_in_background():
        lance.dataset(base_dir).optimize.compact_files()

    # Seed the dataset: one initial write followed by ``frag_count`` appends.
    data = create_table(min=0, max=1, nvec=rows_per_frag)
    lance.write_dataset(data, base_dir)
    append_batches(data)

    # Compact on a worker thread while this thread keeps appending.
    worker = threading.Thread(target=compact_in_background)
    worker.start()
    append_batches(data)
    worker.join()

    # A second compaction over the resulting manifest should succeed.
    dataset = lance.dataset(base_dir)
    dataset.optimize.compact_files()

    expected_rows = rows_per_frag * (frag_count * 2 + 1)
    assert dataset.to_table().num_rows == expected_rows
def test_index_remapping(tmp_path: Path):
    """Vector search keeps finding indexed rows across compaction and appends."""
    base_dir = tmp_path / "dataset"
    data = create_table(min=0, max=1, nvec=300)

    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=150)
    dataset.create_index(
        "vector", index_type="IVF_PQ", num_partitions=2, num_sub_vectors=2
    )
    assert len(dataset.get_fragments()) == 2

    # Query with 50 of the original vectors; each should find itself.
    sample_query_indices = random.sample(range(300), 50)
    vecs = data.column("vector").chunk(0)
    sample_queries = [
        {"column": "vector", "q": vecs[i].values, "k": 5} for i in sample_query_indices
    ]

    def has_target(target, results):
        """Return True when any result vector equals ``target``."""
        return any(item.values == target for item in results)

    def check_index(has_knn_combined):
        """Check every sample query finds its vector and uses the expected plan.

        ``dataset`` is read from the enclosing scope at call time, so later
        rebinding of ``dataset`` in the test body is picked up here.
        """
        for query in sample_queries:
            results = dataset.to_table(nearest=query).column("vector")
            assert has_target(query["q"], results)
            plan = dataset.scanner(nearest=query).explain_plan()
            assert ("KNNVectorDistance" in plan) == has_knn_combined

    # Original state is 2 indexed fragments of size 150.  This should not require
    # a combined scan
    check_index(has_knn_combined=False)

    # Compact the 2 fragments into 1.  Combined scan still not needed.
    dataset.optimize.compact_files()
    assert len(dataset.get_fragments()) == 1
    check_index(has_knn_combined=False)

    # Add a new fragment and recalculate the index
    extra_data = create_table(min=1000, max=1001, nvec=100)
    dataset = lance.write_dataset(
        extra_data, base_dir, mode="append", max_rows_per_file=100
    )
    dataset.create_index(
        "vector", index_type="IVF_PQ", num_partitions=2, num_sub_vectors=2, replace=True
    )

    # Combined scan should not be needed
    assert len(dataset.get_fragments()) == 2
    check_index(has_knn_combined=False)

    # Add a new unindexed fragment
    extra_data = create_table(min=1000, max=1001, nvec=100)
    dataset = lance.write_dataset(
        extra_data, base_dir, mode="append", max_rows_per_file=100
    )
    assert len(dataset.get_fragments()) == 3

    # Compaction should not combine the unindexed fragment with the indexed fragment
    dataset.optimize.compact_files()
    assert len(dataset.get_fragments()) == 2

    # Now a combined scan is required
    check_index(has_knn_combined=True)
def test_index_remapping_multiple_rewrite_tasks(tmp_path: Path):
    """Indexed and unindexed fragments are compacted separately."""
    base_dir = tmp_path / "dataset"

    # Two indexed fragments of 150 rows each.
    ds = lance.write_dataset(
        create_table(min=0, max=1, nvec=300), base_dir, max_rows_per_file=150
    )
    ds = ds.create_index(
        "vector",
        index_type="IVF_PQ",
        num_partitions=4,
        num_sub_vectors=2,
    )
    assert ds.has_index

    # Two more fragments appended after the index was built.
    ds = lance.write_dataset(
        create_table(min=0, max=1, nvec=300),
        base_dir,
        mode="append",
        max_rows_per_file=150,
    )

    ds.delete("id % 4 == 0")
    fragments = list(ds.get_fragments())
    assert len(fragments) == 4

    # We have a dataset with 4 small fragments.  2 are indexed and
    # 2 are not.  The indexed fragments and the non-indexed fragments
    # cannot be combined and so we should end up with 2 fragments after
    # compaction
    ds.optimize.compact_files()
    fragments = list(ds.get_fragments())
    assert len(fragments) == 2

    # The index's first segment must cover exactly one fragment, and that
    # fragment must be one of the two left after compaction.
    index = ds.describe_indices()[0]
    index_frag_ids = list(index.segments[0].fragment_ids)
    frag_ids = [frag.fragment_id for frag in fragments]

    assert len(index_frag_ids) == 1
    assert index_frag_ids[0] in frag_ids
def test_defer_index_remap(tmp_path: Path):
    """Compaction with ``defer_index_remap=True`` yields a ``__lance_frag_reuse`` index."""
    base_dir = tmp_path / "dataset"
    data = pa.table({"i": range(6_000), "val": range(6_000)})

    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000)
    dataset.create_scalar_index("i", "BTREE")
    options = dict(
        target_rows_per_fragment=2_000, defer_index_remap=True, num_threads=1
    )

    dataset.delete("i < 500")
    dataset.optimize.compact_files(**options)

    # Reopen the dataset from disk before inspecting its indices.
    dataset = lance.dataset(base_dir)
    indices = dataset.describe_indices()
    assert any(idx.name == "__lance_frag_reuse" for idx in indices)
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_describe_indices_matches_list_indices_for_frag_reuse(tmp_path: Path):
    """describe_indices() and list_indices() must agree on the index_type
    string for every index, including the __lance_frag_reuse system index
    that defer_index_remap produces.

    list_indices() special-cases system indices via infer_system_index_type()
    in python/src/dataset.rs. describe_indices() in
    rust/lance/src/index.rs::IndexDescriptionImpl::try_new does not, so it
    falls through to a plugin lookup that has no entry for
    FragmentReuseIndexDetails and reports 'Unknown' instead.
    """
    base_dir = tmp_path / "dataset"
    data = pa.table({"i": range(6_000), "val": range(6_000)})
    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000)
    dataset.create_scalar_index("i", "BTREE")

    dataset.delete("i < 500")
    dataset.optimize.compact_files(
        target_rows_per_fragment=2_000, defer_index_remap=True, num_threads=1
    )

    # Reopen the dataset, then map index name -> type string from both APIs.
    dataset = lance.dataset(base_dir)
    described = {d.name: d.index_type for d in dataset.describe_indices()}
    listed = {idx["name"]: idx["type"] for idx in dataset.list_indices()}

    assert "__lance_frag_reuse" in listed, (
        "test precondition: defer_index_remap should produce a frag-reuse index"
    )
    assert described == listed, (
        "describe_indices and list_indices disagree on index_type:\n"
        f"  describe_indices: {described}\n"
        f"  list_indices:     {listed}"
    )
def test_dataset_distributed_optimize(tmp_path: Path):
    """Plan, pickle, execute and commit a compaction task by task."""
    base_dir = tmp_path / "dataset"
    data = pa.table({"a": range(800), "b": range(800)})
    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=200)
    fragments = dataset.get_fragments()
    assert len(fragments) == 4

    plan = Compaction.plan(
        dataset, options=dict(target_rows_per_fragment=400, num_threads=1)
    )
    assert plan.read_version == 1
    assert plan.num_tasks() == 2
    assert plan.tasks[0].fragments == [frag.metadata for frag in fragments[0:2]]
    assert plan.tasks[1].fragments == [frag.metadata for frag in fragments[2:4]]
    assert repr(plan) == "CompactionPlan(read_version=1, tasks=<2 compaction tasks>)"

    # Plan can be pickled
    assert pickle.loads(pickle.dumps(plan)) == plan

    pickled_task = pickle.dumps(plan.tasks[0])
    task = pickle.loads(pickled_task)
    assert task == plan.tasks[0]

    result1 = plan.tasks[0].execute(dataset)
    # These two comparisons were previously bare expressions (no ``assert``),
    # so they were evaluated and discarded without checking anything.
    assert result1.metrics.fragments_removed == 2
    assert result1.metrics.fragments_added == 1

    # The rewrite result must also survive a pickle round trip.
    pickled_result = pickle.dumps(result1)
    result = pickle.loads(pickled_result)
    assert isinstance(result, RewriteResult)
    assert result == result1
    assert re.match(
        r"RewriteResult\(read_version=1, new_fragments=\[.+\], old_fragments=\[.+\]\)",
        repr(result),
    )

    # Only the first task's result is committed.
    metrics = Compaction.commit(dataset, [result1])
    assert metrics.fragments_removed == 2
    assert metrics.fragments_added == 1
    # Compaction occurs in two transactions so it increments the version by 2.
    assert dataset.version == 3
def test_migration_via_fragment_apis(tmp_path):
    """
    This test is a regression of a case where we were using fragment APIs to migrate
    from v1 to v2 but that left the dataset in a state where it had v2 files but wasn't
    marked with the v2 writer flag.
    """
    data = pa.table({"a": range(800), "b": range(800)})

    # Create v1 dataset
    ds = lance.write_dataset(
        data, tmp_path / "dataset", max_rows_per_file=200, data_storage_version="legacy"
    )

    # Create empty v2 dataset
    lance.write_dataset(
        data_obj=[],
        uri=tmp_path / "dataset2",
        schema=ds.schema,
        data_storage_version="2.0",
    )

    # Add v2 files
    fragments = []
    for frag in ds.get_fragments():
        reader = ds.scanner(fragments=[frag])
        fragments.append(
            lance.LanceFragment.create(
                dataset_uri=tmp_path / "dataset2",
                data=reader,
                fragment_id=frag.fragment_id,
                data_storage_version="2.0",
            )
        )

    # Commit
    operation = lance.LanceOperation.Overwrite(ds.schema, fragments)
    ds2 = lance.LanceDataset.commit(tmp_path / "dataset2", operation)

    # Compact, dataset should still be v2
    ds2.optimize.compact_files()

    ds2 = lance.dataset(tmp_path / "dataset2")
    assert ds2.data_storage_version == "2.0"
def test_optimize_indices_second_call_is_noop(tmp_path: Path):
    """A second optimize_indices call when nothing has changed since the first
    must not write any new files to the dataset directory."""
    base_dir = tmp_path / "dataset"
    rng = np.random.default_rng(0)

    # Initial row count; the rebalance comment below relies on 1024 + 128 rows.
    n = 1024
    vectors = rng.standard_normal((n, 8)).astype(np.float32)
    table = pa.table(
        {
            "id": pa.array(range(n), type=pa.int64()),
            "category": pa.array([f"cat{i % 4}" for i in range(n)]),
            "tags": pa.array([[f"t{i % 3}", f"t{(i + 1) % 3}"] for i in range(n)]),
            "doc": pa.array([f"hello world document {i}" for i in range(n)]),
            "name": pa.array([f"name_{i:05d}" for i in range(n)]),
            "value": pa.array(range(n), type=pa.int64()),
            "bloom_val": pa.array(range(n), type=pa.int64()),
            "vector": pa.FixedSizeListArray.from_arrays(
                pa.array(vectors.reshape(-1), type=pa.float32()), 8
            ),
        }
    )
    dataset = lance.write_dataset(table, base_dir)

    # One scalar index of each kind, each on its own column.
    dataset.create_scalar_index("id", index_type="BTREE")
    dataset.create_scalar_index("category", index_type="BITMAP")
    dataset.create_scalar_index("tags", index_type="LABEL_LIST")
    dataset.create_scalar_index("doc", index_type="INVERTED")
    dataset.create_scalar_index("name", index_type="NGRAM")
    dataset.create_scalar_index("value", index_type="ZONEMAP")
    dataset.create_scalar_index("bloom_val", index_type="BLOOMFILTER")
    # num_partitions=1 keeps this dataset balanced: the auto-rebalance check
    # in merge_indices only finds join candidates when num_partitions > 1, and
    # 1024 + 128 rows is well below the split threshold. Without this, the
    # rebalance heuristic would keep finding work on the small partitions.
    dataset.create_index(
        "vector", index_type="IVF_PQ", num_partitions=1, num_sub_vectors=2
    )

    # Append rows that none of the indices cover yet.
    extra_rows = 128
    extra_vectors = rng.standard_normal((extra_rows, 8)).astype(np.float32)
    extra = pa.table(
        {
            "id": pa.array(range(n, n + extra_rows), type=pa.int64()),
            "category": pa.array([f"cat{i % 4}" for i in range(extra_rows)]),
            "tags": pa.array([[f"t{i % 3}"] for i in range(extra_rows)]),
            "doc": pa.array([f"goodbye world document {i}" for i in range(extra_rows)]),
            "name": pa.array([f"add_{i:05d}" for i in range(extra_rows)]),
            "value": pa.array(range(n, n + extra_rows), type=pa.int64()),
            "bloom_val": pa.array(range(n, n + extra_rows), type=pa.int64()),
            "vector": pa.FixedSizeListArray.from_arrays(
                pa.array(extra_vectors.reshape(-1), type=pa.float32()), 8
            ),
        }
    )
    dataset = lance.write_dataset(extra, base_dir, mode="append")

    # First optimize: should pull the new fragment into each index.
    dataset.optimize.optimize_indices()
    files_before = {p.relative_to(base_dir) for p in base_dir.rglob("*") if p.is_file()}

    # Second optimize: nothing has changed, so this must be a no-op on disk.
    dataset.optimize.optimize_indices()
    files_after = {p.relative_to(base_dir) for p in base_dir.rglob("*") if p.is_file()}

    new_files = files_after - files_before
    assert not new_files, f"second optimize_indices created new files: {new_files}"
def test_compaction_generates_rewrite_transaction(tmp_path: Path):
    """Compaction must record at least one ``Rewrite`` transaction."""
    # Create a small dataset with multiple fragments
    base_dir = tmp_path / "rewrite_txn"
    data = pa.table({"a": range(300), "b": range(300)})
    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=100)

    # Run compaction: should perform a rewrite (no deletions materialized)
    dataset.optimize.compact_files(materialize_deletions=False, num_threads=1)

    # Verify at least one transaction is a Rewrite; guard against None entries
    transactions = dataset.get_transactions()
    assert any(
        t is not None and t.operation.__class__.__name__ == "Rewrite"
        for t in transactions
    )
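# --- GitHub page residue captured with the file; not part of the module ---
# Provide feedback
#
# Saved searches
#
# Use saved searches to filter your results more quickly
#
# FilesExpand file tree
#
# test_optimize.py
#
# Latest commit
#
# History
#
# test_optimize.py
#
# File metadata and controls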