Skip to content

Commit 9956c2c

Browse files
root
authored and committed
Update FAISS hnsw also ivf bruteforce adaptively
1 parent ddcae04 commit 9956c2c

11 files changed

Lines changed: 237 additions & 68 deletions

File tree

benchmarks/cross_engine/engine_worker.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ class PomaiOptions(ctypes.Structure):
4444
("fsync_policy", ctypes.c_uint32),
4545
("memory_budget_bytes", ctypes.c_uint64),
4646
("deadline_ms", ctypes.c_uint32),
47+
("index_type", ctypes.c_uint8),
48+
("hnsw_m", ctypes.c_uint32),
49+
("hnsw_ef_construction", ctypes.c_uint32),
50+
("hnsw_ef_search", ctypes.c_uint32),
51+
("adaptive_threshold", ctypes.c_uint32),
52+
("metric", ctypes.c_uint8),
4753
]
4854

4955

@@ -93,7 +99,7 @@ class PomaiSearchResults(ctypes.Structure):
9399
]
94100

95101

96-
def run_pomai(base, queries, gt, lib_path: Path, repeats: int):
102+
def run_pomai(base, queries, gt, lib_path: Path, repeats: int, metric: str):
97103
lib = ctypes.CDLL(str(lib_path))
98104
lib.pomai_options_init.argtypes = [ctypes.POINTER(PomaiOptions)]
99105
lib.pomai_open.argtypes = [ctypes.POINTER(PomaiOptions), ctypes.POINTER(ctypes.c_void_p)]
@@ -122,6 +128,12 @@ def check(st):
122128
opts.path = str(tmpdir / "db").encode()
123129
opts.shards = 4
124130
opts.dim = base.shape[1]
131+
opts.index_type = 1 # HNSW
132+
opts.hnsw_m = 32
133+
opts.hnsw_ef_construction = 200
134+
opts.hnsw_ef_search = 64
135+
opts.adaptive_threshold = 0
136+
opts.metric = 1 if metric == "ip" else 0
125137
check(lib.pomai_open(ctypes.byref(opts), ctypes.byref(db)))
126138

127139
ids = np.arange(base.shape[0], dtype=np.uint64)
@@ -142,6 +154,7 @@ def check(st):
142154
batch[i].metadata = None
143155
batch[i].metadata_len = 0
144156
check(lib.pomai_put_batch(db, batch, n))
157+
holder.clear() # Fix memory leak: allow GC of ctypes arrays
145158
ingestion_s = time.perf_counter() - ingest_start
146159

147160
build_start = time.perf_counter()
@@ -192,8 +205,8 @@ def check(st):
192205
pred_ids = np.array(pred, dtype=np.int64)
193206
rec = recall_at_k(pred_ids, gt, 10)
194207
return {
195-
"engine": "PomaiDB",
196-
"params": {"shards": 4, "topk": 10, "durability": "default (WAL enabled by default)"},
208+
"engine": "PomaiDB HNSW",
209+
"params": {"shards": 4, "topk": 10, "M": 32, "efConstruction": 200, "efSearch": 64},
197210
"ingestion_time_s": ingestion_s,
198211
"index_build_time_s": build_s,
199212
"query_throughput_qps": float(np.mean(qps)),
@@ -346,7 +359,7 @@ def main():
346359
gt = np.load(args.ground_truth)
347360

348361
if args.engine == "pomai":
349-
result = run_pomai(base, queries, gt, Path(args.libpomai), args.repeats)
362+
result = run_pomai(base, queries, gt, Path(args.libpomai), args.repeats, args.metric)
350363
elif args.engine == "hnswlib":
351364
result = run_hnswlib(base, queries, gt, args.repeats, args.metric)
352365
elif args.engine == "faiss_flat":

include/pomai/c_types.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,14 @@ typedef struct {
4747
pomai_fsync_policy_t fsync_policy;
4848
uint64_t memory_budget_bytes;
4949
uint32_t deadline_ms;
50+
51+
// Indexing
52+
uint8_t index_type; // 0 = IVF, 1 = HNSW
53+
uint32_t hnsw_m;
54+
uint32_t hnsw_ef_construction;
55+
uint32_t hnsw_ef_search;
56+
uint32_t adaptive_threshold;
57+
uint8_t metric; // 0 = L2, 1 = IP
5058
} pomai_options_t;
5159

5260
typedef struct {

include/pomai/options.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,26 @@ namespace pomai
2424
kRag = 1,
2525
};
2626

27+
enum class IndexType : uint8_t
28+
{
29+
kIvfFlat = 0,
30+
kHnsw = 1,
31+
};
32+
2733
struct IndexParams
2834
{
35+
IndexType type = IndexType::kIvfFlat;
36+
// IVF Params
2937
uint32_t nlist = 64;
3038
uint32_t nprobe = 16;
39+
// HNSW Params
40+
uint32_t hnsw_m = 32;
41+
uint32_t hnsw_ef_construction = 200;
42+
uint32_t hnsw_ef_search = 64;
43+
// Adaptive dispatcher: segments with fewer vectors use brute-force SIMD
44+
// (guaranteeing 100% recall). Larger segments use HNSW graph traversal.
45+
// Default: 5000. Set to 0 to always use HNSW when available (rely on ef_search for recall).
46+
uint32_t adaptive_threshold = 5000;
3147
};
3248

3349
struct DBOptions
@@ -43,6 +59,7 @@ namespace pomai
4359
uint32_t routing_probe = 0;
4460
uint32_t routing_warmup_mult = 20;
4561
uint32_t routing_keep_prev = 1;
62+
MetricType metric = MetricType::kL2;
4663
};
4764

4865
// One membrane = one logical collection.

src/capi/capi_db.cc

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ struct SearchResultsWrapper {
3232
};
3333

3434
constexpr uint32_t MinOptionsStructSize() {
35-
return static_cast<uint32_t>(offsetof(pomai_options_t, memory_budget_bytes) + sizeof(uint64_t));
35+
return static_cast<uint32_t>(offsetof(pomai_options_t, hnsw_ef_search) + sizeof(uint32_t));
3636
}
3737

3838
constexpr uint32_t MinUpsertStructSize() {
@@ -111,6 +111,12 @@ void pomai_options_init(pomai_options_t* opts) {
111111
opts->fsync_policy = POMAI_FSYNC_POLICY_NEVER;
112112
opts->memory_budget_bytes = 0;
113113
opts->deadline_ms = 0;
114+
opts->index_type = 0; // IVF
115+
opts->hnsw_m = 32;
116+
opts->hnsw_ef_construction = 200;
117+
opts->hnsw_ef_search = 64;
118+
opts->adaptive_threshold = 5000;
119+
opts->metric = 0; // L2
114120
}
115121

116122
void pomai_scan_options_init(pomai_scan_options_t* opts) {
@@ -145,6 +151,18 @@ pomai_status_t* pomai_open(const pomai_options_t* opts, pomai_db_t** out_db) {
145151
cpp_opts.fsync = (opts->fsync_policy == POMAI_FSYNC_POLICY_ALWAYS)
146152
? pomai::FsyncPolicy::kAlways
147153
: pomai::FsyncPolicy::kNever;
154+
cpp_opts.metric = (opts->metric == 1) ? pomai::MetricType::kInnerProduct : pomai::MetricType::kL2;
155+
cpp_opts.index_params.adaptive_threshold = opts->adaptive_threshold;
156+
157+
// Default to IVF, overwrite if HNSW.
158+
if (opts->index_type == 1) {
159+
cpp_opts.index_params.type = pomai::IndexType::kHnsw;
160+
cpp_opts.index_params.hnsw_m = opts->hnsw_m;
161+
cpp_opts.index_params.hnsw_ef_construction = opts->hnsw_ef_construction;
162+
cpp_opts.index_params.hnsw_ef_search = opts->hnsw_ef_search;
163+
} else {
164+
cpp_opts.index_params.type = pomai::IndexType::kIvfFlat;
165+
}
148166

149167
std::unique_ptr<pomai::DB> db;
150168
auto st = pomai::DB::Open(cpp_opts, &db);

src/core/index/hnsw_index.cc

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
namespace pomai::index {
1818

1919
// ── Constructor / Destructor ──────────────────────────────────────────────────
20-
HnswIndex::HnswIndex(uint32_t dim, HnswOptions opts)
20+
HnswIndex::HnswIndex(uint32_t dim, HnswOptions opts, pomai::MetricType metric)
2121
: dim_(dim), opts_(opts)
2222
{
23-
// faiss::IndexHNSWFlat(d, M) — uses L2 by default; we switch to IP below
24-
// based on pomai convention (inner product by default).
23+
faiss::MetricType faiss_metric = faiss::METRIC_L2;
24+
if (metric == pomai::MetricType::kInnerProduct || metric == pomai::MetricType::kCosine) {
25+
faiss_metric = faiss::METRIC_INNER_PRODUCT;
26+
}
2527
index_ = std::make_unique<faiss::IndexHNSWFlat>(
26-
static_cast<int>(dim_), opts_.M, faiss::METRIC_INNER_PRODUCT);
28+
static_cast<int>(dim_), opts_.M, faiss_metric);
2729
index_->hnsw.efConstruction = opts_.ef_construction;
2830
index_->hnsw.efSearch = opts_.ef_search;
2931
}
@@ -102,10 +104,11 @@ pomai::Status HnswIndex::Save(const std::string& path) const
102104
// Append id_map to the same file
103105
std::ofstream f(path, std::ios::binary | std::ios::app);
104106
if (!f) return pomai::Status::IOError("Cannot append id_map to " + path);
107+
// Write length at the END so Load() can read it from the tail.
105108
const uint64_t n = static_cast<uint64_t>(id_map_.size());
106-
f.write(reinterpret_cast<const char*>(&n), sizeof(n));
107109
f.write(reinterpret_cast<const char*>(id_map_.data()),
108110
n * sizeof(VectorId));
111+
f.write(reinterpret_cast<const char*>(&n), sizeof(n));
109112
if (!f) return pomai::Status::IOError("id_map write failed: " + path);
110113
return pomai::Status::Ok();
111114
}
@@ -165,8 +168,11 @@ pomai::Status HnswIndex::Load(const std::string& path,
165168
opts.ef_construction= hnsw_idx->hnsw.efConstruction;
166169
opts.ef_search = hnsw_idx->hnsw.efSearch;
167170

171+
pomai::MetricType metric = (hnsw_idx->metric_type == faiss::METRIC_INNER_PRODUCT)
172+
? pomai::MetricType::kInnerProduct : pomai::MetricType::kL2;
173+
168174
auto result = std::make_unique<HnswIndex>(
169-
static_cast<uint32_t>(hnsw_idx->d), opts);
175+
static_cast<uint32_t>(hnsw_idx->d), opts, metric);
170176
result->index_.reset(hnsw_idx);
171177
result->id_map_ = std::move(id_map);
172178
*out = std::move(result);

src/core/index/hnsw_index.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include "pomai/status.h"
1818
#include "pomai/types.h"
19+
#include "pomai/options.h"
1920

2021
// Forward-declare FAISS types to avoid pulling FAISS headers into every TU.
2122
namespace faiss {
@@ -36,7 +37,7 @@ struct HnswOptions {
3637
class HnswIndex {
3738
public:
3839
/// Create an empty HNSW index (not yet trained/populated).
39-
HnswIndex(uint32_t dim, HnswOptions opts = {});
40+
HnswIndex(uint32_t dim, HnswOptions opts = {}, pomai::MetricType metric = pomai::MetricType::kInnerProduct);
4041
~HnswIndex();
4142

4243
// Non-copyable

src/core/membrane/manager.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ namespace pomai::core
2525
spec.name = std::string(kDefaultMembrane);
2626
spec.dim = base_.dim;
2727
spec.shard_count = base_.shard_count;
28+
spec.index_params = base_.index_params;
29+
spec.metric = base_.metric;
2830

2931
auto st = CreateMembrane(spec);
3032
if (st.code() == pomai::ErrorCode::kAlreadyExists)

src/core/shard/runtime.cc

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -998,7 +998,7 @@ namespace pomai::core
998998
std::to_string(state.mem_index) + "_" +
999999
std::to_string(state.segment_part) + ".dat";
10001000
state.filepath = (fs::path(shard_dir_) / state.filename).string();
1001-
state.builder = std::make_unique<table::SegmentBuilder>(state.filepath, dim_);
1001+
state.builder = std::make_unique<table::SegmentBuilder>(state.filepath, dim_, index_params_, metric_);
10021002
}
10031003

10041004
pomai::Metadata meta_copy = entry.meta ? *entry.meta : pomai::Metadata();
@@ -1024,7 +1024,7 @@ namespace pomai::core
10241024
state.phase = BackgroundJob::Phase::kFinalizeSegment;
10251025
}
10261026
} else if (state.phase == BackgroundJob::Phase::kFinalizeSegment) {
1027-
auto st = state.builder->BuildIndex(index_params_.nlist);
1027+
auto st = state.builder->BuildIndex();
10281028
if (!st.ok()) {
10291029
complete_job(pomai::Status::Internal("Freeze: BuildIndex failed: " + st.message()));
10301030
return;
@@ -1155,7 +1155,7 @@ namespace pomai::core
11551155
state.filename = "seg_" + std::to_string(sys_now) + "_compacted_" +
11561156
std::to_string(state.segment_part) + ".dat";
11571157
state.filepath = (fs::path(shard_dir_) / state.filename).string();
1158-
state.builder = std::make_unique<table::SegmentBuilder>(state.filepath, dim_);
1158+
state.builder = std::make_unique<table::SegmentBuilder>(state.filepath, dim_, index_params_, metric_);
11591159
}
11601160
std::cout << "TEST_DEBUG COMPACT PRE-ADD id: " << top.id << " vec_mapped[0]: " << vec_mapped[0] << std::endl;
11611161
auto st = state.builder->Add(top.id, pomai::VectorView(vec_mapped), false, meta);
@@ -1200,7 +1200,7 @@ namespace pomai::core
12001200
state.phase = BackgroundJob::Phase::kCommitManifest;
12011201
continue;
12021202
}
1203-
auto st = state.builder->BuildIndex(index_params_.nlist);
1203+
auto st = state.builder->BuildIndex();
12041204
if (!st.ok()) {
12051205
complete_job(pomai::Status::Internal("Compact: BuildIndex failed: " + st.message()));
12061206
return;
@@ -1446,6 +1446,36 @@ namespace pomai::core
14461446
}
14471447

14481448
if (!use_visibility && !has_filters) { // FAST PATH
1449+
// === ADAPTIVE DISPATCHER ===
1450+
// Small segments: brute-force SIMD for 100% recall.
1451+
// Large segments (>= threshold): HNSW graph traversal.
1452+
const bool use_graph = (seg->Count() >= index_params_.adaptive_threshold) &&
1453+
(seg->GetHnswIndex() != nullptr);
1454+
if (use_graph) {
1455+
auto* hnsw = seg->GetHnswIndex();
1456+
std::vector<pomai::VectorId> out_ids;
1457+
std::vector<float> out_dists;
1458+
// Use ef_search from index params for tuning, but never less than 2 * topk.
1459+
const int ef = static_cast<int>(
1460+
std::max(index_params_.hnsw_ef_search,
1461+
static_cast<uint32_t>(topk) * 2));
1462+
if (hnsw->Search(query, topk, ef, &out_ids, &out_dists).ok() &&
1463+
!out_ids.empty()) {
1464+
used_candidates = true;
1465+
for (size_t i = 0; i < out_ids.size(); ++i) {
1466+
local_scanned++;
1467+
// id_map now stores real user VectorIds directly.
1468+
if (this->metric_ == pomai::MetricType::kInnerProduct ||
1469+
this->metric_ == pomai::MetricType::kCosine) {
1470+
local.Push(out_ids[i], out_dists[i]);
1471+
} else {
1472+
local.Push(out_ids[i], -out_dists[i]);
1473+
}
1474+
}
1475+
total_scanned.fetch_add(local_scanned, std::memory_order_relaxed);
1476+
return local.Drain();
1477+
}
1478+
}
14491479
if (seg->IsQuantized()) {
14501480
float q_min = seg->GetQuantizer()->GetGlobalMin();
14511481
float q_inv_scale = seg->GetQuantizer()->GetGlobalInvScale();
@@ -1530,6 +1560,7 @@ namespace pomai::core
15301560
// ForEach doesn't expose entry_idx directly, so we use a local counter.
15311561
uint32_t fe_idx = 0;
15321562
seg->ForEach([&](VectorId id, std::span<const float> vec, bool is_deleted, const pomai::Metadata* meta) {
1563+
(void)meta; // suppress unused warning
15331564
const uint32_t my_idx = fe_idx++;
15341565
++local_scanned;
15351566
if (is_deleted) return;

src/storage/manifest/manifest.cc

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,12 @@ namespace pomai::storage
272272
else if (spec.metric == pomai::MetricType::kCosine) mtype = "COS";
273273
out += "metric " + mtype + "\n";
274274

275-
out += "index_params " + std::to_string(spec.index_params.nlist) + " " +
276-
std::to_string(spec.index_params.nprobe) + "\n";
275+
out += "index_params " + std::to_string(static_cast<uint32_t>(spec.index_params.type)) + " " +
276+
std::to_string(spec.index_params.nlist) + " " +
277+
std::to_string(spec.index_params.nprobe) + " " +
278+
std::to_string(spec.index_params.hnsw_m) + " " +
279+
std::to_string(spec.index_params.hnsw_ef_construction) + " " +
280+
std::to_string(spec.index_params.hnsw_ef_search) + "\n";
277281

278282
return AtomicWriteFile(MembraneManifestPath(root_path, spec.name), out);
279283
}
@@ -327,9 +331,19 @@ namespace pomai::storage
327331
spec->kind = ParseMembraneKind(toks[1]);
328332
}
329333
} else if (toks[0] == "index_params") {
330-
if (toks.size() > 2) {
334+
if (toks.size() == 3) {
335+
spec->index_params.type = pomai::IndexType::kIvfFlat;
331336
ParseU32(toks[1], &spec->index_params.nlist);
332337
ParseU32(toks[2], &spec->index_params.nprobe);
338+
} else if (toks.size() >= 7) {
339+
uint32_t type_val = 0;
340+
ParseU32(toks[1], &type_val);
341+
spec->index_params.type = (type_val == 1) ? pomai::IndexType::kHnsw : pomai::IndexType::kIvfFlat;
342+
ParseU32(toks[2], &spec->index_params.nlist);
343+
ParseU32(toks[3], &spec->index_params.nprobe);
344+
ParseU32(toks[4], &spec->index_params.hnsw_m);
345+
ParseU32(toks[5], &spec->index_params.hnsw_ef_construction);
346+
ParseU32(toks[6], &spec->index_params.hnsw_ef_search);
333347
}
334348
}
335349
}

0 commit comments

Comments
 (0)