Skip to content

Commit 62141e8

Browse files
xxsc0529cursoragent
andcommitted
add oceanbase hybrid search support
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent fca4056 commit 62141e8

4 files changed

Lines changed: 508 additions & 5 deletions

File tree

langchain.gemspec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ Gem::Specification.new do |spec|
5757
spec.add_development_dependency "hnswlib", "~> 0.8.1"
5858
spec.add_development_dependency "hugging-face", "~> 0.3.4"
5959
spec.add_development_dependency "milvus", "~> 0.10.3"
60+
spec.add_development_dependency "mysql2", "~> 0.5"
6061
spec.add_development_dependency "nokogiri", "~> 1.13"
6162
spec.add_development_dependency "mail", "~> 2.8"
6263
spec.add_development_dependency "mistral-ai"

lib/langchain/vectorsearch/base.rb

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ module Langchain::Vectorsearch
1010
# - {Langchain::Vectorsearch::Elasticsearch}
1111
# - {Langchain::Vectorsearch::Hnswlib}
1212
# - {Langchain::Vectorsearch::Milvus}
13+
# - {Langchain::Vectorsearch::Oceanbase}
1314
# - {Langchain::Vectorsearch::Pgvector}
1415
# - {Langchain::Vectorsearch::Pinecone}
1516
# - {Langchain::Vectorsearch::Qdrant}
@@ -29,11 +30,12 @@ module Langchain::Vectorsearch
2930
# )
3031
#
3132
# # You can instantiate other supported vector databases the same way:
32-
# milvus = Langchain::Vectorsearch::Milvus.new(...)
33-
# qdrant = Langchain::Vectorsearch::Qdrant.new(...)
34-
# pinecone = Langchain::Vectorsearch::Pinecone.new(...)
35-
# chroma = Langchain::Vectorsearch::Chroma.new(...)
36-
# pgvector = Langchain::Vectorsearch::Pgvector.new(...)
33+
# milvus = Langchain::Vectorsearch::Milvus.new(...)
34+
# oceanbase = Langchain::Vectorsearch::Oceanbase.new(...)
35+
# qdrant = Langchain::Vectorsearch::Qdrant.new(...)
36+
# pinecone = Langchain::Vectorsearch::Pinecone.new(...)
37+
# chroma = Langchain::Vectorsearch::Chroma.new(...)
38+
# pgvector = Langchain::Vectorsearch::Pgvector.new(...)
3739
#
3840
# == Schema Creation
3941
#
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# frozen_string_literal: true
2+
3+
module Langchain::Vectorsearch
4+
#
5+
# OceanBase vector search adapter (aligned with pyobvector, MySQL protocol compatible).
6+
#
7+
# Gem requirements:
8+
# gem "sequel", "~> 5.87.0"
9+
# gem "mysql2", "~> 0.5"
10+
#
11+
# Usage:
12+
# oceanbase = Langchain::Vectorsearch::Oceanbase.new(
13+
# url: "mysql2://user:password@host:2881/database",
14+
# index_name: "documents",
15+
# llm: llm,
16+
# namespace: nil
17+
# )
18+
#
19+
class Oceanbase < Base
20+
# Supported distance functions. Keys are the names accepted for the
# +distance_operator+ argument; values are the SQL function names that get
# interpolated into search queries.
OPERATORS = {
  "cosine_distance" => "cosine_distance",
  "l2_distance" => "l2_distance",
  "inner_product" => "inner_product",
  "negative_inner_product" => "negative_inner_product"
}.freeze
# Distance function used when the caller does not choose one.
DEFAULT_OPERATOR = "cosine_distance"
27+
28+
attr_reader :db, :operator, :table_name, :namespace_column, :namespace, :vector_column
29+
30+
# Connects to the database and records the table/column names used by this adapter.
#
# @param url [String] MySQL protocol connection URL, e.g. mysql2://user:password@host:2881/dbname
# @param index_name [String] Table name (used as collection/index)
# @param llm [Object] LLM used to generate embeddings
# @param namespace [String, nil] Namespace for multi-tenant filtering
# @param distance_operator [String, Symbol] Distance function: cosine_distance / l2_distance /
#   inner_product / negative_inner_product. Unrecognized values fall back to DEFAULT_OPERATOR.
def initialize(url:, index_name:, llm:, namespace: nil, distance_operator: DEFAULT_OPERATOR)
  depends_on "sequel"
  depends_on "mysql2"

  @db = Sequel.connect(url)
  @table_name = index_name
  @namespace_column = "namespace"
  @namespace = namespace
  @vector_column = "vectors"
  # OPERATORS is keyed by String; normalize so a Symbol such as :l2_distance
  # is honored instead of silently falling back to the default.
  @operator = OPERATORS.fetch(distance_operator.to_s) { OPERATORS[DEFAULT_OPERATOR] }

  super(llm: llm)
end
48+
49+
# Serializes an embedding into the bracketed, comma-separated text form
# used as an OceanBase VECTOR literal.
# @param embedding [Array] components; each one is coerced with Kernel#Float
# @return [String] e.g. "[0.1,0.2,0.3]"
def self.format_vector(embedding)
  components = embedding.map { |component| Float(component) }
  "[" + components.join(",") + "]"
end

# Instance-level convenience wrapper around the class-level formatter.
# @param embedding [Array]
# @return [String]
def format_vector(embedding)
  self.class.format_vector(embedding)
end
59+
60+
# Batch upsert: update if exists, insert otherwise (MySQL/OceanBase REPLACE INTO,
# same as pyobvector ReplaceStmt). Each text is embedded with the LLM and
# written as one row.
#
# @param texts [Array<String>] texts to store
# @param ids [Array] row ids, paired positionally with +texts+
# @param metadata [Array<Hash>, nil] per-text metadata, paired positionally with +texts+
# @return [Array] the +ids+ argument, unchanged
def upsert_texts(texts:, ids:, metadata: nil)
  metadata ||= Array.new(texts.size, {})
  table = db[table_name.to_sym]

  texts.zip(ids, metadata).each do |text, id, meta|
    embedding = llm.embed(text: text).embedding
    table.replace(
      :id => id,
      :content => text,
      vector_column.to_sym => format_vector(embedding),
      namespace_column.to_sym => namespace,
      # zip pads a too-short metadata list with nil; store "{}" rather than JSON null.
      :metadata => (meta || {}).to_json
    )
  end

  ids
end
76+
77+
# Stores texts together with their embeddings.
#
# When +ids+ is nil or empty, each text is inserted as a new row and the
# values returned by the inserts are collected. Otherwise the call is
# delegated to #upsert_texts.
#
# @param texts [Array<String>] texts to store
# @param ids [Array, nil] optional row ids, paired positionally with +texts+
# @param metadata [Array<Hash>, nil] per-text metadata, paired positionally with +texts+
# @return [Array] insert results for new rows, or the result of #upsert_texts
def add_texts(texts:, ids: nil, metadata: nil)
  metadata ||= Array.new(texts.size, {})
  return upsert_texts(texts: texts, ids: ids, metadata: metadata) unless ids.nil? || ids.empty?

  table = db[table_name.to_sym]
  texts.zip(metadata).map do |text, meta|
    embedding = llm.embed(text: text).embedding
    table.insert(
      :content => text,
      vector_column.to_sym => format_vector(embedding),
      namespace_column.to_sym => namespace,
      # zip pads a too-short metadata list with nil; store "{}" rather than JSON null.
      :metadata => (meta || {}).to_json
    )
  end
end
98+
99+
# Updates existing rows; a thin alias that forwards everything to #upsert_texts.
#
# @param texts [Array<String>]
# @param ids [Array]
# @param metadata [Array<Hash>, nil]
# @return whatever #upsert_texts returns
def update_texts(texts:, ids:, metadata: nil)
  upsert_texts(ids: ids, texts: texts, metadata: metadata)
end
102+
103+
# Deletes the rows whose id is in +ids+.
#
# @param ids [Array] ids of the rows to delete
# @return the result of the dataset's delete call
def remove_texts(ids:)
  table = db[table_name.to_sym]
  table.where(id: ids).delete
end
106+
107+
# Maps the query-time distance function name to the +distance+ parameter
# used when creating the vector index.
# NOTE(review): OceanBase documents l2, inner_product and cosine as the index
# distance values; confirm against the targeted OceanBase version.
INDEX_DISTANCE_PARAM = {
  "cosine_distance" => "cosine",
  "l2_distance" => "l2",
  "inner_product" => "inner_product",
  # negative_inner_product is a query function only; its index metric is inner_product.
  "negative_inner_product" => "inner_product"
}.freeze
114+
115+
# Creates the backing table (if it does not exist) and an HNSW vector index on
# the vector column.
#
# The vector dimension is taken from +llm.default_dimensions+. A
# Sequel::DatabaseError raised while creating the index is ignored when its
# message indicates the index already exists; every other error is re-raised.
#
# @raise [Sequel::DatabaseError] on table-creation failure or an unexpected index failure
def create_default_schema
  dim = llm.default_dimensions
  # OceanBase VECTOR type (see pyobvector).
  db.run <<~SQL
    CREATE TABLE IF NOT EXISTS `#{table_name}` (
      id BIGINT PRIMARY KEY AUTO_INCREMENT,
      content TEXT,
      #{vector_column} VECTOR(#{dim}),
      #{namespace_column} VARCHAR(255) DEFAULT NULL,
      metadata JSON DEFAULT NULL
    )
  SQL

  # HNSW vector index for approximate nearest neighbor search.
  index_name_sql = "idx_#{table_name}_#{vector_column}"
  distance_param = INDEX_DISTANCE_PARAM[operator] || "cosine"
  begin
    db.run "CREATE VECTOR INDEX `#{index_name_sql}` ON `#{table_name}` (#{vector_column}) WITH (distance=#{distance_param}, type=hnsw)"
  rescue Sequel::DatabaseError => e
    # Only the index statement is guarded, so a table-creation failure can
    # never be mistaken for "index already exists" and swallowed.
    raise unless e.message.match?(/Duplicate key name|already exists|1061/)
  end
end
135+
136+
# Drops the backing table via Sequel's +drop_table?+.
def destroy_default_schema
  table = table_name.to_sym
  db.drop_table?(table)
end
139+
140+
# Embeds +query+ with the LLM and searches for the nearest stored vectors.
#
# @param query [String] text to search for
# @param k [Integer] number of results requested
# @return whatever #similarity_search_by_vector returns
def similarity_search(query:, k: 4)
  similarity_search_by_vector(
    embedding: llm.embed(text: query).embedding,
    k: k
  )
end
144+
145+
# ANN search using the configured OceanBase vector distance function plus
# APPROXIMATE LIMIT (see pyobvector ann_search). Rows are ordered by distance
# to +embedding+ and filtered by namespace when one is set.
#
# @param embedding [Array<Float>] query vector
# @param k [Integer] number of results requested
# @return [Array<Hash>] one hash per row with :content and :metadata, taken as-is from the row
def similarity_search_by_vector(embedding:, k: 4)
  literal = format_vector(embedding).gsub("'", "''")
  distance_sql = "#{operator}(#{vector_column}, '#{literal}')"

  dataset = db[table_name.to_sym]
    .select(Sequel[:content], Sequel[:metadata], Sequel.lit("#{distance_sql} AS _dist"))
    .order(Sequel.lit(distance_sql))
    .limit(k)
  dataset = dataset.where(namespace_column.to_sym => namespace) if namespace

  # The generated SQL ends in "LIMIT n"; rewrite that tail to the
  # "APPROXIMATE LIMIT k" form used for approximate nearest neighbor search.
  ann_sql = dataset.sql.sub(/\s+LIMIT\s+\d+\s*$/i) { " APPROXIMATE LIMIT #{k}" }

  db.fetch(ann_sql).all.map do |row|
    {content: row[:content], metadata: row[:metadata]}
  end
end
167+
168+
# Answers +question+ with retrieval-augmented generation: the contents of the
# top +k+ search results are joined into a context string, turned into a
# prompt, and sent to the LLM as a single user message.
#
# @param question [String] the question to answer
# @param k [Integer] number of search results to use as context
# @param block [Proc] optional block forwarded to +llm.chat+
# @return the chat response, with its +context+ set to the joined search results
def ask(question:, k: 4, &block)
  context = similarity_search(query: question, k: k)
    .map { |result| result[:content].to_s }
    .join("\n---\n")
  prompt = generate_rag_prompt(question: question, context: context)

  llm
    .chat(messages: [{role: "user", content: prompt}], &block)
    .tap { |response| response.context = context }
end
177+
end
178+
end

0 commit comments

Comments
 (0)