Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit ea3cb2c

Browse filesBrowse files
committed
added Reciprocal Rank Fusion (RRF) sample
1 parent c6c4a30 commit ea3cb2c
Copy full SHA for ea3cb2c

File tree

5 files changed

+149
-2
lines changed
Filter options

5 files changed

+149
-2
lines changed

‎.gitignore

Copy file name to clipboardExpand all lines: .gitignore
+3-2Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ FodyWeavers.xsd
397397
# JetBrains Rider
398398
*.sln.iml
399399

400-
# Customer
400+
# Custom
401401
*.local.txt
402-
*.local.sql
402+
*.local.sql
403+
.venv/

‎python/00-setup-database.sql

Copy file name to clipboard
+22Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
DROP TABLE IF EXISTS dbo.document_embeddings
2+
DROP TABLE IF EXISTS dbo.documents
3+
go
4+
5+
CREATE TABLE dbo.documents (id INT CONSTRAINT pk__documents PRIMARY KEY IDENTITY, content NVARCHAR(MAX), embedding NVARCHAR(MAX))
6+
CREATE TABLE dbo.document_embeddings (id INT REFERENCES dbo.documents(id), vector_value_id INT, vector_value FLOAT)
7+
go
8+
9+
CREATE CLUSTERED COLUMNSTORE INDEX csi__document_embeddings ON dbo.document_embeddings ORDER (id)
10+
go
11+
12+
IF NOT EXISTS(SELECT * FROM sys.fulltext_catalogs WHERE [name] = 'FullTextCatalog')
13+
BEGIN
14+
CREATE FULLTEXT CATALOG [FullTextCatalog] AS DEFAULT;
15+
END
16+
go
17+
18+
CREATE FULLTEXT INDEX ON dbo.documents (content) KEY INDEX pk__documents;
19+
go
20+
21+
ALTER FULLTEXT INDEX ON dbo.documents ENABLE;
22+
go

‎python/hybrid_search.py

Copy file name to clipboard
+103Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import os
2+
import pyodbc
3+
import logging
4+
import json
5+
from sentence_transformers import SentenceTransformer
6+
from dotenv import load_dotenv
7+
from utilities import get_mssql_connection
8+
9+
load_dotenv()
10+
11+
if __name__ == '__main__':
12+
print('Initializing sample...')
13+
print('Getting embeddings...')
14+
sentences = [
15+
'The dog is barking',
16+
'The cat is purring',
17+
'The bear is growling'
18+
]
19+
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
20+
embeddings = model.encode(sentences)
21+
22+
print('Cleaning up the database...')
23+
try:
24+
conn = get_mssql_connection()
25+
conn.execute("DELETE FROM dbo.document_embeddings;")
26+
conn.execute("DELETE FROM dbo.documents;")
27+
conn.commit();
28+
finally:
29+
conn.close()
30+
31+
print('Saving documents and embeddings in the database...')
32+
try:
33+
conn = get_mssql_connection()
34+
cursor = conn.cursor()
35+
36+
for content, embedding in zip(sentences, embeddings):
37+
cursor.execute(f"""
38+
INSERT INTO dbo.documents (content, embedding) VALUES (?, ?);
39+
INSERT INTO dbo.document_embeddings SELECT SCOPE_IDENTITY(), CAST([key] AS INT), CAST([value] AS FLOAT) FROM OPENJSON(?);
40+
""",
41+
content,
42+
json.dumps(embedding.tolist()),
43+
json.dumps(embedding.tolist())
44+
)
45+
46+
cursor.close()
47+
conn.commit()
48+
finally:
49+
conn.close()
50+
51+
print('Searching for similar documents...')
52+
print('Getting embeddings...')
53+
query = 'growling bear'
54+
embedding = model.encode(query)
55+
56+
print('Querying database...')
57+
k = 5
58+
try:
59+
conn = get_mssql_connection()
60+
cursor = conn.cursor()
61+
62+
results = cursor.execute(f"""
63+
DECLARE @k INT = ?;
64+
WITH keyword_search AS (
65+
SELECT TOP(@k)
66+
id,
67+
ftt.[RANK] AS rank
68+
FROM
69+
dbo.documents
70+
INNER JOIN
71+
FREETEXTTABLE(dbo.documents, *, ?) AS ftt ON dbo.documents.id = ftt.[KEY]
72+
),
73+
semantic_search AS
74+
(
75+
SELECT
76+
id,
77+
rank
78+
FROM
79+
dbo.similar_documents(?)
80+
)
81+
SELECT TOP(@k)
82+
COALESCE(ss.id, ks.id) AS id,
83+
COALESCE(1.0 / (@k + ss.rank), 0.0) +
84+
COALESCE(1.0 / (@k + ks.rank), 0.0) AS score -- Reciprocal Rank Fusion (RRF)
85+
FROM
86+
semantic_search ss
87+
FULL OUTER JOIN
88+
keyword_search ks ON ss.id = ks.id
89+
ORDER BY
90+
score DESC
91+
""",
92+
k,
93+
query,
94+
json.dumps(embedding.tolist()),
95+
)
96+
97+
for row in results:
98+
print('document:', row[0], 'RRF score:', row[1])
99+
100+
cursor.close()
101+
conn.commit()
102+
finally:
103+
conn.close()

‎python/requirements.txt

Copy file name to clipboard
+4Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
python-dotenv
2+
pyodbc
3+
azure-identity
4+
sentence-transformers

‎python/utilities.py

Copy file name to clipboard
+17Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import os
2+
import pyodbc
3+
import struct
4+
import logging
5+
from azure import identity
6+
7+
def get_mssql_connection():
8+
logging.info('Getting MSSQL connection')
9+
logging.info(' - Getting EntraID credentials...')
10+
mssql_connection_string = os.environ["MSSQL"]
11+
credential = identity.DefaultAzureCredential(exclude_interactive_browser_credential=False)
12+
token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("UTF-16-LE")
13+
token_struct = struct.pack(f'<I{len(token_bytes)}s', len(token_bytes), token_bytes)
14+
SQL_COPT_SS_ACCESS_TOKEN = 1256 # This connection option is defined by microsoft in msodbcsql.h
15+
logging.info(' - Connecting to MSSQL...')
16+
conn = pyodbc.connect(mssql_connection_string, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: token_struct})
17+
return conn

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.