Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit a13814e

Browse filesBrowse files
committed
using native vector functions
1 parent 7d138ae commit a13814e
Copy full SHA for a13814e

8 files changed

+110
-276
lines changed

‎python/00-setup-database.sql

Copy file name to clipboard
+7-58Lines changed: 7 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,21 @@
1-
drop table if exists dbo.document_embeddings
2-
drop table if exists dbo.documents
1+
drop table if exists dbo.sample_documents
32
go
43

5-
create table dbo.documents
4+
create table dbo.sample_documents
65
(
76
id int constraint pk__documents primary key,
87
content nvarchar(max),
9-
embedding nvarchar(max)
8+
embedding varbinary(8000)
109
)
11-
create table dbo.document_embeddings
12-
(
13-
id int references dbo.documents(id),
14-
vector_value_id int,
15-
vector_value float
16-
)
17-
go
18-
19-
create clustered columnstore index csi__document_embeddings
20-
on dbo.document_embeddings order (id)
21-
go
2210

23-
if not exists(select * from sys.fulltext_catalogs where [name] = 'FullTextCatalog')
11+
if not exists(select * from sys.fulltext_catalogs where [name] = 'main_ft_catalog')
2412
begin
25-
create fulltext catalog [FullTextCatalog] as default;
13+
create fulltext catalog [main_ft_catalog] as default;
2614
end
2715
go
2816

29-
create fulltext index on dbo.documents (content) key index pk__documents;
17+
create fulltext index on dbo.sample_documents (content) key index pk__documents;
3018
go
3119

32-
alter fulltext index on dbo.documents enable;
33-
go
34-
35-
create or alter function dbo.similar_documents(@vector nvarchar(max))
36-
returns table
37-
as
38-
return
39-
with cteVector as
40-
(
41-
select
42-
cast([key] as int) as [vector_value_id],
43-
cast([value] as float) as [vector_value]
44-
from
45-
openjson(@vector)
46-
),
47-
cteSimilar as
48-
(
49-
select top (50)
50-
v2.id,
51-
1-sum(v1.[vector_value] * v2.[vector_value]) /
52-
(
53-
sqrt(sum(v1.[vector_value] * v1.[vector_value]))
54-
*
55-
sqrt(sum(v2.[vector_value] * v2.[vector_value]))
56-
) as cosine_distance
57-
from
58-
cteVector v1
59-
inner join
60-
dbo.document_embeddings v2 on v1.vector_value_id = v2.vector_value_id
61-
group by
62-
v2.id
63-
order by
64-
cosine_distance
65-
)
66-
select
67-
rank() over (order by r.cosine_distance) as rank,
68-
r.id,
69-
r.cosine_distance
70-
from
71-
cteSimilar r
20+
alter fulltext index on dbo.sample_documents enable;
7221
go

‎python/hybrid_search.py

Copy file name to clipboardExpand all lines: python/hybrid_search.py
+45-19Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
sentences = [
1515
'The dog is barking',
1616
'The cat is purring',
17-
'The bear is growling'
17+
'The bear is growling',
18+
'A bear growling to a cat'
1819
]
1920
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
2021
embeddings = model.encode(sentences)
@@ -24,8 +25,7 @@
2425
print('Cleaning up the database...')
2526
try:
2627
cursor = conn.cursor()
27-
cursor.execute("DELETE FROM dbo.document_embeddings;")
28-
cursor.execute("DELETE FROM dbo.documents;")
28+
cursor.execute("DELETE FROM dbo.sample_documents;")
2929
cursor.commit();
3030
finally:
3131
cursor.close()
@@ -39,8 +39,7 @@
3939
DECLARE @id INT = ?;
4040
DECLARE @content NVARCHAR(MAX) = ?;
4141
DECLARE @embedding NVARCHAR(MAX) = ?;
42-
INSERT INTO dbo.documents (id, content, embedding) VALUES (@id, @content, @embedding);
43-
INSERT INTO dbo.document_embeddings SELECT @id, CAST([key] AS INT), CAST([value] AS FLOAT) FROM OPENJSON(@embedding);
42+
INSERT INTO dbo.sample_documents (id, content, embedding) VALUES (@id, @content, JSON_ARRAY_TO_VECTOR(@embedding));
4443
""",
4544
id,
4645
content,
@@ -53,37 +52,64 @@
5352

5453
print('Searching for similar documents...')
5554
print('Getting embeddings...')
56-
query = 'growling bear'
55+
query = 'a growling bear'
5756
embedding = model.encode(query)
5857

59-
print('Querying database...')
58+
print(f'Querying database for "{query}"...')
6059
k = 5
6160
try:
6261
cursor = conn.cursor()
6362

6463
results = cursor.execute(f"""
6564
DECLARE @k INT = ?;
65+
DECLARE @q NVARCHAR(4000) = ?;
66+
DECLARE @e VARBINARY(8000) = JSON_ARRAY_TO_VECTOR(CAST(? AS NVARCHAR(MAX)));
6667
WITH keyword_search AS (
6768
SELECT TOP(@k)
6869
id,
69-
ftt.[RANK] AS rank
70-
FROM
71-
dbo.documents
72-
INNER JOIN
73-
FREETEXTTABLE(dbo.documents, *, ?) AS ftt ON dbo.documents.id = ftt.[KEY]
70+
RANK() OVER (ORDER BY rank) AS rank,
71+
content
72+
FROM
73+
(
74+
SELECT TOP(@k)
75+
sd.id,
76+
ftt.[RANK] AS rank,
77+
sd.content
78+
FROM
79+
dbo.sample_documents AS sd
80+
INNER JOIN
81+
FREETEXTTABLE(dbo.sample_documents, *, @q) AS ftt ON sd.id = ftt.[KEY]
82+
) AS t
83+
ORDER BY
84+
rank
7485
),
7586
semantic_search AS
7687
(
77-
SELECT
78-
id,
79-
rank
80-
FROM
81-
dbo.similar_documents(?)
88+
SELECT TOP(@k)
89+
id,
90+
RANK() OVER (ORDER BY distance) AS rank,
91+
content
92+
FROM
93+
(
94+
SELECT TOP(@k)
95+
id,
96+
VECTOR_DISTANCE('cosine', embedding, @e) AS distance,
97+
content
98+
FROM
99+
dbo.sample_documents
100+
ORDER BY
101+
distance
102+
) AS t
103+
ORDER BY
104+
rank
82105
)
83106
SELECT TOP(@k)
84107
COALESCE(ss.id, ks.id) AS id,
85108
COALESCE(1.0 / (@k + ss.rank), 0.0) +
86-
COALESCE(1.0 / (@k + ks.rank), 0.0) AS score -- Reciprocal Rank Fusion (RRF)
109+
COALESCE(1.0 / (@k + ks.rank), 0.0) AS score, -- Reciprocal Rank Fusion (RRF)
110+
COALESCE(ss.content, ks.content) AS content,
111+
ss.rank AS semantic_rank,
112+
ks.rank AS keyword_rank
87113
FROM
88114
semantic_search ss
89115
FULL OUTER JOIN
@@ -97,7 +123,7 @@
97123
)
98124

99125
for row in results:
100-
print(f'Document: {row[0]} -> RRF score: {row[1]:0.4}')
126+
print(f'Document: "{row[2]}", Id: {row[0]} -> RRF score: {row[1]:0.4} (Semantic Rank: {row[3]}, Keyword Rank: {row[4]})')
101127

102128
finally:
103129
cursor.close()

‎vector-embeddings/02-create-vectors-table.sql

Copy file name to clipboardExpand all lines: vector-embeddings/02-create-vectors-table.sql
-62Lines changed: 0 additions & 62 deletions
This file was deleted.
+44Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
Add columns to store the native vectors
3+
*/
4+
alter table wikipedia_articles_embeddings
5+
add title_vector_native varbinary(8000);
6+
7+
alter table wikipedia_articles_embeddings
8+
add content_vector_native varbinary(8000);
9+
10+
/*
11+
Update the native vectors
12+
*/
13+
update
14+
wikipedia_articles_embeddings
15+
set
16+
title_vector_native = json_array_to_vector(title_vector),
17+
content_vector_native = json_array_to_vector(content_vector);
18+
go
19+
20+
/*
21+
Remove old columns
22+
*/
23+
alter table wikipedia_articles_embeddings
24+
drop column title_vector;
25+
go
26+
27+
alter table wikipedia_articles_embeddings
28+
drop column content_vector;
29+
go
30+
31+
/*
32+
Rename the columns
33+
*/
34+
EXEC sp_rename 'dbo.wikipedia_articles_embeddings.title_vector_native', 'title_vector_ada2', 'COLUMN';
35+
EXEC sp_rename 'dbo.wikipedia_articles_embeddings.content_vector_native', 'content_vector_ada2', 'COLUMN';
36+
37+
/*
38+
Verify data
39+
*/
40+
select top (100) * from [dbo].[wikipedia_articles_embeddings]
41+
go
42+
43+
select * from [dbo].[wikipedia_articles_embeddings] where title = 'Alan Turing'
44+
go

‎vector-embeddings/04-create-get-embeddings-procedure.sql

Copy file name to clipboardExpand all lines: vector-embeddings/04-create-get-embeddings-procedure.sql
+5-3Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
create or alter procedure dbo.get_embedding
55
@deployedModelName nvarchar(1000),
66
@inputText nvarchar(max),
7-
@embedding nvarchar(max) output
7+
@embedding varbinary(8000) output
88
as
99
declare @retval int, @response nvarchar(max);
1010
declare @payload nvarchar(max) = json_object('input': @inputText);
@@ -16,12 +16,14 @@ exec @retval = sp_invoke_external_rest_endpoint
1616
@payload = @payload,
1717
@response = @response output;
1818

19-
declare @re nvarchar(max) = '[]';
19+
declare @re nvarchar(max) = null;
2020
if (@retval = 0) begin
2121
set @re = json_query(@response, '$.result.data[0].embedding')
22+
end else begin
23+
select @response as 'Error message from OpenAI API';
2224
end
2325

24-
set @embedding = @re;
26+
set @embedding = json_array_to_vector(@re);
2527

2628
return @retval
2729
go

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.