Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit be8a7a6

Browse filesBrowse files
committed
Add strict_word_similarity to pg_trgm module
strict_word_similarity is similar to the existing word_similarity function, but it takes word boundaries into account when computing similarity. Author: Alexander Korotkov. Reviewed by: David Steele, Liudmila Mantrova, me. Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
1 parent f20b328 commit be8a7a6
Copy full SHA for be8a7a6

File tree

Expand file treeCollapse file tree

10 files changed

+1461
-61
lines changed
Filter options
Expand file treeCollapse file tree

10 files changed

+1461
-61
lines changed

‎contrib/pg_trgm/Makefile

Copy file name to clipboardExpand all lines: contrib/pg_trgm/Makefile
+3-2Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@ MODULE_big = pg_trgm
44
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
55

66
EXTENSION = pg_trgm
7-
DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
7+
DATA = pg_trgm--1.3--1.4.sql \
8+
pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
89
pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
910
PGFILEDESC = "pg_trgm - trigram matching"
1011

11-
REGRESS = pg_trgm pg_word_trgm
12+
REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
1213

1314
ifdef USE_PGXS
1415
PG_CONFIG = pg_config

‎contrib/pg_trgm/expected/pg_strict_word_trgm.out

Copy file name to clipboardExpand all lines: contrib/pg_trgm/expected/pg_strict_word_trgm.out
+1,025Lines changed: 1025 additions & 0 deletions
Large diffs are not rendered by default.

‎contrib/pg_trgm/pg_trgm--1.3--1.4.sql

Copy file name to clipboard
+68Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
2+
3+
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
4+
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
5+
6+
CREATE FUNCTION strict_word_similarity(text,text)
7+
RETURNS float4
8+
AS 'MODULE_PATHNAME'
9+
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
10+
11+
CREATE FUNCTION strict_word_similarity_op(text,text)
12+
RETURNS bool
13+
AS 'MODULE_PATHNAME'
14+
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold
15+
16+
CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
17+
RETURNS bool
18+
AS 'MODULE_PATHNAME'
19+
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold
20+
21+
CREATE OPERATOR <<% (
22+
LEFTARG = text,
23+
RIGHTARG = text,
24+
PROCEDURE = strict_word_similarity_op,
25+
COMMUTATOR = '%>>',
26+
RESTRICT = contsel,
27+
JOIN = contjoinsel
28+
);
29+
30+
CREATE OPERATOR %>> (
31+
LEFTARG = text,
32+
RIGHTARG = text,
33+
PROCEDURE = strict_word_similarity_commutator_op,
34+
COMMUTATOR = '<<%',
35+
RESTRICT = contsel,
36+
JOIN = contjoinsel
37+
);
38+
39+
CREATE FUNCTION strict_word_similarity_dist_op(text,text)
40+
RETURNS float4
41+
AS 'MODULE_PATHNAME'
42+
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
43+
44+
CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
45+
RETURNS float4
46+
AS 'MODULE_PATHNAME'
47+
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
48+
49+
CREATE OPERATOR <<<-> (
50+
LEFTARG = text,
51+
RIGHTARG = text,
52+
PROCEDURE = strict_word_similarity_dist_op,
53+
COMMUTATOR = '<->>>'
54+
);
55+
56+
CREATE OPERATOR <->>> (
57+
LEFTARG = text,
58+
RIGHTARG = text,
59+
PROCEDURE = strict_word_similarity_dist_commutator_op,
60+
COMMUTATOR = '<<<->'
61+
);
62+
63+
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
64+
OPERATOR 9 %>> (text, text), -- 9 matches StrictWordSimilarityStrategyNumber in trgm.h
65+
OPERATOR 10 <->>> (text, text) FOR ORDER BY pg_catalog.float_ops; -- 10 matches StrictWordDistanceStrategyNumber in trgm.h
66+
67+
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
68+
OPERATOR 9 %>> (text, text); -- 9 matches StrictWordSimilarityStrategyNumber in trgm.h

‎contrib/pg_trgm/pg_trgm.control

Copy file name to clipboard
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# pg_trgm extension
22
comment = 'text similarity measurement and index searching based on trigrams'
3-
default_version = '1.3'
3+
default_version = '1.4'
44
module_pathname = '$libdir/pg_trgm'
55
relocatable = true
+42Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
DROP INDEX trgm_idx2;
2+
3+
\copy test_trgm3 from 'data/trgm2.data'
4+
5+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
6+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
7+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
8+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
9+
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
10+
11+
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
12+
set enable_seqscan=off;
13+
14+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
15+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
16+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
17+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
18+
19+
explain (costs off)
20+
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
21+
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
22+
23+
drop index trgm_idx2;
24+
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
25+
set enable_seqscan=off;
26+
27+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
28+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
29+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
30+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
31+
32+
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
33+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
34+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
35+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
36+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
37+
38+
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
39+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
40+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
41+
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
42+
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;

‎contrib/pg_trgm/trgm.h

Copy file name to clipboardExpand all lines: contrib/pg_trgm/trgm.h
+13-8Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "access/gist.h"
88
#include "access/itup.h"
9+
#include "access/stratnum.h"
910
#include "storage/bufpage.h"
1011

1112
/*
@@ -26,14 +27,16 @@
2627
#define DIVUNION
2728

2829
/* operator strategy numbers */
29-
#define SimilarityStrategyNumber 1
30-
#define DistanceStrategyNumber 2
31-
#define LikeStrategyNumber 3
32-
#define ILikeStrategyNumber 4
33-
#define RegExpStrategyNumber 5
34-
#define RegExpICaseStrategyNumber 6
35-
#define WordSimilarityStrategyNumber 7
36-
#define WordDistanceStrategyNumber 8
30+
#define SimilarityStrategyNumber 1
31+
#define DistanceStrategyNumber 2
32+
#define LikeStrategyNumber 3
33+
#define ILikeStrategyNumber 4
34+
#define RegExpStrategyNumber 5
35+
#define RegExpICaseStrategyNumber 6
36+
#define WordSimilarityStrategyNumber 7
37+
#define WordDistanceStrategyNumber 8
38+
#define StrictWordSimilarityStrategyNumber 9
39+
#define StrictWordDistanceStrategyNumber 10
3740

3841
typedef char trgm[3];
3942

@@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
120123

121124
extern double similarity_threshold;
122125
extern double word_similarity_threshold;
126+
extern double strict_word_similarity_threshold;
123127

128+
extern double index_strategy_get_limit(StrategyNumber strategy);
124129
extern uint32 trgm2int(trgm *ptr);
125130
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
126131
extern TRGM *generate_trgm(char *str, int slen);

‎contrib/pg_trgm/trgm_gin.c

Copy file name to clipboardExpand all lines: contrib/pg_trgm/trgm_gin.c
+5-4Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
9090
{
9191
case SimilarityStrategyNumber:
9292
case WordSimilarityStrategyNumber:
93+
case StrictWordSimilarityStrategyNumber:
9394
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
9495
break;
9596
case ILikeStrategyNumber:
@@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
187188
{
188189
case SimilarityStrategyNumber:
189190
case WordSimilarityStrategyNumber:
190-
nlimit = (strategy == SimilarityStrategyNumber) ?
191-
similarity_threshold : word_similarity_threshold;
191+
case StrictWordSimilarityStrategyNumber:
192+
nlimit = index_strategy_get_limit(strategy);
192193

193194
/* Count the matches */
194195
ntrue = 0;
@@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
282283
{
283284
case SimilarityStrategyNumber:
284285
case WordSimilarityStrategyNumber:
285-
nlimit = (strategy == SimilarityStrategyNumber) ?
286-
similarity_threshold : word_similarity_threshold;
286+
case StrictWordSimilarityStrategyNumber:
287+
nlimit = index_strategy_get_limit(strategy);
287288

288289
/* Count the matches */
289290
ntrue = 0;

‎contrib/pg_trgm/trgm_gist.c

Copy file name to clipboardExpand all lines: contrib/pg_trgm/trgm_gist.c
+9-5Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
221221
{
222222
case SimilarityStrategyNumber:
223223
case WordSimilarityStrategyNumber:
224+
case StrictWordSimilarityStrategyNumber:
224225
qtrg = generate_trgm(VARDATA(query),
225226
querysize - VARHDRSZ);
226227
break;
@@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
290291
{
291292
case SimilarityStrategyNumber:
292293
case WordSimilarityStrategyNumber:
293-
/* Similarity search is exact. Word similarity search is inexact */
294-
*recheck = (strategy == WordSimilarityStrategyNumber);
295-
nlimit = (strategy == SimilarityStrategyNumber) ?
296-
similarity_threshold : word_similarity_threshold;
294+
case StrictWordSimilarityStrategyNumber:
295+
/* Similarity search is exact. (Strict) word similarity search is inexact */
296+
*recheck = (strategy != SimilarityStrategyNumber);
297+
298+
nlimit = index_strategy_get_limit(strategy);
297299

298300
if (GIST_LEAF(entry))
299301
{ /* all leafs contains orig trgm */
@@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
468470
{
469471
case DistanceStrategyNumber:
470472
case WordDistanceStrategyNumber:
471-
*recheck = strategy == WordDistanceStrategyNumber;
473+
case StrictWordDistanceStrategyNumber:
474+
/* Only plain trigram distance is exact */
475+
*recheck = (strategy != DistanceStrategyNumber);
472476
if (GIST_LEAF(entry))
473477
{ /* all leafs contains orig trgm */
474478

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.