Skip to content

Commit 40b7d95

Browse files
author
Alexander Korotkov
committed
Optimize V-gram GIN index for "fast scan"
With GIN "fast scan" it doesn't make much sense to exclude entries from a GIN scan just because they are not rare enough. Thus, we no longer estimate the selectivity of V-grams and just extract all possible V-grams. Also, a triconsistent function is implemented for better "fast scan" performance.
1 parent c2da209 commit 40b7d95

4 files changed

Lines changed: 81 additions & 55 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,4 +98,4 @@ installation. It is available from
9898
[github](https://github.com/akorotkov/vgram)
9999
under the same license as
100100
[PostgreSQL](https://www.postgresql.org/about/licence/)
101-
and supports PostgreSQL 9.2+.
101+
and supports PostgreSQL 9.4+.

vgram--1.0.sql

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ AS 'MODULE_PATHNAME'
3838
LANGUAGE C IMMUTABLE;
3939

4040
CREATE AGGREGATE qgram_stat(text) (
41-
SFUNC=qgram_stat_transfn,
42-
STYPE=internal,
43-
FINALFUNC=qgram_stat_finalfn
41+
SFUNC = qgram_stat_transfn,
42+
STYPE = internal,
43+
FINALFUNC = qgram_stat_finalfn
4444
);
4545

4646
-- support functions for gin
@@ -59,19 +59,25 @@ RETURNS internal
5959
AS 'MODULE_PATHNAME'
6060
LANGUAGE C IMMUTABLE STRICT;
6161

62-
CREATE FUNCTION vgram_gin_consitent(internal, int2, text, int4, internal, internal, internal, internal)
62+
CREATE FUNCTION vgram_gin_consistent(internal, int2, text, int4, internal, internal, internal, internal)
6363
RETURNS bool
6464
AS 'MODULE_PATHNAME'
6565
LANGUAGE C IMMUTABLE STRICT;
6666

67+
CREATE FUNCTION vgram_gin_triconsistent(internal, int2, text, int4, internal, internal, internal)
68+
RETURNS "char"
69+
AS 'MODULE_PATHNAME'
70+
LANGUAGE C IMMUTABLE STRICT;
71+
6772
CREATE OPERATOR CLASS vgram_gin_ops
6873
FOR TYPE text USING gin
6974
AS
70-
OPERATOR 3 pg_catalog.~~ (text, text),
71-
OPERATOR 4 pg_catalog.~~* (text, text),
72-
FUNCTION 1 vgram_cmp (text, text),
73-
FUNCTION 2 vgram_gin_extract_value (text, internal),
74-
FUNCTION 3 vgram_gin_extract_query (text, internal, int2, internal, internal, internal, internal),
75-
FUNCTION 4 vgram_gin_consitent (internal, int2, text, int4, internal, internal, internal, internal),
76-
STORAGE text;
75+
OPERATOR 3 pg_catalog.~~ (text, text),
76+
OPERATOR 4 pg_catalog.~~* (text, text),
77+
FUNCTION 1 vgram_cmp (text, text),
78+
FUNCTION 2 vgram_gin_extract_value (text, internal),
79+
FUNCTION 3 vgram_gin_extract_query (text, internal, int2, internal, internal, internal, internal),
80+
FUNCTION 4 vgram_gin_consistent (internal, int2, text, int4, internal, internal, internal, internal),
81+
FUNCTION 6 vgram_gin_triconsistent (internal, int2, text, int4, internal, internal, internal),
82+
STORAGE text;
7783

vgram_gin.c

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,13 @@ Datum vgram_gin_extract_value(PG_FUNCTION_ARGS);
2626

2727
PG_FUNCTION_INFO_V1(vgram_gin_extract_value);
2828

29-
Datum vgram_gin_consitent(PG_FUNCTION_ARGS);
29+
Datum vgram_gin_consistent(PG_FUNCTION_ARGS);
3030

31-
PG_FUNCTION_INFO_V1(vgram_gin_consitent);
31+
PG_FUNCTION_INFO_V1(vgram_gin_consistent);
32+
33+
Datum vgram_gin_triconsistent(PG_FUNCTION_ARGS);
34+
35+
PG_FUNCTION_INFO_V1(vgram_gin_triconsistent);
3236

3337
Datum vgram_gin_extract_query(PG_FUNCTION_ARGS);
3438

@@ -146,7 +150,7 @@ vgram_gin_extract_value(PG_FUNCTION_ARGS)
146150
}
147151

148152
Datum
149-
vgram_gin_consitent(PG_FUNCTION_ARGS)
153+
vgram_gin_consistent(PG_FUNCTION_ARGS)
150154
{
151155
bool *check = (bool *) PG_GETARG_POINTER(0);
152156
StrategyNumber strategy = PG_GETARG_UINT16(1);
@@ -186,6 +190,44 @@ vgram_gin_consitent(PG_FUNCTION_ARGS)
186190
PG_RETURN_BOOL(res);
187191
}
188192

193+
Datum
194+
vgram_gin_triconsistent(PG_FUNCTION_ARGS)
195+
{
196+
GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
197+
StrategyNumber strategy = PG_GETARG_UINT16(1);
198+
199+
/* text *query = PG_GETARG_TEXT_P(2); */
200+
int32 nkeys = PG_GETARG_INT32(3);
201+
202+
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
203+
GinTernaryValue res = GIN_MAYBE;
204+
int32 i;
205+
206+
switch (strategy)
207+
{
208+
case ILikeStrategyNumber:
209+
case LikeStrategyNumber:
210+
/* Check if all extracted trigrams are presented. */
211+
for (i = 0; i < nkeys; i++)
212+
{
213+
if (check[i] == GIN_FALSE)
214+
{
215+
res = GIN_FALSE;
216+
break;
217+
}
218+
}
219+
break;
220+
default:
221+
elog(ERROR, "unrecognized strategy number: %d", strategy);
222+
res = false; /* keep compiler quiet */
223+
break;
224+
}
225+
226+
/* All cases served by this function are inexact */
227+
Assert(res != GIN_TRUE);
228+
PG_RETURN_GIN_TERNARY_VALUE(res);
229+
}
230+
189231
Datum
190232
vgram_gin_extract_query(PG_FUNCTION_ARGS)
191233
{

vgram_like.c

Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -156,40 +156,24 @@ get_wildcard_part(const char *str, int lenstr,
156156

157157
typedef struct
158158
{
159-
char *vgram;
160-
float4 selectivity;
159+
char **data;
160+
int count;
161+
int allocated;
161162
} VGramInfo;
162163

163164
static void
164165
addVGram(char *vgram, void *userData)
165166
{
166-
float4 selectivity = estimateVGramSelectivilty(vgram);
167-
float4 placeSelectivity;
168-
int i,
169-
place = -1;
170167
VGramInfo *vgrams = (VGramInfo *) userData;
171168

172-
for (i = 0; i < OPTIMAL_VGRAM_COUNT; i++)
169+
if (vgrams->count >= vgrams->allocated)
173170
{
174-
if (vgrams[i].vgram == NULL)
175-
{
176-
vgrams[i].vgram = vgram;
177-
vgrams[i].selectivity = selectivity;
178-
return;
179-
}
180-
if ((place < 0 && selectivity < vgrams[i].selectivity) ||
181-
(place >= 0 && vgrams[i].selectivity > placeSelectivity))
182-
{
183-
place = i;
184-
placeSelectivity = vgrams[i].selectivity;
185-
}
186-
}
187-
if (place >= 0)
188-
{
189-
pfree(vgrams[place].vgram);
190-
vgrams[place].vgram = vgram;
191-
vgrams[place].selectivity = selectivity;
171+
vgrams->allocated *= 2;
172+
vgrams->data = (char **) repalloc(vgrams->data, sizeof(char *) * vgrams->allocated);
192173
}
174+
175+
vgrams->data[vgrams->count] = vgram;
176+
vgrams->count++;
193177
}
194178

195179

@@ -204,14 +188,16 @@ extractQueryLike(int32 *nentries, text *pattern)
204188
bytelen,
205189
charlen,
206190
i;
207-
VGramInfo vgrams[OPTIMAL_VGRAM_COUNT];
191+
VGramInfo vgrams;
208192
ExtractVGramsInfo userData;
209193
Datum *entries;
210194

211195
userData.callback = addVGram;
212-
userData.userData = (void *) vgrams;
196+
userData.userData = (void *) &vgrams;
213197

214-
memset(vgrams, 0, sizeof(vgrams));
198+
vgrams.count = 0;
199+
vgrams.allocated = 16;
200+
vgrams.data = (char **) palloc(sizeof(char *) * vgrams.allocated);
215201

216202
str = (char *) VARDATA_ANY(pattern);
217203
len = VARSIZE_ANY_EXHDR(pattern);
@@ -230,20 +216,12 @@ extractQueryLike(int32 *nentries, text *pattern)
230216
}
231217
pfree(buf);
232218

233-
*nentries = 0;
234-
while (*nentries < OPTIMAL_VGRAM_COUNT && vgrams[*nentries].vgram)
235-
{
236-
/*
237-
* elog(NOTICE, "%s %f", vgrams[*nentries].vgram,
238-
* vgrams[*nentries].selectivity);
239-
*/
240-
(*nentries)++;
241-
}
219+
*nentries = vgrams.count;
242220

243-
entries = (Datum *) palloc(sizeof(Datum) * (*nentries));
244-
for (i = 0; i < *nentries; i++)
221+
entries = (Datum *) palloc(sizeof(Datum) * vgrams.count);
222+
for (i = 0; i < vgrams.count; i++)
245223
{
246-
entries[i] = PointerGetDatum(cstring_to_text(vgrams[i].vgram));
224+
entries[i] = PointerGetDatum(cstring_to_text(vgrams.data[i]));
247225
}
248226
return entries;
249227
}

0 commit comments

Comments
 (0)