postgresql/src/backend/tsearch/dict_synonym.c

242 lines
4.8 KiB
C

/*-------------------------------------------------------------------------
*
* dict_synonym.c
* Synonym dictionary: replace word by its synonym
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/tsearch/dict_synonym.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/fmgrprotos.h"
/*
 * One synonym mapping loaded from the synonym file.
 */
typedef struct
{
/* word to be replaced; the sort/search key used by compareSyn() */
char *in;
/* replacement word handed back by dsynonym_lexize() */
char *out;
/* number of bytes of 'out' copied into the result lexeme */
int outlen;
/* flags copied into the result lexeme: 0, or TSL_PREFIX (see findwrd) */
uint16 flags;
} Syn;
/*
 * Dictionary state built by dsynonym_init() and consulted by
 * dsynonym_lexize().
 */
typedef struct
{
int len; /* number of entries in syn (allocated capacity while loading) */
/* synonym entries; sorted by 'in' at the end of dsynonym_init() */
Syn *syn;
/* when false, stored words and lookup keys are lowercased before comparing */
bool case_sensitive;
} DictSyn;
/*
 * Locate the next whitespace-delimited word in 'in'.
 *
 * Returns a pointer to the first byte of the word and stores in *end the
 * address just past the word.  If no word is found (only whitespace or an
 * empty string remains), returns NULL, sets *end to NULL and leaves *flags
 * untouched.
 *
 * When 'flags' is non-NULL, a word whose final character is '*' is reported
 * as a prefix word: *flags becomes TSL_PREFIX and *end points at the '*'
 * itself, so the star is excluded from the word.  In every other case with
 * a non-NULL 'flags', *flags is set to 0.
 */
static char *
findwrd(char *in, char **end, uint16 *flags)
{
	char	   *word;
	char	   *tail;

	/* Step over leading whitespace, one (possibly multibyte) char at a time */
	for (; *in && t_isspace(in); in += pg_mblen(in))
		;

	/* Nothing but whitespace: report "no word" */
	if (*in == '\0')
	{
		*end = NULL;
		return NULL;
	}

	word = in;
	tail = in;

	/* Advance to the end of the word, remembering its last character */
	for (; *in && !t_isspace(in); in += pg_mblen(in))
		tail = in;

	/* A single-byte trailing '*' marks a prefix word, if the caller asked */
	if (in - tail == 1 && t_iseq(tail, '*') && flags)
	{
		*flags = TSL_PREFIX;
		*end = tail;
		return word;
	}

	if (flags)
		*flags = 0;
	*end = in;

	return word;
}
static int
compareSyn(const void *a, const void *b)
{
return strcmp(((const Syn *) a)->in, ((const Syn *) b)->in);
}
/*
 * dsynonym_init
 *		Build a synonym dictionary from its option list.
 *
 * Argument 0 is a List of DefElem options:
 *		"synonyms"		- name of the synonym file (required); it is passed
 *						  through get_tsearch_config_filename(..., "syn")
 *		"casesensitive" - boolean, defaults to false
 * Any other option name, or a missing "synonyms" option, raises
 * ERRCODE_INVALID_PARAMETER_VALUE.  Failure to open the file raises
 * ERRCODE_CONFIG_FILE_ERROR.
 *
 * Each line of the file supplies a "word synonym" pair.  Lines with fewer
 * than two words are skipped silently, and anything following the second
 * word is ignored.  A trailing '*' on the synonym is stripped and recorded
 * as TSL_PREFIX in the entry's flags (see findwrd).  Unless the dictionary
 * is case sensitive, both words are stored lowercased.
 *
 * Returns a pointer to a DictSyn whose entries are sorted by input word,
 * as required by the bsearch() in dsynonym_lexize().
 */
Datum
dsynonym_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	DictSyn    *d;
	ListCell   *l;
	char	   *filename = NULL;
	bool		case_sensitive = false;
	tsearch_readline_state trst;
	char	   *starti,
			   *starto,
			   *end = NULL;
	int			cur = 0;
	char	   *line = NULL;
	uint16		flags = 0;

	/* Parse the dictionary options */
	foreach(l, dictoptions)
	{
		DefElem    *defel = (DefElem *) lfirst(l);

		if (strcmp(defel->defname, "synonyms") == 0)
			filename = defGetString(defel);
		else if (strcmp(defel->defname, "casesensitive") == 0)
			case_sensitive = defGetBoolean(defel);
		else
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized synonym parameter: \"%s\"",
							defel->defname)));
	}

	if (!filename)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Synonyms parameter")));

	filename = get_tsearch_config_filename(filename, "syn");

	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						filename)));

	/* Zeroed, so d->len == 0 and d->syn == NULL until the first entry */
	d = (DictSyn *) palloc0(sizeof(DictSyn));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		starti = findwrd(line, &end, NULL);
		if (!starti)
		{
			/* Empty line */
			goto skipline;
		}
		if (*end == '\0')
		{
			/* A line with only one word. Ignore silently. */
			goto skipline;
		}
		*end = '\0';

		/*
		 * NOTE(review): "end + 1" assumes the delimiter that was just
		 * overwritten is a single byte; confirm whether t_isspace() can
		 * match a multibyte whitespace character here.
		 */
		starto = findwrd(end + 1, &end, &flags);
		if (!starto)
		{
			/* A line with only one word (+whitespace). Ignore silently. */
			goto skipline;
		}
		*end = '\0';

		/*
		 * starti now points to the first word, and starto to the second word
		 * on the line, with a \0 terminator at the end of both words.
		 */

		/* Grow the array geometrically, starting at 64 entries */
		if (cur >= d->len)
		{
			if (d->len == 0)
			{
				d->len = 64;
				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
			}
			else
			{
				d->len *= 2;
				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
			}
		}

		if (case_sensitive)
		{
			d->syn[cur].in = pstrdup(starti);
			d->syn[cur].out = pstrdup(starto);
		}
		else
		{
			d->syn[cur].in = lowerstr(starti);
			d->syn[cur].out = lowerstr(starto);
		}

		/*
		 * Measure the string actually stored, not the raw word from the
		 * file: lowercasing a multibyte string is not guaranteed to keep
		 * its byte length, and dsynonym_lexize() copies exactly outlen
		 * bytes of 'out'.  Using strlen(starto) could truncate the synonym.
		 */
		d->syn[cur].outlen = strlen(d->syn[cur].out);
		d->syn[cur].flags = flags;

		cur++;

skipline:
		pfree(line);
	}

	tsearch_readline_end(&trst);

	/* From here on, len is the number of entries rather than the capacity */
	d->len = cur;
	qsort(d->syn, d->len, sizeof(Syn), compareSyn);

	d->case_sensitive = case_sensitive;

	PG_RETURN_POINTER(d);
}
/*
 * dsynonym_lexize
 *		Look up one word in the synonym dictionary.
 *
 * Arguments: 0 = DictSyn built by dsynonym_init(), 1 = pointer to the word,
 * 2 = its length in bytes (int32).
 *
 * Returns NULL when the word is empty, the dictionary has no entries, or
 * the word has no synonym.  Otherwise returns a freshly allocated, zeroed
 * array of two TSLexeme whose first element carries a copy of the synonym
 * and its flags; the second element is left zeroed (presumably the
 * terminator the caller expects).
 */
Datum
dsynonym_lexize(PG_FUNCTION_ARGS)
{
	DictSyn    *dict = (DictSyn *) PG_GETARG_POINTER(0);
	char	   *word = (char *) PG_GETARG_POINTER(1);
	int32		wordlen = PG_GETARG_INT32(2);
	Syn			probe;
	Syn		   *match;
	TSLexeme   *result;

	/*
	 * Nothing to look up for an empty word.  The dictionary-size test also
	 * guards against the Solaris bsearch-of-no-items bug.
	 */
	if (wordlen <= 0 || dict->len <= 0)
		PG_RETURN_POINTER(NULL);

	/* Build the search key the same way the stored words were normalized */
	probe.in = dict->case_sensitive
		? pnstrdup(word, wordlen)
		: lowerstr_with_len(word, wordlen);
	probe.out = NULL;

	match = (Syn *) bsearch(&probe, dict->syn, dict->len,
							sizeof(Syn), compareSyn);

	/* The key copy is only needed for the search itself */
	pfree(probe.in);

	if (match == NULL)
		PG_RETURN_POINTER(NULL);

	result = palloc0(sizeof(TSLexeme) * 2);
	result[0].lexeme = pnstrdup(match->out, match->outlen);
	result[0].flags = match->flags;

	PG_RETURN_POINTER(result);
}