/* postgresql/src/backend/tsearch/ts_parse.c */
/*-------------------------------------------------------------------------
*
* ts_parse.c
* main parse functions for tsearch
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/tsearch/ts_parse.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "varatt.h"
#define IGNORE_LONGLEXEME 1
/*
* Lexize subsystem
*/
/*
 * One raw token emitted by the parser, kept on a singly-linked list while
 * it waits to be lexized.
 */
typedef struct ParsedLex
{
	int			type;			/* token type reported by the parser */
	char	   *lemm;			/* token text; length is in lenlemm */
	int			lenlemm;		/* length of lemm in bytes */
	struct ParsedLex *next;		/* next token in the list */
} ParsedLex;
/* Head/tail pointers of a list of ParsedLex nodes */
typedef struct ListParsedLex
{
	ParsedLex  *head;
	ParsedLex  *tail;
} ListParsedLex;
/* State of the lexize subsystem (driven by LexizeExec) */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* text search configuration in use */
	Oid			curDictId;		/* dictionary currently consuming multiple
								 * lexemes, or InvalidOid in usual mode */
	int			posDict;		/* index in dictionary map to resume from */
	DictSubState dictState;		/* state shared with the dictionary's lexize
								 * function across calls */
	ParsedLex  *curSub;			/* next lexeme to feed in multiword mode */
	ListParsedLex towork;		/* current list to work */
	ListParsedLex waste;		/* list of lexemes that already lexized */

	/*
	 * Fields to store the last successful result while lexizing in multiword
	 * mode (basically for thesaurus or similar dictionaries, which want
	 * several lexemes).
	 */
	ParsedLex  *lastRes;		/* last lexeme that produced tmpRes */
	TSLexeme   *tmpRes;			/* stored partial result, if any */
} LexizeData;
/*
 * Initialize a LexizeData for processing with configuration cfg.
 * Both lexeme lists start empty and no dictionary is active.
 */
static void
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
{
	ld->cfg = cfg;
	ld->curDictId = InvalidOid;
	ld->posDict = 0;
	ld->curSub = NULL;
	ld->towork.head = NULL;
	ld->towork.tail = NULL;
	ld->waste.head = NULL;
	ld->waste.tail = NULL;
	ld->lastRes = NULL;
	ld->tmpRes = NULL;
}
/* Append newpl at the tail of list */
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
	newpl->next = NULL;
	if (list->tail == NULL)
	{
		/* empty list: new node becomes both head and tail */
		list->head = list->tail = newpl;
	}
	else
	{
		list->tail->next = newpl;
		list->tail = newpl;
	}
}
/* Detach and return the head of list, or NULL if the list is empty */
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
	ParsedLex  *removed = list->head;

	if (removed != NULL)
		list->head = removed->next;
	if (list->head == NULL)
		list->tail = NULL;

	return removed;
}
/* Append a freshly parsed token to ld->towork and point curSub at it */
static void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
	ParsedLex  *lex = (ParsedLex *) palloc(sizeof(ParsedLex));

	lex->type = type;
	lex->lemm = lemm;
	lex->lenlemm = lenlemm;
	LPLAddTail(&ld->towork, lex);
	ld->curSub = ld->towork.tail;
}
/* Move the head of towork onto waste and restart the dictionary scan */
static void
RemoveHead(LexizeData *ld)
{
	ParsedLex  *head = LPLRemoveHead(&ld->towork);

	LPLAddTail(&ld->waste, head);
	ld->posDict = 0;
}
/*
 * Hand the waste list to the caller (if correspondLexem is given) or free
 * its nodes, then reset the list to empty.
 */
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
	if (correspondLexem != NULL)
	{
		/* caller takes ownership of the consumed lexemes */
		*correspondLexem = ld->waste.head;
	}
	else
	{
		/* nobody wants them: release each node */
		ParsedLex  *node = ld->waste.head;

		while (node != NULL)
		{
			ParsedLex  *next = node->next;

			pfree(node);
			node = next;
		}
	}
	ld->waste.head = ld->waste.tail = NULL;
}
/*
 * Move towork entries onto the waste list, up to and including stop.
 * curSub is advanced past stop when stop is reached.
 */
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
	while (ld->towork.head)
	{
		bool		hitStop = (ld->towork.head == stop);

		if (hitStop)
			ld->curSub = stop->next;
		RemoveHead(ld);
		if (hitStop)
			break;
	}
}
/*
 * Remember res as the latest partial multiword result, produced while
 * consuming lex; any previously stored result is freed first.
 */
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
	if (ld->tmpRes != NULL)
	{
		TSLexeme   *p;

		/* discard the superseded result, lexeme strings included */
		for (p = ld->tmpRes; p->lexeme; p++)
			pfree(p->lexeme);
		pfree(ld->tmpRes);
	}
	ld->tmpRes = res;
	ld->lastRes = lex;
}
/*
 * Core lexize engine: run lexemes from ld->towork through the dictionaries
 * mapped for their token types.
 *
 * Returns a palloc'd TSLexeme array for the next recognized lexeme(s), or
 * NULL when the towork list is exhausted.  If correspondLexem is not NULL,
 * *correspondLexem receives the list of consumed ParsedLex nodes (caller
 * frees them); otherwise they are freed here (see setCorrLex).
 *
 * Operates in two modes: the usual mode, where each dictionary is offered
 * one lexeme at a time, and a multiword mode (ld->curDictId valid), entered
 * when a dictionary sets dictState.getnext to ask for following lexemes
 * (thesaurus-style dictionaries).
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */
		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			/*
			 * NOTE(review): map is computed before the lenmap bound check
			 * below; it is not dereferenced unless the check passes (the ||
			 * short-circuits), so this is safe in practice.
			 */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* offer the lexeme to each mapped dictionary, in order */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */
					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/* filtering dictionary: substitute the lexeme and keep going */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* recognized: consume the lexeme and return the result */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary recognized it; drop the lexeme */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */
		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that current type of lexeme is recognized
				 * by our dictionary: we just check is it exist in list of
				 * dictionaries ?
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* type == 0 marks end of input for the dictionary */
			ld->dictState.isend = (curVal->type == 0);
			ld->dictState.getnext = false;
			res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the last stored partial result */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict don't want next lexem and didn't recognize anything, redo
			 * from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing left to produce */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
/*
 * Parse string and lexize words.
 *
 * cfgId identifies the text search configuration; buf/buflen is the input
 * text.  prs will be filled in: one ParsedWord per normalized lexeme, with
 * positions assigned as tokens are consumed.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
	int			type,
				lenlemm = 0;	/* silence compiler warning */
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* start the parser over the input text */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch next raw token; type <= 0 signals end of input */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			/* over-length token: warn and skip it */
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/* drain all normalized lexemes produced so far */
		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
		{
			TSLexeme   *ptr = norms;

			prs->pos++;			/* set pos */

			while (ptr->lexeme)
			{
				if (prs->curwords == prs->lenwords)
				{
					/* double the output array when full */
					prs->lenwords *= 2;
					prs->words = (ParsedWord *) repalloc(prs->words, prs->lenwords * sizeof(ParsedWord));
				}

				if (ptr->flags & TSL_ADDPOS)
					prs->pos++;
				prs->words[prs->curwords].len = strlen(ptr->lexeme);
				prs->words[prs->curwords].word = ptr->lexeme;
				prs->words[prs->curwords].nvariant = ptr->nvariant;
				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
				prs->words[prs->curwords].alen = 0;
				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
				ptr++;
				prs->curwords++;
			}
			pfree(norms);
		}
	} while (type > 0);

	/* shut down the parser */
	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
/*
* Headline framework
*/
/* Append a copy of buf (a raw token of the given type) to prs->words[] */
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
	HeadlineWordEntry *entry;

	/* grow the array when full */
	if (prs->curwords >= prs->lenwords)
	{
		prs->lenwords *= 2;
		prs->words = (HeadlineWordEntry *) repalloc(prs->words,
													prs->lenwords * sizeof(HeadlineWordEntry));
	}

	entry = &prs->words[prs->curwords++];
	memset(entry, 0, sizeof(HeadlineWordEntry));
	entry->type = (uint8) type;
	entry->len = buflen;
	entry->word = palloc(buflen);
	memcpy(entry->word, buf, buflen);
}
/*
 * Add pos and matching-query-item data to the just-added word.
 * Here, buf/buflen represent a processed lexeme, not raw token text.
 *
 * If the query contains more than one matching item, we replicate
 * the last-added word so that each item can be pointed to. The
 * duplicate entries are marked with repeated = 1.
 */
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
	int			qi;
	QueryItem  *qitem = GETQUERY(query);
	HeadlineWordEntry *word;

	/* pre-grow so possible duplicates fit without reallocating mid-loop */
	while (prs->curwords + query->size >= prs->lenwords)
	{
		prs->lenwords *= 2;
		prs->words = (HeadlineWordEntry *) repalloc(prs->words,
													prs->lenwords * sizeof(HeadlineWordEntry));
	}

	word = &prs->words[prs->curwords - 1];
	word->pos = LIMITPOS(pos);

	for (qi = 0; qi < query->size; qi++, qitem++)
	{
		if (qitem->type != QI_VAL)
			continue;
		if (tsCompareString(GETOPERAND(query) + qitem->qoperand.distance,
							qitem->qoperand.length,
							buf, buflen,
							qitem->qoperand.prefix) != 0)
			continue;

		if (word->item == NULL)
		{
			/* first match points straight at the word itself */
			word->item = &qitem->qoperand;
		}
		else
		{
			/* further matches get replicated entries flagged as repeated */
			prs->words[prs->curwords] = *word;
			prs->words[prs->curwords].item = &qitem->qoperand;
			prs->words[prs->curwords].repeated = 1;
			prs->curwords++;
		}
	}
}
/*
 * Record the raw tokens in lexs into prs and associate each one with the
 * normalized lexemes in norms (matching them against query).  Both the
 * lexs list and the norms array (including lexeme strings) are freed.
 */
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
	TSLexeme   *lex;

	while (lexs != NULL)
	{
		ParsedLex  *next = lexs->next;
		int32		pos = prs->vectorpos;

		if (lexs->type > 0)
			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

		/* link every normalized form to the word just added */
		for (lex = norms; lex && lex->lexeme; lex++)
		{
			if (lex->flags & TSL_ADDPOS)
				pos++;
			hlfinditem(prs, query, pos, lex->lexeme, strlen(lex->lexeme));
		}

		pfree(lexs);
		lexs = next;
	}

	if (norms != NULL)
	{
		/* advance vectorpos for TSL_ADDPOS entries and free the array */
		for (lex = norms; lex->lexeme; lex++)
		{
			if (lex->flags & TSL_ADDPOS)
				prs->vectorpos++;
			pfree(lex->lexeme);
		}
		pfree(norms);
	}
}
/*
 * Parse and lexize text for headline generation.
 *
 * Like parsetext(), but keeps the raw tokens (via the correspondLexem
 * output of LexizeExec) and matches normalized lexemes against query,
 * filling prs->words for generateHeadline().
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
	int			type,
				lenlemm = 0;	/* silence compiler warning */
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	ParsedLex  *lexs;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* start the parser over the input text */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch next raw token; type <= 0 signals end of input */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			/* over-length token: warn and skip it */
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/* drain results, recording raw tokens even when nothing was lexized */
		do
		{
			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
			{
				prs->vectorpos++;
				addHLParsedLex(prs, query, lexs, norms);
			}
			else
				addHLParsedLex(prs, query, lexs, NULL);
		} while (norms);
	} while (type > 0);

	/* shut down the parser */
	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
/*
 * Generate the headline, as a text object, from HeadlineParsedText.
 *
 * Words marked "in" are emitted (wrapped in startsel/stopsel when
 * selected); consecutive in-words form fragments separated by fragdelim.
 * Entries marked repeated are duplicates created by hlfinditem and are
 * skipped for output purposes.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{
	text	   *out;
	char	   *ptr;
	int			len = 128;
	int			numfragments = 0;
	int16		infrag = 0;

	HeadlineWordEntry *wrd = prs->words;

	out = (text *) palloc(len);
	ptr = ((char *) out) + VARHDRSZ;

	while (wrd - prs->words < prs->curwords)
	{
		/* grow the buffer until the worst-case addition for this word fits */
		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
		{
			int			dist = ptr - ((char *) out);

			len *= 2;
			out = (text *) repalloc(out, len);
			ptr = ((char *) out) + dist;	/* re-anchor after repalloc */
		}

		if (wrd->in && !wrd->repeated)
		{
			if (!infrag)
			{
				/* start of a new fragment */
				infrag = 1;
				numfragments++;
				/* add a fragment delimiter if this is after the first one */
				if (numfragments > 1)
				{
					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
					ptr += prs->fragdelimlen;
				}
			}
			if (wrd->replace)
			{
				/* replaced word contributes a single space */
				*ptr = ' ';
				ptr++;
			}
			else if (!wrd->skip)
			{
				if (wrd->selected)
				{
					memcpy(ptr, prs->startsel, prs->startsellen);
					ptr += prs->startsellen;
				}
				memcpy(ptr, wrd->word, wrd->len);
				ptr += wrd->len;
				if (wrd->selected)
				{
					memcpy(ptr, prs->stopsel, prs->stopsellen);
					ptr += prs->stopsellen;
				}
			}
		}
		else if (!wrd->repeated)
		{
			/* word not in headline: fragment (if any) ends here */
			if (infrag)
				infrag = 0;
			pfree(wrd->word);
		}

		wrd++;
	}

	SET_VARSIZE(out, ptr - ((char *) out));
	return out;
}