|
|
|
|
/*-------------------------------------------------------------------------
 *
 * tsquery.c
 *	  I/O functions for tsquery
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsquery.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
|
|
#include "libpq/pqformat.h"
|
|
|
|
|
#include "miscadmin.h"
|
|
|
|
|
#include "nodes/miscnodes.h"
|
|
|
|
|
#include "tsearch/ts_locale.h"
|
|
|
|
|
#include "tsearch/ts_type.h"
|
|
|
|
|
#include "tsearch/ts_utils.h"
|
|
|
|
|
#include "utils/builtins.h"
|
|
|
|
|
#include "utils/memutils.h"
|
|
|
|
|
#include "utils/pg_crc.h"
|
|
|
|
|
|
|
|
|
|
/* FTS operator priorities, see ts_type.h */
|
|
|
|
|
const int tsearch_op_priority[OP_COUNT] =
|
|
|
|
|
{
|
|
|
|
|
4, /* OP_NOT */
|
|
|
|
|
2, /* OP_AND */
|
|
|
|
|
1, /* OP_OR */
|
|
|
|
|
3 /* OP_PHRASE */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/*
 * parser's states
 *
 * These tell the tokenizer functions below what kind of token they should
 * look for next.
 */
typedef enum
{
	WAITOPERAND = 1,			/* expecting an operand */
	WAITOPERATOR = 2,			/* expecting an operator */
	WAITFIRSTOPERAND = 3		/* expecting the very first operand; the
								 * tokenizers return PT_END rather than an
								 * error if none is found in this state */
} ts_parserstate;
|
|
|
|
|
|
|
|
|
|
/*
 * token types for parsing
 *
 * These are the values a tokenizer function hands back to makepol().
 */
typedef enum
{
	PT_END = 0,					/* no more tokens */
	PT_ERR = 1,					/* error; see the ts_tokenizer contract */
	PT_VAL = 2,					/* an operand */
	PT_OPR = 3,					/* an operator */
	PT_OPEN = 4,				/* opening parenthesis */
	PT_CLOSE = 5				/* closing parenthesis */
} ts_tokentype;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* get token from query string
|
|
|
|
|
*
|
|
|
|
|
* All arguments except "state" are output arguments.
|
|
|
|
|
*
|
|
|
|
|
* If return value is PT_OPR, then *operator is filled with an OP_* code
|
|
|
|
|
* and *weight will contain a distance value in case of phrase operator.
|
|
|
|
|
*
|
|
|
|
|
* If return value is PT_VAL, then *lenval, *strval, *weight, and *prefix
|
|
|
|
|
* are filled.
|
|
|
|
|
*
|
|
|
|
|
* If PT_ERR is returned then a soft error has occurred. If state->escontext
|
|
|
|
|
* isn't already filled then this should be reported as a generic parse error.
|
|
|
|
|
*/
|
|
|
|
|
typedef ts_tokentype (*ts_tokenizer) (TSQueryParserState state, int8 *operator,
|
|
|
|
|
int *lenval, char **strval,
|
|
|
|
|
int16 *weight, bool *prefix);
|
|
|
|
|
|
|
|
|
|
struct TSQueryParserStateData
|
|
|
|
|
{
|
|
|
|
|
/* Tokenizer used for parsing tsquery */
|
|
|
|
|
ts_tokenizer gettoken;
|
|
|
|
|
|
|
|
|
|
/* State of tokenizer function */
|
|
|
|
|
char *buffer; /* entire string we are scanning */
|
|
|
|
|
char *buf; /* current scan point */
|
|
|
|
|
int count; /* nesting count, incremented by (,
|
|
|
|
|
* decremented by ) */
|
|
|
|
|
ts_parserstate state;
|
|
|
|
|
|
|
|
|
|
/* polish (prefix) notation in list, filled in by push* functions */
|
|
|
|
|
List *polstr;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Strings from operands are collected in op. curop is a pointer to the
|
|
|
|
|
* end of used space of op.
|
|
|
|
|
*/
|
|
|
|
|
char *op;
|
|
|
|
|
char *curop;
|
|
|
|
|
int lenop; /* allocated size of op */
|
|
|
|
|
int sumlen; /* used size of op */
|
|
|
|
|
|
|
|
|
|
/* state for value's parser */
|
|
|
|
|
TSVectorParseState valstate;
|
|
|
|
|
|
|
|
|
|
/* context object for soft errors - must match valstate's escontext */
|
|
|
|
|
Node *escontext;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* subroutine to parse the modifiers (weight and prefix flag currently)
|
|
|
|
|
* part, like ':AB*' of a query.
|
|
|
|
|
*/
|
|
|
|
|
static char *
|
|
|
|
|
get_modifiers(char *buf, int16 *weight, bool *prefix)
|
|
|
|
|
{
|
|
|
|
|
*weight = 0;
|
|
|
|
|
*prefix = false;
|
|
|
|
|
|
|
|
|
|
if (!t_iseq(buf, ':'))
|
|
|
|
|
return buf;
|
|
|
|
|
|
|
|
|
|
buf++;
|
|
|
|
|
while (*buf && pg_mblen(buf) == 1)
|
|
|
|
|
{
|
|
|
|
|
switch (*buf)
|
|
|
|
|
{
|
|
|
|
|
case 'a':
|
|
|
|
|
case 'A':
|
|
|
|
|
*weight |= 1 << 3;
|
|
|
|
|
break;
|
|
|
|
|
case 'b':
|
|
|
|
|
case 'B':
|
|
|
|
|
*weight |= 1 << 2;
|
|
|
|
|
break;
|
|
|
|
|
case 'c':
|
|
|
|
|
case 'C':
|
|
|
|
|
*weight |= 1 << 1;
|
|
|
|
|
break;
|
|
|
|
|
case 'd':
|
|
|
|
|
case 'D':
|
|
|
|
|
*weight |= 1;
|
|
|
|
|
break;
|
|
|
|
|
case '*':
|
|
|
|
|
*prefix = true;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
return buf;
|
|
|
|
|
}
|
|
|
|
|
buf++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return buf;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Parse phrase operator. The operator
|
|
|
|
|
* may take the following forms:
|
|
|
|
|
*
|
|
|
|
|
* a <N> b (distance is exactly N lexemes)
|
|
|
|
|
* a <-> b (default distance = 1)
|
|
|
|
|
*
|
|
|
|
|
* The buffer should begin with '<' char
|
|
|
|
|
*/
|
|
|
|
|
static bool
|
|
|
|
|
parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
|
|
|
|
|
{
|
|
|
|
|
enum
|
|
|
|
|
{
|
|
|
|
|
PHRASE_OPEN = 0,
|
|
|
|
|
PHRASE_DIST,
|
|
|
|
|
PHRASE_CLOSE,
|
|
|
|
|
PHRASE_FINISH
|
|
|
|
|
} state = PHRASE_OPEN;
|
|
|
|
|
char *ptr = pstate->buf;
|
|
|
|
|
char *endptr;
|
|
|
|
|
long l = 1; /* default distance */
|
|
|
|
|
|
|
|
|
|
while (*ptr)
|
|
|
|
|
{
|
|
|
|
|
switch (state)
|
|
|
|
|
{
|
|
|
|
|
case PHRASE_OPEN:
|
|
|
|
|
if (t_iseq(ptr, '<'))
|
|
|
|
|
{
|
|
|
|
|
state = PHRASE_DIST;
|
|
|
|
|
ptr++;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case PHRASE_DIST:
|
|
|
|
|
if (t_iseq(ptr, '-'))
|
|
|
|
|
{
|
|
|
|
|
state = PHRASE_CLOSE;
|
|
|
|
|
ptr++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!t_isdigit(ptr))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
errno = 0;
|
|
|
|
|
l = strtol(ptr, &endptr, 10);
|
|
|
|
|
if (ptr == endptr)
|
|
|
|
|
return false;
|
|
|
|
|
else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS)
|
|
|
|
|
ereturn(pstate->escontext, false,
|
|
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
|
|
|
errmsg("distance in phrase operator must be an integer value between zero and %d inclusive",
|
|
|
|
|
MAXENTRYPOS)));
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
state = PHRASE_CLOSE;
|
|
|
|
|
ptr = endptr;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case PHRASE_CLOSE:
|
|
|
|
|
if (t_iseq(ptr, '>'))
|
|
|
|
|
{
|
|
|
|
|
state = PHRASE_FINISH;
|
|
|
|
|
ptr++;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case PHRASE_FINISH:
|
|
|
|
|
*distance = (int16) l;
|
|
|
|
|
pstate->buf = ptr;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Parse OR operator used in websearch_to_tsquery(), returns true if we
|
|
|
|
|
* believe that "OR" literal could be an operator OR
|
|
|
|
|
*/
|
|
|
|
|
static bool
|
|
|
|
|
parse_or_operator(TSQueryParserState pstate)
|
|
|
|
|
{
|
|
|
|
|
char *ptr = pstate->buf;
|
|
|
|
|
|
|
|
|
|
/* it should begin with "OR" literal */
|
|
|
|
|
if (pg_strncasecmp(ptr, "or", 2) != 0)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
ptr += 2;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* it shouldn't be a part of any word but somewhere later it should be
|
|
|
|
|
* some operand
|
|
|
|
|
*/
|
|
|
|
|
if (*ptr == '\0') /* no operand */
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* it shouldn't be a part of any word */
|
|
|
|
|
if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (;;)
|
|
|
|
|
{
|
|
|
|
|
ptr += pg_mblen(ptr);
|
|
|
|
|
|
|
|
|
|
if (*ptr == '\0') /* got end of string without operand */
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Suppose, we found an operand, but could be a not correct operand.
|
|
|
|
|
* So we still treat OR literal as operation with possibly incorrect
|
|
|
|
|
* operand and will not search it as lexeme
|
|
|
|
|
*/
|
|
|
|
|
if (!t_isspace(ptr))
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pstate->buf += 2;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static ts_tokentype
|
|
|
|
|
gettoken_query_standard(TSQueryParserState state, int8 *operator,
|
|
|
|
|
int *lenval, char **strval,
|
|
|
|
|
int16 *weight, bool *prefix)
|
|
|
|
|
{
|
|
|
|
|
*weight = 0;
|
|
|
|
|
*prefix = false;
|
|
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
|
{
|
|
|
|
|
switch (state->state)
|
|
|
|
|
{
|
|
|
|
|
case WAITFIRSTOPERAND:
|
|
|
|
|
case WAITOPERAND:
|
|
|
|
|
if (t_iseq(state->buf, '!'))
|
|
|
|
|
{
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
*operator = OP_NOT;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (t_iseq(state->buf, '('))
|
|
|
|
|
{
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
state->count++;
|
|
|
|
|
return PT_OPEN;
|
|
|
|
|
}
|
|
|
|
|
else if (t_iseq(state->buf, ':'))
|
|
|
|
|
{
|
|
|
|
|
/* generic syntax error message is fine */
|
|
|
|
|
return PT_ERR;
|
|
|
|
|
}
|
|
|
|
|
else if (!t_isspace(state->buf))
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* We rely on the tsvector parser to parse the value for
|
|
|
|
|
* us
|
|
|
|
|
*/
|
|
|
|
|
reset_tsvector_parser(state->valstate, state->buf);
|
|
|
|
|
if (gettoken_tsvector(state->valstate, strval, lenval,
|
|
|
|
|
NULL, NULL, &state->buf))
|
|
|
|
|
{
|
|
|
|
|
state->buf = get_modifiers(state->buf, weight, prefix);
|
|
|
|
|
state->state = WAITOPERATOR;
|
|
|
|
|
return PT_VAL;
|
|
|
|
|
}
|
|
|
|
|
else if (SOFT_ERROR_OCCURRED(state->escontext))
|
|
|
|
|
{
|
|
|
|
|
/* gettoken_tsvector reported a soft error */
|
|
|
|
|
return PT_ERR;
|
|
|
|
|
}
|
|
|
|
|
else if (state->state == WAITFIRSTOPERAND)
|
|
|
|
|
{
|
|
|
|
|
return PT_END;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
ereturn(state->escontext, PT_ERR,
|
|
|
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
|
|
|
errmsg("no operand in tsquery: \"%s\"",
|
|
|
|
|
state->buffer)));
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case WAITOPERATOR:
|
|
|
|
|
if (t_iseq(state->buf, '&'))
|
|
|
|
|
{
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
*operator = OP_AND;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (t_iseq(state->buf, '|'))
|
|
|
|
|
{
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
*operator = OP_OR;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (parse_phrase_operator(state, weight))
|
|
|
|
|
{
|
|
|
|
|
/* weight var is used as storage for distance */
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
*operator = OP_PHRASE;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (SOFT_ERROR_OCCURRED(state->escontext))
|
|
|
|
|
{
|
|
|
|
|
/* parse_phrase_operator reported a soft error */
|
|
|
|
|
return PT_ERR;
|
|
|
|
|
}
|
|
|
|
|
else if (t_iseq(state->buf, ')'))
|
|
|
|
|
{
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->count--;
|
|
|
|
|
return (state->count < 0) ? PT_ERR : PT_CLOSE;
|
|
|
|
|
}
|
|
|
|
|
else if (*state->buf == '\0')
|
|
|
|
|
{
|
|
|
|
|
return (state->count) ? PT_ERR : PT_END;
|
|
|
|
|
}
|
|
|
|
|
else if (!t_isspace(state->buf))
|
|
|
|
|
{
|
|
|
|
|
return PT_ERR;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
state->buf += pg_mblen(state->buf);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static ts_tokentype
|
|
|
|
|
gettoken_query_websearch(TSQueryParserState state, int8 *operator,
|
|
|
|
|
int *lenval, char **strval,
|
|
|
|
|
int16 *weight, bool *prefix)
|
|
|
|
|
{
|
|
|
|
|
*weight = 0;
|
|
|
|
|
*prefix = false;
|
|
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
|
{
|
|
|
|
|
switch (state->state)
|
|
|
|
|
{
|
|
|
|
|
case WAITFIRSTOPERAND:
|
|
|
|
|
case WAITOPERAND:
|
|
|
|
|
if (t_iseq(state->buf, '-'))
|
|
|
|
|
{
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
|
|
|
|
|
*operator = OP_NOT;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (t_iseq(state->buf, '"'))
|
|
|
|
|
{
|
|
|
|
|
/* Everything in quotes is processed as a single token */
|
|
|
|
|
|
|
|
|
|
/* skip opening quote */
|
|
|
|
|
state->buf++;
|
|
|
|
|
*strval = state->buf;
|
|
|
|
|
|
|
|
|
|
/* iterate to the closing quote or end of the string */
|
|
|
|
|
while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
|
|
|
|
|
state->buf++;
|
|
|
|
|
*lenval = state->buf - *strval;
|
|
|
|
|
|
|
|
|
|
/* skip closing quote if not end of the string */
|
|
|
|
|
if (*state->buf != '\0')
|
|
|
|
|
state->buf++;
|
|
|
|
|
|
|
|
|
|
state->state = WAITOPERATOR;
|
|
|
|
|
state->count++;
|
|
|
|
|
return PT_VAL;
|
|
|
|
|
}
|
|
|
|
|
else if (ISOPERATOR(state->buf))
|
|
|
|
|
{
|
|
|
|
|
/* or else gettoken_tsvector() will raise an error */
|
|
|
|
|
state->buf++;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
else if (!t_isspace(state->buf))
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* We rely on the tsvector parser to parse the value for
|
|
|
|
|
* us
|
|
|
|
|
*/
|
|
|
|
|
reset_tsvector_parser(state->valstate, state->buf);
|
|
|
|
|
if (gettoken_tsvector(state->valstate, strval, lenval,
|
|
|
|
|
NULL, NULL, &state->buf))
|
|
|
|
|
{
|
|
|
|
|
state->state = WAITOPERATOR;
|
|
|
|
|
return PT_VAL;
|
|
|
|
|
}
|
|
|
|
|
else if (SOFT_ERROR_OCCURRED(state->escontext))
|
|
|
|
|
{
|
|
|
|
|
/* gettoken_tsvector reported a soft error */
|
|
|
|
|
return PT_ERR;
|
|
|
|
|
}
|
|
|
|
|
else if (state->state == WAITFIRSTOPERAND)
|
|
|
|
|
{
|
|
|
|
|
return PT_END;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
/* finally, we have to provide an operand */
|
|
|
|
|
pushStop(state);
|
|
|
|
|
return PT_END;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case WAITOPERATOR:
|
|
|
|
|
if (t_iseq(state->buf, '"'))
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* put implicit AND after an operand and handle this quote
|
|
|
|
|
* in WAITOPERAND
|
|
|
|
|
*/
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
*operator = OP_AND;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (parse_or_operator(state))
|
|
|
|
|
{
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
*operator = OP_OR;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
else if (*state->buf == '\0')
|
|
|
|
|
{
|
|
|
|
|
return PT_END;
|
|
|
|
|
}
|
|
|
|
|
else if (!t_isspace(state->buf))
|
|
|
|
|
{
|
|
|
|
|
/* put implicit AND after an operand */
|
|
|
|
|
*operator = OP_AND;
|
|
|
|
|
state->state = WAITOPERAND;
|
|
|
|
|
return PT_OPR;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
state->buf += pg_mblen(state->buf);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static ts_tokentype
|
|
|
|
|
gettoken_query_plain(TSQueryParserState state, int8 *operator,
|
|
|
|
|
int *lenval, char **strval,
|
|
|
|
|
int16 *weight, bool *prefix)
|
|
|
|
|
{
|
|
|
|
|
*weight = 0;
|
|
|
|
|
*prefix = false;
|
|
|
|
|
|
|
|
|
|
if (*state->buf == '\0')
|
|
|
|
|
return PT_END;
|
|
|
|
|
|
|
|
|
|
*strval = state->buf;
|
|
|
|
|
*lenval = strlen(state->buf);
|
|
|
|
|
state->buf += *lenval;
|
|
|
|
|
state->count++;
|
|
|
|
|
return PT_VAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Push an operator to state->polstr
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
pushOperator(TSQueryParserState state, int8 oper, int16 distance)
|
|
|
|
|
{
|
|
|
|
|
QueryOperator *tmp;
|
|
|
|
|
|
|
|
|
|
Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);
|
|
|
|
|
|
|
|
|
|
tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
|
|
|
|
|
tmp->type = QI_OPR;
|
|
|
|
|
tmp->oper = oper;
|
|
|
|
|
tmp->distance = (oper == OP_PHRASE) ? distance : 0;
|
|
|
|
|
/* left is filled in later with findoprnd */
|
|
|
|
|
|
|
|
|
|
state->polstr = lcons(tmp, state->polstr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
|
|
|
|
|
{
|
|
|
|
|
QueryOperand *tmp;
|
|
|
|
|
|
|
|
|
|
if (distance >= MAXSTRPOS)
|
|
|
|
|
ereturn(state->escontext,,
|
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
|
errmsg("value is too big in tsquery: \"%s\"",
|
|
|
|
|
state->buffer)));
|
|
|
|
|
if (lenval >= MAXSTRLEN)
|
|
|
|
|
ereturn(state->escontext,,
|
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
|
errmsg("operand is too long in tsquery: \"%s\"",
|
|
|
|
|
state->buffer)));
|
|
|
|
|
|
|
|
|
|
tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
|
|
|
|
|
tmp->type = QI_VAL;
|
|
|
|
|
tmp->weight = weight;
|
|
|
|
|
tmp->prefix = prefix;
|
|
|
|
|
tmp->valcrc = (int32) valcrc;
|
|
|
|
|
tmp->length = lenval;
|
|
|
|
|
tmp->distance = distance;
|
|
|
|
|
|
|
|
|
|
state->polstr = lcons(tmp, state->polstr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Push an operand to state->polstr.
|
|
|
|
|
*
|
|
|
|
|
* strval must point to a string equal to state->curop. lenval is the length
|
|
|
|
|
* of the string.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
|
|
|
|
|
{
|
|
|
|
|
pg_crc32 valcrc;
|
|
|
|
|
|
|
|
|
|
if (lenval >= MAXSTRLEN)
|
|
|
|
|
ereturn(state->escontext,,
|
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
|
errmsg("word is too long in tsquery: \"%s\"",
|
|
|
|
|
state->buffer)));
|
|
|
|
|
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
11 years ago
|
|
|
INIT_LEGACY_CRC32(valcrc);
|
|
|
|
|
COMP_LEGACY_CRC32(valcrc, strval, lenval);
|
|
|
|
|
FIN_LEGACY_CRC32(valcrc);
|
|
|
|
|
pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight, prefix);
|
|
|
|
|
|
|
|
|
|
/* append the value string to state.op, enlarging buffer if needed first */
|
|
|
|
|
while (state->curop - state->op + lenval + 1 >= state->lenop)
|
|
|
|
|
{
|
|
|
|
|
int used = state->curop - state->op;
|
|
|
|
|
|
|
|
|
|
state->lenop *= 2;
|
|
|
|
|
state->op = (char *) repalloc((void *) state->op, state->lenop);
|
|
|
|
|
state->curop = state->op + used;
|
|
|
|
|
}
|
|
|
|
|
memcpy((void *) state->curop, (void *) strval, lenval);
|
|
|
|
|
state->curop += lenval;
|
|
|
|
|
*(state->curop) = '\0';
|
|
|
|
|
state->curop++;
|
|
|
|
|
state->sumlen += lenval + 1 /* \0 */ ;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Push a stopword placeholder to state->polstr
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
pushStop(TSQueryParserState state)
|
|
|
|
|
{
|
|
|
|
|
QueryOperand *tmp;
|
|
|
|
|
|
|
|
|
|
tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
|
|
|
|
|
tmp->type = QI_VALSTOP;
|
|
|
|
|
|
|
|
|
|
state->polstr = lcons(tmp, state->polstr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* capacity of the operator stack that makepol() keeps at each level */
#define STACKDEPTH 32
|
|
|
|
|
|
|
|
|
|
typedef struct OperatorElement
|
|
|
|
|
{
|
|
|
|
|
int8 op;
|
|
|
|
|
int16 distance;
|
|
|
|
|
} OperatorElement;
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
pushOpStack(OperatorElement *stack, int *lenstack, int8 op, int16 distance)
|
|
|
|
|
{
|
|
|
|
|
if (*lenstack == STACKDEPTH) /* internal error */
|
|
|
|
|
elog(ERROR, "tsquery stack too small");
|
|
|
|
|
|
|
|
|
|
stack[*lenstack].op = op;
|
|
|
|
|
stack[*lenstack].distance = distance;
|
|
|
|
|
|
|
|
|
|
(*lenstack)++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
cleanOpStack(TSQueryParserState state,
|
|
|
|
|
OperatorElement *stack, int *lenstack, int8 op)
|
|
|
|
|
{
|
|
|
|
|
int opPriority = OP_PRIORITY(op);
|
|
|
|
|
|
|
|
|
|
while (*lenstack)
|
|
|
|
|
{
|
|
|
|
|
/* NOT is right associative unlike to others */
|
|
|
|
|
if ((op != OP_NOT && opPriority > OP_PRIORITY(stack[*lenstack - 1].op)) ||
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
9 years ago
|
|
|
(op == OP_NOT && opPriority >= OP_PRIORITY(stack[*lenstack - 1].op)))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
(*lenstack)--;
|
|
|
|
|
pushOperator(state, stack[*lenstack].op,
|
|
|
|
|
stack[*lenstack].distance);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Make polish (prefix) notation of query.
|
|
|
|
|
*
|
|
|
|
|
* See parse_tsquery for explanation of pushval.
|
|
|
|
|
*/
|
|
|
|
|
static void
|
|
|
|
|
makepol(TSQueryParserState state,
|
|
|
|
|
PushFunction pushval,
|
|
|
|
|
Datum opaque)
|
|
|
|
|
{
|
|
|
|
|
int8 operator = 0;
|
|
|
|
|
ts_tokentype type;
|
|
|
|
|
int lenval = 0;
|
|
|
|
|
char *strval = NULL;
|
|
|
|
|
OperatorElement opstack[STACKDEPTH];
|
|
|
|
|
int lenstack = 0;
|
|
|
|
|
int16 weight = 0;
|
|
|
|
|
bool prefix;
|
|
|
|
|
|
|
|
|
|
/* since this function recurses, it could be driven to stack overflow */
|
|
|
|
|
check_stack_depth();
|
|
|
|
|
|
|
|
|
|
while ((type = state->gettoken(state, &operator,
|
|
|
|
|
&lenval, &strval,
|
|
|
|
|
&weight, &prefix)) != PT_END)
|
|
|
|
|
{
|
|
|
|
|
switch (type)
|
|
|
|
|
{
|
|
|
|
|
case PT_VAL:
|
|
|
|
|
pushval(opaque, state, strval, lenval, weight, prefix);
|
|
|
|
|
break;
|
|
|
|
|
case PT_OPR:
|
|
|
|
|
cleanOpStack(state, opstack, &lenstack, operator);
|
|
|
|
|
pushOpStack(opstack, &lenstack, operator, weight);
|
|
|
|
|
break;
|
|
|
|
|
case PT_OPEN:
|
|
|
|
|
makepol(state, pushval, opaque);
|
|
|
|
|
break;
|
|
|
|
|
case PT_CLOSE:
|
|
|
|
|
cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
|
|
|
|
|
return;
|
|
|
|
|
case PT_ERR:
|
|
|
|
|
default:
|
|
|
|
|
/* don't overwrite a soft error saved by gettoken function */
|
|
|
|
|
if (!SOFT_ERROR_OCCURRED(state->escontext))
|
|
|
|
|
errsave(state->escontext,
|
|
|
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
|
|
|
errmsg("syntax error in tsquery: \"%s\"",
|
|
|
|
|
state->buffer)));
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
/* detect soft error in pushval or recursion */
|
|
|
|
|
if (SOFT_ERROR_OCCURRED(state->escontext))
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
|
|
|
|
|
{
|
|
|
|
|
/* since this function recurses, it could be driven to stack overflow. */
|
|
|
|
|
check_stack_depth();
|
|
|
|
|
|
|
|
|
|
if (*pos >= nnodes)
|
|
|
|
|
elog(ERROR, "malformed tsquery: operand not found");
|
|
|
|
|
|
|
|
|
|
if (ptr[*pos].type == QI_VAL)
|
|
|
|
|
{
|
|
|
|
|
(*pos)++;
|
|
|
|
|
}
|
|
|
|
|
else if (ptr[*pos].type == QI_VALSTOP)
|
|
|
|
|
{
|
|
|
|
|
*needcleanup = true; /* we'll have to remove stop words */
|
|
|
|
|
(*pos)++;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
Assert(ptr[*pos].type == QI_OPR);
|
|
|
|
|
|
|
|
|
|
if (ptr[*pos].qoperator.oper == OP_NOT)
|
|
|
|
|
{
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
9 years ago
|
|
|
ptr[*pos].qoperator.left = 1; /* fixed offset */
|
|
|
|
|
(*pos)++;
|
|
|
|
|
|
|
|
|
|
/* process the only argument */
|
|
|
|
|
findoprnd_recurse(ptr, pos, nnodes, needcleanup);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
QueryOperator *curitem = &ptr[*pos].qoperator;
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
9 years ago
|
|
|
int tmp = *pos; /* save current position */
|
|
|
|
|
|
|
|
|
|
Assert(curitem->oper == OP_AND ||
|
|
|
|
|
curitem->oper == OP_OR ||
|
|
|
|
|
curitem->oper == OP_PHRASE);
|
|
|
|
|
|
|
|
|
|
(*pos)++;
|
|
|
|
|
|
|
|
|
|
/* process RIGHT argument */
|
|
|
|
|
findoprnd_recurse(ptr, pos, nnodes, needcleanup);
|
Fix strange behavior (and possible crashes) in full text phrase search.
In an attempt to simplify the tsquery matching engine, the original
phrase search patch invented rewrite rules that would rearrange a
tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator.
But this approach had numerous problems. The rearrangement step was
missed by ts_rewrite (and perhaps other places), allowing tsqueries
to be created that would cause Assert failures or perhaps crashes at
execution, as reported by Andreas Seltenreich. The rewrite rules
effectively defined semantics for operators underneath PHRASE that were
buggy, or at least unintuitive. And because rewriting was done in
tsqueryin() rather than at execution, the rearrangement was user-visible,
which is not very desirable --- for example, it might cause unexpected
matches or failures to match in ts_rewrite.
As a somewhat independent problem, the behavior of nested PHRASE operators
was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not
behave intuitively at all.
To fix, get rid of the rewrite logic altogether, and instead teach the
tsquery execution engine to manage AND/OR/NOT below a PHRASE operator
by explicitly computing the match location(s) and match widths for these
operators.
This requires introducing some additional fields into the publicly visible
ExecPhraseData struct; but since there's no way for third-party code to
pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem
as long as we don't move the offsets of the existing fields.
Another related problem was that index searches supposed that "!x <-> y"
could be lossily approximated as "!x & y", which isn't correct because
the latter will reject, say, "x q y" which the query itself accepts.
This required some tweaking in TS_execute_ternary along with the main
tsquery engine.
Back-patch to 9.6 where phrase operators were introduced. While this
could be argued to change behavior more than we'd like in a stable branch,
we have to do something about the crash hazards and index-vs-seqscan
inconsistency, and it doesn't seem desirable to let the unintuitive
behaviors induced by the rewriting implementation stand as precedent.
Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us
Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
9 years ago
|
|
|
|
|
|
|
|
curitem->left = *pos - tmp; /* set LEFT arg's offset */
|
|
|
|
|
|
|
|
|
|
/* process LEFT argument */
|
|
|
|
|
findoprnd_recurse(ptr, pos, nnodes, needcleanup);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
Fix strange behavior (and possible crashes) in full text phrase search.
In an attempt to simplify the tsquery matching engine, the original
phrase search patch invented rewrite rules that would rearrange a
tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator.
But this approach had numerous problems. The rearrangement step was
missed by ts_rewrite (and perhaps other places), allowing tsqueries
to be created that would cause Assert failures or perhaps crashes at
execution, as reported by Andreas Seltenreich. The rewrite rules
effectively defined semantics for operators underneath PHRASE that were
buggy, or at least unintuitive. And because rewriting was done in
tsqueryin() rather than at execution, the rearrangement was user-visible,
which is not very desirable --- for example, it might cause unexpected
matches or failures to match in ts_rewrite.
As a somewhat independent problem, the behavior of nested PHRASE operators
was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not
behave intuitively at all.
To fix, get rid of the rewrite logic altogether, and instead teach the
tsquery execution engine to manage AND/OR/NOT below a PHRASE operator
by explicitly computing the match location(s) and match widths for these
operators.
This requires introducing some additional fields into the publicly visible
ExecPhraseData struct; but since there's no way for third-party code to
pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem
as long as we don't move the offsets of the existing fields.
Another related problem was that index searches supposed that "!x <-> y"
could be lossily approximated as "!x & y", which isn't correct because
the latter will reject, say, "x q y" which the query itself accepts.
This required some tweaking in TS_execute_ternary along with the main
tsquery engine.
Back-patch to 9.6 where phrase operators were introduced. While this
could be argued to change behavior more than we'd like in a stable branch,
we have to do something about the crash hazards and index-vs-seqscan
inconsistency, and it doesn't seem desirable to let the unintuitive
behaviors induced by the rewriting implementation stand as precedent.
Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us
Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
9 years ago
|
|
|
* Fill in the left-fields previously left unfilled.
|
|
|
|
|
* The input QueryItems must be in polish (prefix) notation.
|
|
|
|
|
* Also, set *needcleanup to true if there are any QI_VALSTOP nodes.
|
|
|
|
|
*/
|
|
|
|
|
static void
|
|
|
|
|
findoprnd(QueryItem *ptr, int size, bool *needcleanup)
|
|
|
|
|
{
|
|
|
|
|
uint32 pos;
|
|
|
|
|
|
|
|
|
|
*needcleanup = false;
|
|
|
|
|
pos = 0;
|
|
|
|
|
findoprnd_recurse(ptr, &pos, size, needcleanup);
|
|
|
|
|
|
|
|
|
|
if (pos != size)
|
|
|
|
|
elog(ERROR, "malformed tsquery: extra nodes");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Parse the tsquery stored in "buf".
|
|
|
|
|
*
|
|
|
|
|
* Each value (operand) in the query is passed to pushval. pushval can
|
|
|
|
|
* transform the simple value to an arbitrarily complex expression using
|
|
|
|
|
* pushValue and pushOperator. It must push a single value with pushValue,
|
|
|
|
|
* a complete expression with all operands, or a stopword placeholder
|
|
|
|
|
* with pushStop, otherwise the prefix notation representation will be broken,
|
|
|
|
|
* having an operator with no operand.
|
|
|
|
|
*
|
|
|
|
|
* opaque is passed on to pushval as is, pushval can use it to store its
|
|
|
|
|
* private state.
|
|
|
|
|
*
|
|
|
|
|
* The pushval function can record soft errors via escontext.
|
|
|
|
|
* Callers must check SOFT_ERROR_OCCURRED to detect that.
|
|
|
|
|
*
|
|
|
|
|
* A bitmask of flags (see ts_utils.h) and an error context object
|
|
|
|
|
* can be provided as well. If a soft error occurs, NULL is returned.
|
|
|
|
|
*/
|
|
|
|
|
TSQuery
|
|
|
|
|
parse_tsquery(char *buf,
|
|
|
|
|
PushFunction pushval,
|
|
|
|
|
Datum opaque,
|
|
|
|
|
int flags,
|
|
|
|
|
Node *escontext)
|
|
|
|
|
{
|
|
|
|
|
struct TSQueryParserStateData state;
|
|
|
|
|
int i;
|
|
|
|
|
TSQuery query;
|
|
|
|
|
int commonlen;
|
|
|
|
|
QueryItem *ptr;
|
|
|
|
|
ListCell *cell;
|
|
|
|
|
bool noisy;
|
|
|
|
|
bool needcleanup;
|
|
|
|
|
int tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY;
|
|
|
|
|
|
|
|
|
|
/* plain should not be used with web */
|
|
|
|
|
Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB));
|
|
|
|
|
|
|
|
|
|
/* select suitable tokenizer */
|
|
|
|
|
if (flags & P_TSQ_PLAIN)
|
|
|
|
|
state.gettoken = gettoken_query_plain;
|
|
|
|
|
else if (flags & P_TSQ_WEB)
|
|
|
|
|
{
|
|
|
|
|
state.gettoken = gettoken_query_websearch;
|
|
|
|
|
tsv_flags |= P_TSV_IS_WEB;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
state.gettoken = gettoken_query_standard;
|
|
|
|
|
|
|
|
|
|
/* emit nuisance NOTICEs only if not doing soft errors */
|
|
|
|
|
noisy = !(escontext && IsA(escontext, ErrorSaveContext));
|
|
|
|
|
|
|
|
|
|
/* init state */
|
|
|
|
|
state.buffer = buf;
|
|
|
|
|
state.buf = buf;
|
|
|
|
|
state.count = 0;
|
|
|
|
|
state.state = WAITFIRSTOPERAND;
|
|
|
|
|
state.polstr = NIL;
|
|
|
|
|
state.escontext = escontext;
|
|
|
|
|
|
|
|
|
|
/* init value parser's state */
|
|
|
|
|
state.valstate = init_tsvector_parser(state.buffer, tsv_flags, escontext);
|
|
|
|
|
|
|
|
|
|
/* init list of operand */
|
|
|
|
|
state.sumlen = 0;
|
|
|
|
|
state.lenop = 64;
|
|
|
|
|
state.curop = state.op = (char *) palloc(state.lenop);
|
|
|
|
|
*(state.curop) = '\0';
|
|
|
|
|
|
|
|
|
|
/* parse query & make polish notation (postfix, but in reverse order) */
|
|
|
|
|
makepol(&state, pushval, opaque);
|
|
|
|
|
|
|
|
|
|
close_tsvector_parser(state.valstate);
|
|
|
|
|
|
|
|
|
|
if (SOFT_ERROR_OCCURRED(escontext))
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
if (state.polstr == NIL)
|
|
|
|
|
{
|
|
|
|
|
if (noisy)
|
|
|
|
|
ereport(NOTICE,
|
|
|
|
|
(errmsg("text-search query doesn't contain lexemes: \"%s\"",
|
|
|
|
|
state.buffer)));
|
|
|
|
|
query = (TSQuery) palloc(HDRSIZETQ);
|
|
|
|
|
SET_VARSIZE(query, HDRSIZETQ);
|
|
|
|
|
query->size = 0;
|
|
|
|
|
return query;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (TSQUERY_TOO_BIG(list_length(state.polstr), state.sumlen))
|
|
|
|
|
ereturn(escontext, NULL,
|
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
|
errmsg("tsquery is too large")));
|
|
|
|
|
commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen);
|
|
|
|
|
|
|
|
|
|
/* Pack the QueryItems in the final TSQuery struct to return to caller */
|
|
|
|
|
query = (TSQuery) palloc0(commonlen);
|
|
|
|
|
SET_VARSIZE(query, commonlen);
|
|
|
|
|
query->size = list_length(state.polstr);
|
|
|
|
|
ptr = GETQUERY(query);
|
|
|
|
|
|
|
|
|
|
/* Copy QueryItems to TSQuery */
|
|
|
|
|
i = 0;
|
|
|
|
|
foreach(cell, state.polstr)
|
|
|
|
|
{
|
|
|
|
|
QueryItem *item = (QueryItem *) lfirst(cell);
|
|
|
|
|
|
|
|
|
|
switch (item->type)
|
|
|
|
|
{
|
|
|
|
|
case QI_VAL:
|
|
|
|
|
memcpy(&ptr[i], item, sizeof(QueryOperand));
|
|
|
|
|
break;
|
|
|
|
|
case QI_VALSTOP:
|
|
|
|
|
ptr[i].type = QI_VALSTOP;
|
|
|
|
|
break;
|
|
|
|
|
case QI_OPR:
|
|
|
|
|
memcpy(&ptr[i], item, sizeof(QueryOperator));
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
elog(ERROR, "unrecognized QueryItem type: %d", item->type);
|
|
|
|
|
}
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Copy all the operand strings to TSQuery */
|
|
|
|
|
memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
|
|
|
|
|
pfree(state.op);
|
|
|
|
|
|
Fix strange behavior (and possible crashes) in full text phrase search.
In an attempt to simplify the tsquery matching engine, the original
phrase search patch invented rewrite rules that would rearrange a
tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator.
But this approach had numerous problems. The rearrangement step was
missed by ts_rewrite (and perhaps other places), allowing tsqueries
to be created that would cause Assert failures or perhaps crashes at
execution, as reported by Andreas Seltenreich. The rewrite rules
effectively defined semantics for operators underneath PHRASE that were
buggy, or at least unintuitive. And because rewriting was done in
tsqueryin() rather than at execution, the rearrangement was user-visible,
which is not very desirable --- for example, it might cause unexpected
matches or failures to match in ts_rewrite.
As a somewhat independent problem, the behavior of nested PHRASE operators
was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not
behave intuitively at all.
To fix, get rid of the rewrite logic altogether, and instead teach the
tsquery execution engine to manage AND/OR/NOT below a PHRASE operator
by explicitly computing the match location(s) and match widths for these
operators.
This requires introducing some additional fields into the publicly visible
ExecPhraseData struct; but since there's no way for third-party code to
pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem
as long as we don't move the offsets of the existing fields.
Another related problem was that index searches supposed that "!x <-> y"
could be lossily approximated as "!x & y", which isn't correct because
the latter will reject, say, "x q y" which the query itself accepts.
This required some tweaking in TS_execute_ternary along with the main
tsquery engine.
Back-patch to 9.6 where phrase operators were introduced. While this
could be argued to change behavior more than we'd like in a stable branch,
we have to do something about the crash hazards and index-vs-seqscan
inconsistency, and it doesn't seem desirable to let the unintuitive
behaviors induced by the rewriting implementation stand as precedent.
Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us
Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
9 years ago
|
|
|
/*
|
|
|
|
|
* Set left operand pointers for every operator. While we're at it,
|
|
|
|
|
* detect whether there are any QI_VALSTOP nodes.
|
|
|
|
|
*/
|
|
|
|
|
findoprnd(ptr, query->size, &needcleanup);
|
|
|
|
|
|
|
|
|
|
/*
|
Fix strange behavior (and possible crashes) in full text phrase search.
In an attempt to simplify the tsquery matching engine, the original
phrase search patch invented rewrite rules that would rearrange a
tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator.
But this approach had numerous problems. The rearrangement step was
missed by ts_rewrite (and perhaps other places), allowing tsqueries
to be created that would cause Assert failures or perhaps crashes at
execution, as reported by Andreas Seltenreich. The rewrite rules
effectively defined semantics for operators underneath PHRASE that were
buggy, or at least unintuitive. And because rewriting was done in
tsqueryin() rather than at execution, the rearrangement was user-visible,
which is not very desirable --- for example, it might cause unexpected
matches or failures to match in ts_rewrite.
As a somewhat independent problem, the behavior of nested PHRASE operators
was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not
behave intuitively at all.
To fix, get rid of the rewrite logic altogether, and instead teach the
tsquery execution engine to manage AND/OR/NOT below a PHRASE operator
by explicitly computing the match location(s) and match widths for these
operators.
This requires introducing some additional fields into the publicly visible
ExecPhraseData struct; but since there's no way for third-party code to
pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem
as long as we don't move the offsets of the existing fields.
Another related problem was that index searches supposed that "!x <-> y"
could be lossily approximated as "!x & y", which isn't correct because
the latter will reject, say, "x q y" which the query itself accepts.
This required some tweaking in TS_execute_ternary along with the main
tsquery engine.
Back-patch to 9.6 where phrase operators were introduced. While this
could be argued to change behavior more than we'd like in a stable branch,
we have to do something about the crash hazards and index-vs-seqscan
inconsistency, and it doesn't seem desirable to let the unintuitive
behaviors induced by the rewriting implementation stand as precedent.
Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us
Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
9 years ago
|
|
|
* If there are QI_VALSTOP nodes, delete them and simplify the tree.
|
|
|
|
|
*/
|
|
|
|
|
if (needcleanup)
|
|
|
|
|
query = cleanup_tsquery_stopwords(query, noisy);
|
|
|
|
|
|
|
|
|
|
return query;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
|
|
|
|
|
int16 weight, bool prefix)
|
|
|
|
|
{
|
|
|
|
|
pushValue(state, strval, lenval, weight, prefix);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* in without morphology
|
|
|
|
|
*/
|
|
|
|
|
Datum
|
|
|
|
|
tsqueryin(PG_FUNCTION_ARGS)
|
|
|
|
|
{
|
|
|
|
|
char *in = PG_GETARG_CSTRING(0);
|
|
|
|
|
Node *escontext = fcinfo->context;
|
|
|
|
|
|
|
|
|
|
PG_RETURN_TSQUERY(parse_tsquery(in,
|
|
|
|
|
pushval_asis,
|
|
|
|
|
PointerGetDatum(NULL),
|
|
|
|
|
0,
|
|
|
|
|
escontext));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* out function
|
|
|
|
|
*/
|
|
|
|
|
/*
 * Working state for infix(): a cursor into the QueryItem array being
 * printed plus a growable output buffer.
 */
typedef struct
{
	QueryItem  *curpol;			/* next QueryItem to print; infix() advances
								 * it past every node it consumes */
	char	   *buf;			/* start of the palloc'd output buffer */
	char	   *cur;			/* current write position inside buf */
	char	   *op;				/* base of the operand strings; an operand's
								 * text is at op + its "distance" field */
	int			buflen;			/* allocated size of buf, in bytes */
} INFIX;
|
|
|
|
|
|
|
|
|
|
/*
 * Make sure inf->buf has room for 'addsize' more bytes beyond inf->cur, plus
 * one byte for a terminating NUL.  The buffer is doubled until it is large
 * enough, and inf->cur is re-pointed into the (possibly moved) allocation.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement and
 * can be used safely in an unbraced if/else.  Note that 'addsize' is
 * evaluated once per loop test, so it must not have side effects.
 */
#define RESIZEBUF(inf, addsize) \
do { \
	while (((inf)->cur - (inf)->buf) + (addsize) + 1 >= (inf)->buflen) \
	{ \
		int len = (inf)->cur - (inf)->buf; \
		(inf)->buflen *= 2; \
		(inf)->buf = (char *) repalloc((void *) (inf)->buf, (inf)->buflen); \
		(inf)->cur = (inf)->buf + len; \
	} \
} while (0)
|
|
|
|
|
|
|
|
|
|
/*
 * Recursively traverse the query tree starting at in->curpol and append its
 * infix (human-readable) form to in->buf.
 *
 * in:				printing state; in->curpol is advanced past every node
 *					that gets printed, and in->cur is left pointing at the
 *					terminating NUL of the text produced so far.
 * parentPriority:	priority of the enclosing operator.  A node whose own
 *					priority is lower is wrapped in "( ... )".
 * rightPhraseOp:	true when this node is the right operand of a PHRASE
 *					operator; a PHRASE operator in that position is always
 *					parenthesized.
 *
 * Three cases are handled: a value (QI_VAL), the unary NOT operator, and the
 * binary operators OR, AND and PHRASE.
 */
static void
infix(INFIX *in, int parentPriority, bool rightPhraseOp)
{
	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	if (in->curpol->type == QI_VAL)
	{
		QueryOperand *curpol = &in->curpol->qoperand;
		char	   *op = in->op + curpol->distance;
		int			clen;

		/*
		 * Generous upper bound: the operand text with one extra escape byte
		 * per character, two quotes, and up to six suffix characters
		 * (":*ABCD").
		 */
		RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
		*(in->cur) = '\'';
		in->cur++;
		while (*op)
		{
			/* quotes and backslashes are escaped by doubling them */
			if (t_iseq(op, '\''))
			{
				*(in->cur) = '\'';
				in->cur++;
			}
			else if (t_iseq(op, '\\'))
			{
				*(in->cur) = '\\';
				in->cur++;
			}
			COPYCHAR(in->cur, op);

			/* step over one (possibly multibyte) character on both sides */
			clen = pg_mblen(op);
			op += clen;
			in->cur += clen;
		}
		*(in->cur) = '\'';
		in->cur++;
		if (curpol->weight || curpol->prefix)
		{
			/* suffix: ':' then optional '*' and the weight letters */
			*(in->cur) = ':';
			in->cur++;
			if (curpol->prefix)
			{
				*(in->cur) = '*';
				in->cur++;
			}
			/* weight is a bitmap: bit 3 = A, bit 2 = B, bit 1 = C, bit 0 = D */
			if (curpol->weight & (1 << 3))
			{
				*(in->cur) = 'A';
				in->cur++;
			}
			if (curpol->weight & (1 << 2))
			{
				*(in->cur) = 'B';
				in->cur++;
			}
			if (curpol->weight & (1 << 1))
			{
				*(in->cur) = 'C';
				in->cur++;
			}
			if (curpol->weight & 1)
			{
				*(in->cur) = 'D';
				in->cur++;
			}
		}
		*(in->cur) = '\0';
		in->curpol++;
	}
	else if (in->curpol->qoperator.oper == OP_NOT)
	{
		int			priority = QO_PRIORITY(in->curpol);

		if (priority < parentPriority)
		{
			RESIZEBUF(in, 2);
			sprintf(in->cur, "( ");
			in->cur = strchr(in->cur, '\0');
		}
		RESIZEBUF(in, 1);
		*(in->cur) = '!';
		in->cur++;
		*(in->cur) = '\0';
		in->curpol++;

		/* the single operand follows the operator directly */
		infix(in, priority, false);
		if (priority < parentPriority)
		{
			RESIZEBUF(in, 2);
			sprintf(in->cur, " )");
			in->cur = strchr(in->cur, '\0');
		}
	}
	else
	{
		int8		op = in->curpol->qoperator.oper;
		int			priority = QO_PRIORITY(in->curpol);
		int16		distance = in->curpol->qoperator.distance;
		INFIX		nrm;
		bool		needParenthesis = false;

		in->curpol++;
		if (priority < parentPriority ||
		/* phrase operator depends on order */
			(op == OP_PHRASE && rightPhraseOp))
		{
			needParenthesis = true;
			RESIZEBUF(in, 2);
			sprintf(in->cur, "( ");
			in->cur = strchr(in->cur, '\0');
		}

		/*
		 * The right operand is stored first, but must be printed last, so
		 * it is rendered into a scratch buffer of its own.
		 */
		nrm.curpol = in->curpol;
		nrm.op = in->op;
		nrm.buflen = 16;
		nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);

		/* get right operand */
		infix(&nrm, priority, (op == OP_PHRASE));

		/* get & print left operand */
		in->curpol = nrm.curpol;
		infix(in, priority, false);

		/* print operator & right operand */
		RESIZEBUF(in, 3 + (2 + 10 /* distance */ ) + (nrm.cur - nrm.buf));
		switch (op)
		{
			case OP_OR:
				sprintf(in->cur, " | %s", nrm.buf);
				break;
			case OP_AND:
				sprintf(in->cur, " & %s", nrm.buf);
				break;
			case OP_PHRASE:
				/* a distance of 1 is printed with the "<->" shorthand */
				if (distance != 1)
					sprintf(in->cur, " <%d> %s", distance, nrm.buf);
				else
					sprintf(in->cur, " <-> %s", nrm.buf);
				break;
			default:
				/* OP_NOT is handled in above if-branch */
				elog(ERROR, "unrecognized operator type: %d", op);
		}
		in->cur = strchr(in->cur, '\0');
		pfree(nrm.buf);

		if (needParenthesis)
		{
			RESIZEBUF(in, 2);
			sprintf(in->cur, " )");
			in->cur = strchr(in->cur, '\0');
		}
	}
}
|
|
|
|
|
|
|
|
|
|
Datum
|
|
|
|
|
tsqueryout(PG_FUNCTION_ARGS)
|
|
|
|
|
{
|
|
|
|
|
TSQuery query = PG_GETARG_TSQUERY(0);
|
|
|
|
|
INFIX nrm;
|
|
|
|
|
|
|
|
|
|
if (query->size == 0)
|
|
|
|
|
{
|
|
|
|
|
char *b = palloc(1);
|
|
|
|
|
|
|
|
|
|
*b = '\0';
|
|
|
|
|
PG_RETURN_POINTER(b);
|
|
|
|
|
}
|
|
|
|
|
nrm.curpol = GETQUERY(query);
|
|
|
|
|
nrm.buflen = 32;
|
|
|
|
|
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
|
|
|
|
|
*(nrm.cur) = '\0';
|
|
|
|
|
nrm.op = GETOPERAND(query);
|
|
|
|
|
infix(&nrm, -1 /* lowest priority */ , false);
|
|
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(query, 0);
|
|
|
|
|
PG_RETURN_CSTRING(nrm.buf);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Binary Input / Output functions. The binary format is as follows:
 *
 * uint32	number of operators/operands in the query
 *
 * Followed by the operators and operands, in prefix notation. For each
 * operand:
 *
 * uint8	type, QI_VAL
 * uint8	weight
 * uint8	prefix
 *			operand text in client encoding, null-terminated
 *
 * For each operator:
 *
 * uint8	type, QI_OPR
 * uint8	operator, one of OP_AND, OP_PHRASE, OP_OR, OP_NOT.
 * uint16	distance (only for OP_PHRASE)
 */
|
|
|
|
|
Datum
|
|
|
|
|
tsquerysend(PG_FUNCTION_ARGS)
|
|
|
|
|
{
|
|
|
|
|
TSQuery query = PG_GETARG_TSQUERY(0);
|
|
|
|
|
StringInfoData buf;
|
|
|
|
|
int i;
|
|
|
|
|
QueryItem *item = GETQUERY(query);
|
|
|
|
|
|
|
|
|
|
pq_begintypsend(&buf);
|
|
|
|
|
|
|
|
|
|
pq_sendint32(&buf, query->size);
|
|
|
|
|
for (i = 0; i < query->size; i++)
|
|
|
|
|
{
|
|
|
|
|
pq_sendint8(&buf, item->type);
|
|
|
|
|
|
|
|
|
|
switch (item->type)
|
|
|
|
|
{
|
|
|
|
|
case QI_VAL:
|
|
|
|
|
pq_sendint8(&buf, item->qoperand.weight);
|
|
|
|
|
pq_sendint8(&buf, item->qoperand.prefix);
|
|
|
|
|
pq_sendstring(&buf, GETOPERAND(query) + item->qoperand.distance);
|
|
|
|
|
break;
|
|
|
|
|
case QI_OPR:
|
|
|
|
|
pq_sendint8(&buf, item->qoperator.oper);
|
|
|
|
|
if (item->qoperator.oper == OP_PHRASE)
|
|
|
|
|
pq_sendint16(&buf, item->qoperator.distance);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
|
|
|
|
|
}
|
|
|
|
|
item++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(query, 0);
|
|
|
|
|
|
|
|
|
|
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Datum
|
|
|
|
|
tsqueryrecv(PG_FUNCTION_ARGS)
|
|
|
|
|
{
|
|
|
|
|
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
|
|
|
|
|
TSQuery query;
|
|
|
|
|
int i,
|
|
|
|
|
len;
|
|
|
|
|
QueryItem *item;
|
|
|
|
|
int datalen;
|
|
|
|
|
char *ptr;
|
|
|
|
|
uint32 size;
|
|
|
|
|
const char **operands;
|
|
|
|
|
bool needcleanup;
|
|
|
|
|
|
|
|
|
|
size = pq_getmsgint(buf, sizeof(uint32));
|
|
|
|
|
if (size > (MaxAllocSize / sizeof(QueryItem)))
|
|
|
|
|
elog(ERROR, "invalid size of tsquery");
|
|
|
|
|
|
|
|
|
|
/* Allocate space to temporarily hold operand strings */
|
|
|
|
|
operands = palloc(size * sizeof(char *));
|
|
|
|
|
|
|
|
|
|
/* Allocate space for all the QueryItems. */
|
|
|
|
|
len = HDRSIZETQ + sizeof(QueryItem) * size;
|
|
|
|
|
query = (TSQuery) palloc0(len);
|
|
|
|
|
query->size = size;
|
|
|
|
|
item = GETQUERY(query);
|
|
|
|
|
|
|
|
|
|
datalen = 0;
|
|
|
|
|
for (i = 0; i < size; i++)
|
|
|
|
|
{
|
|
|
|
|
item->type = (int8) pq_getmsgint(buf, sizeof(int8));
|
|
|
|
|
|
|
|
|
|
if (item->type == QI_VAL)
|
|
|
|
|
{
|
|
|
|
|
size_t val_len; /* length after recoding to server
|
|
|
|
|
* encoding */
|
|
|
|
|
uint8 weight;
|
|
|
|
|
uint8 prefix;
|
|
|
|
|
const char *val;
|
|
|
|
|
pg_crc32 valcrc;
|
|
|
|
|
|
|
|
|
|
weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
|
|
|
|
|
prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
|
|
|
|
|
val = pq_getmsgstring(buf);
|
|
|
|
|
val_len = strlen(val);
|
|
|
|
|
|
|
|
|
|
/* Sanity checks */
|
|
|
|
|
|
|
|
|
|
if (weight > 0xF)
|
|
|
|
|
elog(ERROR, "invalid tsquery: invalid weight bitmap");
|
|
|
|
|
|
|
|
|
|
if (val_len > MAXSTRLEN)
|
|
|
|
|
elog(ERROR, "invalid tsquery: operand too long");
|
|
|
|
|
|
|
|
|
|
if (datalen > MAXSTRPOS)
|
|
|
|
|
elog(ERROR, "invalid tsquery: total operand length exceeded");
|
|
|
|
|
|
|
|
|
|
/* Looks valid. */
|
|
|
|
|
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
11 years ago
|
|
|
INIT_LEGACY_CRC32(valcrc);
|
|
|
|
|
COMP_LEGACY_CRC32(valcrc, val, val_len);
|
|
|
|
|
FIN_LEGACY_CRC32(valcrc);
|
|
|
|
|
|
|
|
|
|
item->qoperand.weight = weight;
|
|
|
|
|
item->qoperand.prefix = (prefix) ? true : false;
|
|
|
|
|
item->qoperand.valcrc = (int32) valcrc;
|
|
|
|
|
item->qoperand.length = val_len;
|
|
|
|
|
item->qoperand.distance = datalen;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Operand strings are copied to the final struct after this loop;
|
|
|
|
|
* here we just collect them to an array
|
|
|
|
|
*/
|
|
|
|
|
operands[i] = val;
|
|
|
|
|
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
9 years ago
|
|
|
datalen += val_len + 1; /* + 1 for the '\0' terminator */
|
|
|
|
|
}
|
|
|
|
|
else if (item->type == QI_OPR)
|
|
|
|
|
{
|
|
|
|
|
int8 oper;
|
|
|
|
|
|
|
|
|
|
oper = (int8) pq_getmsgint(buf, sizeof(int8));
|
|
|
|
|
if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
|
|
|
|
|
elog(ERROR, "invalid tsquery: unrecognized operator type %d",
|
|
|
|
|
(int) oper);
|
|
|
|
|
if (i == size - 1)
|
|
|
|
|
elog(ERROR, "invalid pointer to right operand");
|
|
|
|
|
|
|
|
|
|
item->qoperator.oper = oper;
|
|
|
|
|
if (oper == OP_PHRASE)
|
|
|
|
|
item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
|
|
|
|
|
|
|
|
|
|
item++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Enlarge buffer to make room for the operand values. */
|
|
|
|
|
query = (TSQuery) repalloc(query, len + datalen);
|
|
|
|
|
item = GETQUERY(query);
|
|
|
|
|
ptr = GETOPERAND(query);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Fill in the left-pointers. Checks that the tree is well-formed as a
|
|
|
|
|
* side-effect.
|
|
|
|
|
*/
|
|
|
|
|
findoprnd(item, size, &needcleanup);
|
|
|
|
|
|
Fix strange behavior (and possible crashes) in full text phrase search.
In an attempt to simplify the tsquery matching engine, the original
phrase search patch invented rewrite rules that would rearrange a
tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator.
But this approach had numerous problems. The rearrangement step was
missed by ts_rewrite (and perhaps other places), allowing tsqueries
to be created that would cause Assert failures or perhaps crashes at
execution, as reported by Andreas Seltenreich. The rewrite rules
effectively defined semantics for operators underneath PHRASE that were
buggy, or at least unintuitive. And because rewriting was done in
tsqueryin() rather than at execution, the rearrangement was user-visible,
which is not very desirable --- for example, it might cause unexpected
matches or failures to match in ts_rewrite.
As a somewhat independent problem, the behavior of nested PHRASE operators
was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not
behave intuitively at all.
To fix, get rid of the rewrite logic altogether, and instead teach the
tsquery execution engine to manage AND/OR/NOT below a PHRASE operator
by explicitly computing the match location(s) and match widths for these
operators.
This requires introducing some additional fields into the publicly visible
ExecPhraseData struct; but since there's no way for third-party code to
pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem
as long as we don't move the offsets of the existing fields.
Another related problem was that index searches supposed that "!x <-> y"
could be lossily approximated as "!x & y", which isn't correct because
the latter will reject, say, "x q y" which the query itself accepts.
This required some tweaking in TS_execute_ternary along with the main
tsquery engine.
Back-patch to 9.6 where phrase operators were introduced. While this
could be argued to change behavior more than we'd like in a stable branch,
we have to do something about the crash hazards and index-vs-seqscan
inconsistency, and it doesn't seem desirable to let the unintuitive
behaviors induced by the rewriting implementation stand as precedent.
Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us
Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
9 years ago
|
|
|
/* Can't have found any QI_VALSTOP nodes */
|
|
|
|
|
Assert(!needcleanup);
|
|
|
|
|
|
|
|
|
|
/* Copy operands to output struct */
|
|
|
|
|
for (i = 0; i < size; i++)
|
|
|
|
|
{
|
|
|
|
|
if (item->type == QI_VAL)
|
|
|
|
|
{
|
|
|
|
|
memcpy(ptr, operands[i], item->qoperand.length + 1);
|
|
|
|
|
ptr += item->qoperand.length + 1;
|
|
|
|
|
}
|
|
|
|
|
item++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pfree(operands);
|
|
|
|
|
|
|
|
|
|
Assert(ptr - GETOPERAND(query) == datalen);
|
|
|
|
|
|
|
|
|
|
SET_VARSIZE(query, len + datalen);
|
|
|
|
|
|
|
|
|
|
PG_RETURN_TSQUERY(query);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* debug function, used only for view query
|
|
|
|
|
* which will be executed in non-leaf pages in index
|
|
|
|
|
*/
|
|
|
|
|
Datum
|
|
|
|
|
tsquerytree(PG_FUNCTION_ARGS)
|
|
|
|
|
{
|
|
|
|
|
TSQuery query = PG_GETARG_TSQUERY(0);
|
|
|
|
|
INFIX nrm;
|
|
|
|
|
text *res;
|
|
|
|
|
QueryItem *q;
|
|
|
|
|
int len;
|
|
|
|
|
|
|
|
|
|
if (query->size == 0)
|
|
|
|
|
{
|
|
|
|
|
res = (text *) palloc(VARHDRSZ);
|
|
|
|
|
SET_VARSIZE(res, VARHDRSZ);
|
|
|
|
|
PG_RETURN_POINTER(res);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
q = clean_NOT(GETQUERY(query), &len);
|
|
|
|
|
|
|
|
|
|
if (!q)
|
|
|
|
|
{
|
|
|
|
|
res = cstring_to_text("T");
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
nrm.curpol = q;
|
|
|
|
|
nrm.buflen = 32;
|
|
|
|
|
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
|
|
|
|
|
*(nrm.cur) = '\0';
|
|
|
|
|
nrm.op = GETOPERAND(query);
|
|
|
|
|
infix(&nrm, -1, false);
|
|
|
|
|
res = cstring_to_text_with_len(nrm.buf, nrm.cur - nrm.buf);
|
|
|
|
|
pfree(q);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(query, 0);
|
|
|
|
|
|
|
|
|
|
PG_RETURN_TEXT_P(res);
|
|
|
|
|
}
|