mirror of https://github.com/postgres/postgres
- supports multibyte encodings
- more strict rules for lexemes
- flex isn't used
Add:
- tsquery plainto_tsquery(text)
Function makes tsquery from plain text.
- &&, ||, !! operations on tsquery, for combining a
tsquery from its parts: 'foo & bar' || 'asd' => 'foo & bar | asd'
REL8_2_STABLE
parent
b91e6ed93e
commit
c52795d18a
@ -0,0 +1,61 @@ |
||||
#include "ts_locale.h" |
||||
|
||||
#include "utils/builtins.h" |
||||
#include "utils/pg_locale.h" |
||||
#include "mb/pg_wchar.h" |
||||
|
||||
|
||||
#if defined(TS_USE_WIDE) && defined(WIN32) |
||||
|
||||
size_t |
||||
wchar2char( const char *to, const wchar_t *from, size_t len ) { |
||||
if (GetDatabaseEncoding() == PG_UTF8) { |
||||
int r; |
||||
|
||||
if (len==0) |
||||
return 0; |
||||
|
||||
r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes, |
||||
NULL, NULL); |
||||
|
||||
|
||||
if ( r==0 ) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
||||
errmsg("UTF-16 to UTF-8 translation failed: %lu", |
||||
GetLastError()))); |
||||
|
||||
return r; |
||||
} |
||||
|
||||
return wcstombs(to, from, len); |
||||
} |
||||
|
||||
size_t
|
||||
char2wchar( const wchar_t *to, const char *from, size_t len ) { |
||||
if (GetDatabaseEncoding() == PG_UTF8) { |
||||
int r; |
||||
|
||||
if (len==0) |
||||
return 0; |
||||
|
||||
r = MultiByteToWideChar(CP_UTF8, 0, from, len, |
||||
to, len); |
||||
|
||||
if (!r) { |
||||
pg_verifymbstr(from, len, false); |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
||||
errmsg("invalid multibyte character for locale"), |
||||
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); |
||||
} |
||||
|
||||
Assert(r <= nbytes); |
||||
|
||||
return r; |
||||
} |
||||
|
||||
return mbstowcs(to, from, len); |
||||
} |
||||
|
||||
#endif |
||||
@ -0,0 +1,38 @@ |
||||
#ifndef __TSLOCALE_H__ |
||||
#define __TSLOCALE_H__ |
||||
|
||||
#include "postgres.h" |
||||
|
||||
#include <ctype.h> |
||||
#include <limits.h> |
||||
|
||||
/*
|
||||
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems |
||||
* declare them in <wchar.h>. |
||||
*/ |
||||
#ifdef HAVE_WCHAR_H |
||||
#include <wchar.h> |
||||
#endif |
||||
#ifdef HAVE_WCTYPE_H |
||||
#include <wctype.h> |
||||
#endif |
||||
|
||||
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) |
||||
#define TS_USE_WIDE |
||||
|
||||
#ifdef WIN32 |
||||
|
||||
size_t wchar2char( const char *to, const wchar_t *from, size_t len ); |
||||
size_t char2wchar( const wchar_t *to, const char *from, size_t len ); |
||||
|
||||
#else /* WIN32 */ |
||||
|
||||
/* correct mbstowcs */ |
||||
#define char2wchar mbstowcs |
||||
#define wchar2char wcstombs |
||||
|
||||
#endif /* WIN32 */ |
||||
|
||||
#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */ |
||||
|
||||
#endif /* __TSLOCALE_H__ */ |
||||
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,147 @@ |
||||
#ifndef __PARSER_H__ |
||||
#define __PARSER_H__ |
||||
|
||||
extern char *token; |
||||
extern int tokenlen; |
||||
int tsearch2_yylex(void); |
||||
void tsearch2_start_parse_str(char *, int); |
||||
void tsearch2_end_parse(void); |
||||
#include <ctype.h> |
||||
#include <limits.h> |
||||
#include "ts_locale.h" |
||||
|
||||
typedef enum { |
||||
TPS_Base = 0, |
||||
TPS_InUWord, |
||||
TPS_InLatWord, |
||||
TPS_InCyrWord, |
||||
TPS_InUnsignedInt, |
||||
TPS_InSignedIntFirst, |
||||
TPS_InSignedInt, |
||||
TPS_InSpace, |
||||
TPS_InUDecimalFirst, |
||||
TPS_InUDecimal, |
||||
TPS_InDecimalFirst, |
||||
TPS_InDecimal, |
||||
TPS_InVersionFirst, |
||||
TPS_InVersion, |
||||
TPS_InMantissaFirst, |
||||
TPS_InMantissaSign, |
||||
TPS_InMantissa, |
||||
TPS_InHTMLEntityFirst, |
||||
TPS_InHTMLEntity, |
||||
TPS_InHTMLEntityNumFirst, |
||||
TPS_InHTMLEntityNum, |
||||
TPS_InHTMLEntityEnd, |
||||
TPS_InTagFirst, |
||||
TPS_InTagCloseFirst, |
||||
TPS_InTag, |
||||
TPS_InTagEscapeK, |
||||
TPS_InTagEscapeKK, |
||||
TPS_InTagBackSleshed, |
||||
TPS_InTagEnd, |
||||
TPS_InCommentFirst, |
||||
TPS_InCommentLast, |
||||
TPS_InComment, |
||||
TPS_InCloseCommentFirst, |
||||
TPS_InCloseCommentLast, |
||||
TPS_InCommentEnd, |
||||
TPS_InHostFirstDomen, |
||||
TPS_InHostDomenSecond, |
||||
TPS_InHostDomen, |
||||
TPS_InPortFirst, |
||||
TPS_InPort, |
||||
TPS_InHostFirstAN, |
||||
TPS_InHost, |
||||
TPS_InEmail, |
||||
TPS_InFileFirst, |
||||
TPS_InFile, |
||||
TPS_InFileNext, |
||||
TPS_InURIFirst, |
||||
TPS_InURIStart, |
||||
TPS_InURI, |
||||
TPS_InFURL, |
||||
TPS_InProtocolFirst, |
||||
TPS_InProtocolSecond, |
||||
TPS_InProtocolEnd, |
||||
TPS_InHyphenLatWordFirst, |
||||
TPS_InHyphenLatWord, |
||||
TPS_InHyphenCyrWordFirst, |
||||
TPS_InHyphenCyrWord, |
||||
TPS_InHyphenUWordFirst, |
||||
TPS_InHyphenUWord, |
||||
TPS_InHyphenValueFirst, |
||||
TPS_InHyphenValue, |
||||
TPS_InHyphenValueExact, |
||||
TPS_InParseHyphen, |
||||
TPS_InParseHyphenHyphen, |
||||
TPS_InHyphenCyrWordPart, |
||||
TPS_InHyphenLatWordPart, |
||||
TPS_InHyphenUWordPart, |
||||
TPS_InHyphenUnsignedInt, |
||||
TPS_InHDecimalPartFirst, |
||||
TPS_InHDecimalPart, |
||||
TPS_InHVersionPartFirst, |
||||
TPS_InHVersionPart, |
||||
TPS_Null /* last state (fake value) */ |
||||
} TParserState; |
||||
|
||||
/* forward declaration */ |
||||
struct TParser; |
||||
|
||||
|
||||
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */ |
||||
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */ |
||||
|
||||
typedef struct { |
||||
TParserCharTest isclass; |
||||
char c; |
||||
uint16 flags; |
||||
TParserState tostate; |
||||
int type; |
||||
TParserSpecial special; |
||||
} TParserStateActionItem; |
||||
|
||||
typedef struct { |
||||
TParserState state; |
||||
TParserStateActionItem *action; |
||||
} TParserStateAction; |
||||
|
||||
typedef struct TParserPosition { |
||||
int posbyte; /* position of parser in bytes */ |
||||
int poschar; /* position of parser in characters */ |
||||
int charlen; /* length of current char */ |
||||
int lenbytelexeme; |
||||
int lencharlexeme; |
||||
TParserState state; |
||||
struct TParserPosition *prev; |
||||
int flags; |
||||
TParserStateActionItem *pushedAtAction; |
||||
} TParserPosition; |
||||
|
||||
typedef struct TParser { |
||||
/* string and position information */ |
||||
char *str; /* multibyte string */ |
||||
int lenstr; /* length of mbstring */ |
||||
wchar_t *wstr; /* wide character string */
|
||||
int lenwstr; /* length of wstr */ |
||||
|
||||
/* State of parse */ |
||||
int charmaxlen; |
||||
bool usewide; |
||||
TParserPosition *state; |
||||
bool ignore; |
||||
bool wanthost; |
||||
|
||||
/* silly char */ |
||||
char c; |
||||
|
||||
/* out */ |
||||
char *lexeme; |
||||
int lenbytelexeme; |
||||
int lencharlexeme; |
||||
int type; |
||||
|
||||
} TParser; |
||||
|
||||
|
||||
TParser* TParserInit( char *, int ); |
||||
bool TParserGet( TParser* ); |
||||
void TParserClose( TParser* ); |
||||
|
||||
#endif |
||||
|
||||
@ -1,346 +0,0 @@ |
||||
%{ |
||||
#include "postgres.h" |
||||
|
||||
#include "deflex.h" |
||||
#include "parser.h" |
||||
#include "common.h" |
||||
|
||||
/* Avoid exit() on fatal scanner errors */ |
||||
#undef fprintf |
||||
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg) |
||||
|
||||
char *token = NULL; /* pointer to token */ |
||||
int tokenlen; |
||||
static char *s = NULL; /* to return WHOLE hyphenated-word */ |
||||
|
||||
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */ |
||||
|
||||
typedef struct { |
||||
int tlen; |
||||
int clen; |
||||
char *str; |
||||
} TagStorage; |
||||
|
||||
static TagStorage ts={0,0,NULL}; |
||||
|
||||
static void |
||||
addTag(void) |
||||
{ |
||||
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) { |
||||
ts.tlen*=2; |
||||
ts.str=realloc(ts.str,ts.tlen); |
||||
if (!ts.str) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_OUT_OF_MEMORY), |
||||
errmsg("out of memory"))); |
||||
} |
||||
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng); |
||||
ts.clen+=tsearch2_yyleng; |
||||
ts.str[ts.clen]='\0'; |
||||
} |
||||
|
||||
static void |
||||
startTag(void) |
||||
{ |
||||
if ( ts.str==NULL ) { |
||||
ts.tlen=tsearch2_yyleng+1; |
||||
ts.str=malloc(ts.tlen); |
||||
if (!ts.str) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_OUT_OF_MEMORY), |
||||
errmsg("out of memory"))); |
||||
} |
||||
ts.clen=0; |
||||
ts.str[0]='\0'; |
||||
addTag(); |
||||
} |
||||
|
||||
%} |
||||
|
||||
%option 8bit |
||||
%option never-interactive |
||||
%option nodefault |
||||
%option nounput |
||||
%option noyywrap |
||||
|
||||
/* parser's state for parsing hyphenated-word */ |
||||
%x DELIM |
||||
/* parser's state for parsing URL*/ |
||||
%x URL |
||||
%x SERVER |
||||
|
||||
/* parser's state for parsing TAGS */ |
||||
%x INTAG |
||||
%x QINTAG |
||||
%x INCOMMENT |
||||
%x INSCRIPT |
||||
|
||||
/* cyrillic koi8 char */ |
||||
CYRALNUM [0-9\200-\377] |
||||
CYRALPHA [\200-\377] |
||||
ALPHA [a-zA-Z\200-\377] |
||||
ALNUM [0-9a-zA-Z\200-\377] |
||||
|
||||
|
||||
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+ |
||||
URI [-_[:alnum:]/%,\.;=&?#]+ |
||||
|
||||
%% |
||||
|
||||
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); } |
||||
|
||||
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" { |
||||
BEGIN INITIAL; |
||||
addTag(); |
||||
token = ts.str; |
||||
tokenlen = ts.clen; |
||||
return TAG; |
||||
} |
||||
|
||||
"<!--" { BEGIN INCOMMENT; startTag(); } |
||||
|
||||
<INCOMMENT>"-->" { |
||||
BEGIN INITIAL; |
||||
addTag(); |
||||
token = ts.str; |
||||
tokenlen = ts.clen; |
||||
return TAG; |
||||
} |
||||
|
||||
|
||||
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); } |
||||
|
||||
"</"[[:alpha:]] { BEGIN INTAG; startTag(); } |
||||
|
||||
<INTAG>"\"" { BEGIN QINTAG; addTag(); } |
||||
|
||||
<QINTAG>"\\\"" { addTag(); } |
||||
|
||||
<QINTAG>"\"" { BEGIN INTAG; addTag(); } |
||||
|
||||
<INTAG>">" { |
||||
BEGIN INITIAL; |
||||
addTag(); |
||||
token = ts.str; |
||||
tokenlen = ts.clen; |
||||
return TAG; |
||||
} |
||||
|
||||
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); } |
||||
|
||||
\&(quot|amp|nbsp|lt|gt)\; { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return HTMLENTITY; |
||||
} |
||||
|
||||
\&\#[0-9][0-9]?[0-9]?\; { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return HTMLENTITY; |
||||
} |
||||
|
||||
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return EMAIL; |
||||
} |
||||
|
||||
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return SCIENTIFIC; |
||||
} |
||||
|
||||
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return VERSIONNUMBER; |
||||
} |
||||
|
||||
[+-]?[0-9]+\.[0-9]+ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return DECIMAL; |
||||
} |
||||
|
||||
[+-][0-9]+ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return SIGNEDINT; |
||||
} |
||||
|
||||
<DELIM,INITIAL>[0-9]+ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return UNSIGNEDINT; |
||||
} |
||||
|
||||
http"://" { |
||||
BEGIN URL; |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return HTTP; |
||||
} |
||||
|
||||
ftp"://" { |
||||
BEGIN URL; |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return HTTP; |
||||
} |
||||
|
||||
<URL,INITIAL>{HOSTNAME}[/:]{URI} { |
||||
BEGIN SERVER; |
||||
if (s) { free(s); s=NULL; } |
||||
s = strdup( tsearch2_yytext ); |
||||
tokenlen = tsearch2_yyleng; |
||||
yyless( 0 ); |
||||
token = s; |
||||
return FURL; |
||||
} |
||||
|
||||
<SERVER,URL,INITIAL>{HOSTNAME} { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return HOST; |
||||
} |
||||
|
||||
<SERVER>[/:]{URI} { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return URI; |
||||
} |
||||
|
||||
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return FILEPATH; |
||||
} |
||||
|
||||
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ { |
||||
BEGIN DELIM; |
||||
if (s) { free(s); s=NULL; } |
||||
s = strdup( tsearch2_yytext ); |
||||
tokenlen = tsearch2_yyleng; |
||||
yyless( 0 ); |
||||
token = s; |
||||
return CYRHYPHENWORD; |
||||
} |
||||
|
||||
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ { |
||||
BEGIN DELIM; |
||||
if (s) { free(s); s=NULL; } |
||||
s = strdup( tsearch2_yytext ); |
||||
tokenlen = tsearch2_yyleng; |
||||
yyless( 0 ); |
||||
token = s; |
||||
return LATHYPHENWORD; |
||||
} |
||||
|
||||
({ALNUM}+-)+{ALNUM}+ /* composite-word */ { |
||||
BEGIN DELIM; |
||||
if (s) { free(s); s=NULL; } |
||||
s = strdup( tsearch2_yytext ); |
||||
tokenlen = tsearch2_yyleng; |
||||
yyless( 0 ); |
||||
token = s; |
||||
return HYPHENWORD; |
||||
} |
||||
|
||||
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return VERSIONNUMBER; |
||||
} |
||||
|
||||
<DELIM>\+?[0-9]+\.[0-9]+ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return DECIMAL; |
||||
} |
||||
|
||||
<DELIM>{CYRALPHA}+ /* one word in composite-word */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return CYRPARTHYPHENWORD; |
||||
} |
||||
|
||||
<DELIM>[[:alpha:]]+ /* one word in composite-word */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return LATPARTHYPHENWORD; |
||||
} |
||||
|
||||
<DELIM>{ALNUM}+ /* one word in composite-word */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return PARTHYPHENWORD; |
||||
} |
||||
|
||||
<DELIM>- { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return SPACE; |
||||
} |
||||
|
||||
<DELIM,SERVER,URL>.|\n /* return in basic state */ { |
||||
BEGIN INITIAL; |
||||
yyless( 0 ); |
||||
} |
||||
|
||||
{CYRALPHA}+ /* normal word */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return CYRWORD; |
||||
} |
||||
|
||||
[[:alpha:]]+ /* normal word */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return LATWORD; |
||||
} |
||||
|
||||
{ALNUM}+ /* normal word */ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return UWORD; |
||||
} |
||||
|
||||
[ \r\n\t]+ { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return SPACE; |
||||
} |
||||
|
||||
. { |
||||
token = tsearch2_yytext; |
||||
tokenlen = tsearch2_yyleng; |
||||
return SPACE; |
||||
} |
||||
|
||||
%% |
||||
|
||||
/* clearing after parsing from string */ |
||||
void |
||||
tsearch2_end_parse(void) |
||||
{ |
||||
if (s) |
||||
{ |
||||
free(s); |
||||
s = NULL; |
||||
} |
||||
tsearch2_yy_delete_buffer( buf ); |
||||
buf = NULL; |
||||
} |
||||
|
||||
/* start parse from string */ |
||||
void |
||||
tsearch2_start_parse_str(char* str, int limit) |
||||
{ |
||||
if (buf) |
||||
tsearch2_end_parse(); |
||||
buf = tsearch2_yy_scan_bytes( str, limit ); |
||||
tsearch2_yy_switch_to_buffer( buf ); |
||||
BEGIN INITIAL; |
||||
} |
||||
Loading…
Reference in new issue