Fix ts_delete(tsvector, text[]) to cope with duplicate array entries.

Such cases either failed an Assert, or produced a corrupt tsvector in non-Assert builds, as reported by Andreas Seltenreich. The reason is that tsvector_delete_by_indices() just assumed that its input array had no duplicates. Fix by explicitly de-duping. In passing, improve some comments, and fix a number of tests for null values to use ERRCODE_NULL_VALUE_NOT_ALLOWED not ERRCODE_INVALID_PARAMETER_VALUE. Discussion: <87invhoj6e.fsf@credativ.de>
10 years ago · c50d192ce3
parent 33fe7360af
commit c50d192ce3
3 changed files with 53 additions and 31 deletions
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@ -317,7 +317,7 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)

 		if (nulls[i])
 			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 					 errmsg("lexeme array may not contain nulls")));

 		lex = VARDATA(dlexemes[i]);
@ -430,7 +430,7 @@ compareint(const void *va, const void *vb)
 /*
 * Internal routine to delete lexemes from TSVector by array of offsets.
 *
- * int *indices_to_delete -- array of lexeme offsets to delete
+ * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
 * int indices_count -- size of that array
 *
 * Returns new TSVector without given lexemes along with their positions
@ -445,35 +445,51 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 			   *arrout;
 	char	   *data = STRPTR(tsv),
 			   *dataout;
-	int			i,
-				j,
-				k,
-				curoff;
+	int			i,				/* index in arrin */
+				j,				/* index in arrout */
+				k,				/* index in indices_to_delete */
+				curoff;			/* index in dataout area */

 	/*
-	 * Here we overestimates tsout size, since we don't know exact size
-	 * occupied by positions and weights. We will set exact size later after a
-	 * pass through TSVector.
+	 * Sort the filter array to simplify membership checks below.  Also, get
+	 * rid of any duplicate entries, so that we can assume that indices_count
+	 * is exactly equal to the number of lexemes that will be removed.
 	 */
-	tsout = (TSVector) palloc0(VARSIZE(tsv));
-	arrout = ARRPTR(tsout);
-	tsout->size = tsv->size - indices_count;
-
-	/* Sort our filter array to simplify membership check later. */
 	if (indices_count > 1)
+	{
+		int			kp;
+
 		qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+		kp = 0;
+		for (k = 1; k < indices_count; k++)
+		{
+			if (indices_to_delete[k] != indices_to_delete[kp])
+				indices_to_delete[++kp] = indices_to_delete[k];
+		}
+		indices_count = ++kp;
+	}

 	/*
-	 * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+	 * Here we overestimate tsout size, since we don't know how much space is
+	 * used by the deleted lexeme(s).  We will set exact size below.
 	 */
-	curoff = 0;
+	tsout = (TSVector) palloc0(VARSIZE(tsv));
+
+	/* This count must be correct because STRPTR(tsout) relies on it. */
+	tsout->size = tsv->size - indices_count;
+
+	/*
+	 * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
+	 */
+	arrout = ARRPTR(tsout);
 	dataout = STRPTR(tsout);
+	curoff = 0;
 	for (i = j = k = 0; i < tsv->size; i++)
 	{
 		/*
-		 * Here we should check whether current i is present in
-		 * indices_to_delete or not. Since indices_to_delete is already sorted
-		 * we can advance it index only when we have match.
+		 * If current i is present in indices_to_delete, skip this lexeme.
+		 * Since indices_to_delete is already sorted, we only need to check
+		 * the current (k'th) entry.
 		 */
 		if (k < indices_count && i == indices_to_delete[k])
 		{
@ -481,7 +497,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 			continue;
 		}

-		/* Copy lexeme, it's positions and weights */
+		/* Copy lexeme and its positions and weights */
 		memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
 		arrout[j].haspos = arrin[i].haspos;
 		arrout[j].len = arrin[i].len;
@ -489,8 +505,8 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 		curoff += arrin[i].len;
 		if (arrin[i].haspos)
 		{
-			int			len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) +
-			sizeof(uint16);
+			int			len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
+			+ sizeof(uint16);

 			curoff = SHORTALIGN(curoff);
 			memcpy(dataout + curoff,
@ -503,10 +519,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	}

 	/*
-	 * After the pass through TSVector k should equals exactly to
-	 * indices_count. If it isn't then the caller provided us with indices
-	 * outside of [0, tsv->size) range and estimation of tsout's size is
-	 * wrong.
+	 * k should now be exactly equal to indices_count. If it isn't then the
+	 * caller provided us with indices outside of [0, tsv->size) range and
+	 * estimation of tsout's size is wrong.
 	 */
 	Assert(k == indices_count);

@ -560,7 +575,7 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)

 	/*
 	 * In typical use case array of lexemes to delete is relatively small. So
-	 * here we optimizing things for that scenario: iterate through lexarr
+	 * here we optimize things for that scenario: iterate through lexarr
 	 * performing binary search of each lexeme from lexarr in tsvector.
 	 */
 	skip_indices = palloc0(nlex * sizeof(int));
@ -572,10 +587,10 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)

 		if (nulls[i])
 			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 					 errmsg("lexeme array may not contain nulls")));

-		lex = VARDATA(dlexemes[i]);
+		lex = VARDATA_ANY(dlexemes[i]);
 		lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
 		lex_pos = tsvector_bsearch(tsin, lex, lex_len);

@ -738,7 +753,7 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	{
 		if (nulls[i])
 			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 					 errmsg("lexeme array may not contain nulls")));

 		datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
@ -797,7 +812,7 @@ tsvector_filter(PG_FUNCTION_ARGS)

 		if (nulls[i])
 			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 					 errmsg("weight array may not contain nulls")));

 		char_weight = DatumGetChar(dweights[i]);
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@ -1087,6 +1087,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi
 'base' 'hidden' 'strike'
 (1 row)

+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
+        ts_delete         
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
 SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
 ERROR:  lexeme array may not contain nulls
 SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@ -212,6 +212,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3':
 SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
 SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
 SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
 SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);

 SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);