Tweak GROUP BY so that it will still accept result-column names, but only

after trying to resolve the item as an input-column name. This allows us to be compliant with the SQL92 spec for queries that fall within the spec, while still accepting the same out-of-spec queries as 6.5 did. You'll only lose if there is an output column name that is the same as an input column name, but doesn't refer to the same value. 7.0 will interpret such a GROUP BY spec differently than 6.5 did. No way around that, because 6.5 was clearly not spec compliant.
26 years ago · 1763a7c1ea
parent 8e0790519a
commit 1763a7c1ea
4 changed files with 139 additions and 76 deletions
--- a/doc/src/sgml/ref/select.sgml
+++ b/doc/src/sgml/ref/select.sgml
@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/ref/select.sgml,v 1.25 2000/02/21 01:13:52 tgl Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/ref/select.sgml,v 1.26 2000/03/15 23:31:19 tgl Exp $
 Postgres documentation
 -->
@ -37,7 +37,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
  <refsect2 id="R2-SQL-SELECT-1">
   <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
   </refsect2info>
   <title>
    Inputs
@ -59,10 +59,12 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
      <listitem>
       <para>
 	Specifies another name for a column or an expression using
-	the AS clause.  This name is primarily used to label the output
+	the AS clause.  This name is primarily used to label the column
-	column.  The <replaceable class="PARAMETER">name</replaceable>
+	for display.  It can also be used to refer to the column's value in
-	cannot be used in the WHERE, GROUP BY, or HAVING clauses.
+	ORDER BY and GROUP BY clauses.  But the
-	It can, however, be referenced in ORDER BY clauses.
+	<replaceable class="PARAMETER">name</replaceable>
 	cannot be used in the WHERE or HAVING clauses; write out the
 	expression instead.
       </para>
      </listitem>
     </varlistentry>
@ -72,7 +74,8 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
     <term>TEMP</term>
     <listitem>
      <para>
-	The table is created unique to this session, and is
+	If TEMPORARY or TEMP is specified,
 	the table is created unique to this session, and is
 	automatically dropped on session exit.
      </para>
     </listitem>
@ -83,10 +86,10 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
      <listitem>
       <para>
 	If the INTO TABLE clause is specified, the result of the
-	query will be stored in another table with the indicated
+	query will be stored in a new table with the indicated
 	name.
 	The target table (<replaceable class="PARAMETER">new_table</replaceable>) will
-	be created automatically and should not exist before this command.
+	be created automatically and must not exist before this command.
        Refer to <command>SELECT INTO</command> for more information.
 	<note>
@ -143,7 +146,8 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
      <term><replaceable class="PARAMETER">select</replaceable></term>
      <listitem>
       <para>
-	A select statement with all features except the ORDER BY clause.
+	A select statement with all features except the ORDER BY and
 	LIMIT clauses.
       </para>
      </listitem>
     </varlistentry>
@ -188,7 +192,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
 <refsect1 id="R1-SQL-SELECT-1">
  <refsect1info>
-   <date>1998-09-24</date>
+   <date>2000-03-15</date>
  </refsect1info>
  <title>
   Description
@ -210,7 +214,9 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
  <para>
   <command>DISTINCT ON</command> eliminates rows that match on all the
   specified expressions, keeping only the first row of each set of
-   duplicates.  Note that "the first row" of each set is unpredictable
+   duplicates.  The DISTINCT ON expressions are interpreted using the
   same rules as for ORDER BY items; see below.
   Note that "the first row" of each set is unpredictable
   unless <command>ORDER BY</command> is used to ensure that the desired
   row appears first.  For example,
   <programlisting>
@ -226,21 +232,20 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
  <para>
   The GROUP BY clause allows a user to divide a table
-   conceptually into groups.
+   into groups of rows that match on one or more values.
   (See <xref linkend="sql-groupby" endterm="sql-groupby-title">.)
  </para>
  <para>
-   The HAVING clause specifies a grouped table derived by the
+   The HAVING clause allows selection of only those groups of rows
-   elimination of groups from the result of the previously
+   meeting the specified condition.
   specified clause.
   (See <xref linkend="sql-having" endterm="sql-having-title">.)
  </para>
  <para>
-   The ORDER BY clause allows a user to specify that he/she
+   The ORDER BY clause causes the returned rows to be sorted in a specified
-   wishes the rows sorted according to the ASCending or 
+   order.  If ORDER BY is not given, the rows are returned in whatever order
-   DESCending mode operator.
+   the system finds cheapest to produce.
   (See <xref linkend="sql-orderby-title" endterm="sql-orderby-title">.)
  </para>
@ -279,7 +284,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
  <refsect2 id="SQL-WHERE">
   <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
   </refsect2info>
   <title id="sql-where-title">
    WHERE Clause
@ -312,15 +317,14 @@ WHERE <replaceable class="PARAMETER">boolean_expr</replaceable>
    locally-defined operator, 
    and <replaceable class="PARAMETER">log_op</replaceable> can be one 
    of: AND, OR, NOT.
-    The comparison returns either TRUE or FALSE and all
+    SELECT will ignore all rows for which the WHERE condition does not return
-    instances will be discarded
+    TRUE.
    if the expression evaluates to FALSE.
   </para>
  </refsect2>
  <refsect2 id="SQL-GROUPBY">
   <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
   </refsect2info>
   <title id="sql-groupby-title">
    GROUP BY Clause
@ -334,20 +338,28 @@ GROUP BY <replaceable class="PARAMETER">column</replaceable> [, ...]
   </para>
   <para>
-    GROUP BY will condense into a single row all rows that share the
+    GROUP BY will condense into a single row all selected rows that share the
    same values for the grouped columns.  Aggregate functions, if any,
    are computed across all rows making up each group, producing a
    separate value for each group (whereas without GROUP BY, an
    aggregate produces a single value computed across all the selected
-    rows).  When GROUP BY is present, it is not valid to refer to
+    rows).  When GROUP BY is present, it is not valid for the SELECT
    output expression(s) to refer to
    ungrouped columns except within aggregate functions, since there
    would be more than one possible value to return for an ungrouped column.
   </para>
   <para>
    An item in GROUP BY can also be the name or ordinal number of an output
    column (SELECT expression), or it can be an arbitrary expression formed
    from input-column values.  In case of ambiguity, a GROUP BY name will
    be interpreted as an input-column name rather than an output column name.
   </para>
  </refsect2>
  <refsect2 id="SQL-HAVING">
   <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
   </refsect2info>
   <title id="sql-having-title">
    HAVING Clause
@ -365,8 +377,12 @@ HAVING <replaceable class="PARAMETER">cond_expr</replaceable>
   <para>
    HAVING specifies a grouped table derived by the elimination
-    of groups from the result of the previously specified clause
+    of group rows that do not satisfy the
-    that do not meet the <replaceable class="PARAMETER">cond_expr</replaceable>.</para>
+    <replaceable class="PARAMETER">cond_expr</replaceable>.
    HAVING is different from WHERE:
    WHERE filters individual rows before application of GROUP BY,
    while HAVING filters group rows created by GROUP BY.
   </para>
   <para>
    Each column referenced in 
@ -378,7 +394,7 @@ HAVING <replaceable class="PARAMETER">cond_expr</replaceable>
  <refsect2 id="SQL-ORDERBY">
   <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
   </refsect2info>
   <title id="sql-orderby-title">
    ORDER BY Clause
@ -389,15 +405,15 @@ ORDER BY <replaceable class="PARAMETER">column</replaceable> [ ASC | DESC ] [, .
    </synopsis></para>
   <para>
-    <replaceable class="PARAMETER">column</replaceable> can be either a column
+    <replaceable class="PARAMETER">column</replaceable> can be either a
-    name or an ordinal number.
+    result column name or an ordinal number.
   </para>
   <para>
    The ordinal numbers refers to the ordinal (left-to-right) position
-    of the column. This feature makes it possible to define an ordering
+    of the result column. This feature makes it possible to define an ordering
    on the basis of a column that does not have a proper name.
    This is never absolutely necessary because it is always possible
-    to assign a name to a calculated column using the AS clause, e.g.:
+    to assign a name to a result column using the AS clause, e.g.:
    <programlisting>
 SELECT title, date_prod + 1 AS newlen FROM films ORDER BY newlen;
    </programlisting></para>
@ -410,6 +426,11 @@ SELECT title, date_prod + 1 AS newlen FROM films ORDER BY newlen;
    <programlisting>
 SELECT name FROM distributors ORDER BY code;
    </programlisting>
    Note that if an ORDER BY item is a simple name that matches both
    a result column name and an input column name, ORDER BY will interpret
    it as the result column name.  This is the opposite of the choice that
    GROUP BY will make in the same situation.  This inconsistency is
    mandated by the SQL92 standard.
   </para>
   <para>
@ -436,7 +457,7 @@ SELECT name FROM distributors ORDER BY code;
    where
    <replaceable class="PARAMETER">table_query</replaceable>
-    specifies any select expression without an ORDER BY clause.
+    specifies any select expression without an ORDER BY or LIMIT clause.
   </para>
   <para>
@ -476,7 +497,7 @@ SELECT name FROM distributors ORDER BY code;
    where
    <replaceable class="PARAMETER">table_query</replaceable>
-    specifies any select expression without an ORDER BY clause.
+    specifies any select expression without an ORDER BY or LIMIT clause.
   </para>
   <para>
@ -507,7 +528,7 @@ SELECT name FROM distributors ORDER BY code;
    where
    <replaceable class="PARAMETER">table_query</replaceable>
-    specifies any select expression without an ORDER BY clause.
+    specifies any select expression without an ORDER BY or LIMIT clause.
   </para>
   <para>
@ -560,7 +581,7 @@ SELECT name FROM distributors ORDER BY code;
   </para>
   <para>
-    As of Postgres 7.0, the
+    As of PostgreSQL 7.0, the
    query optimizer takes LIMIT into account when generating a query plan,
    so you are very likely to get different plans (yielding different row
    orders) depending on what you give for LIMIT and OFFSET.  Thus, using
@ -765,6 +786,18 @@ SELECT distributors.* WHERE name = 'Westwood';
     The DISTINCT ON phrase is not part of <acronym>SQL92</acronym>.
     Nor are LIMIT and OFFSET.
    </para>
    <para>
     In <acronym>SQL92</acronym>, an ORDER BY clause may only use result
     column names or numbers, while a GROUP BY clause may only use input
     column names.
     <productname>Postgres</productname> extends each of these clauses to
     allow the other choice as well (but it uses the standard's interpretation
     if there is ambiguity).
     <productname>Postgres</productname> also allows both clauses to specify
     arbitrary expressions.  Note that names appearing in an expression will
     always be taken as input-column names, not as result-column names.
    </para>
   </refsect3>
   <refsect3 id="R3-SQL-UNION-1">
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/parser/parse_clause.c,v 1.56 2000/03/14 23:06:32 thomas Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/parser/parse_clause.c,v 1.57 2000/03/15 23:31:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -719,9 +719,9 @@ parseFromClause(ParseState *pstate, List *frmList)
 *	  list as a "resjunk" node.
 *
 * node		the ORDER BY, GROUP BY, or DISTINCT ON expression to be matched
- * tlist	the existing target list (NB: this cannot be NIL, which is a
+ * tlist	the existing target list (NB: this will never be NIL, which is a
- *			good thing since we'd be unable to append to it...)
+ *			good thing since we'd be unable to append to it if it were...)
- * clause	identifies clause type (mainly for error messages).
+ * clause	identifies clause type being processed.
 */
 static TargetEntry *
 findTargetlistEntry(ParseState *pstate, Node *node, List *tlist, int clause)
@ -733,7 +733,7 @@ findTargetlistEntry(ParseState *pstate, Node *node, List *tlist, int clause)
 	/*----------
 	 * Handle two special cases as mandated by the SQL92 spec:
 	 *
-	 * 1. ORDER BY ColumnName
+	 * 1. Bare ColumnName (no qualifier or subscripts)
 	 *    For a bare identifier, we search for a matching column name
 	 *	  in the existing target list.  Multiple matches are an error
 	 *	  unless they refer to identical values; for example,
@ -741,49 +741,76 @@ findTargetlistEntry(ParseState *pstate, Node *node, List *tlist, int clause)
 	 *	  but not   SELECT a AS b, b FROM table ORDER BY b
 	 *	  If no match is found, we fall through and treat the identifier
 	 *	  as an expression.
-	 *	  We do NOT attempt this match for GROUP BY, since it is clearly
+	 *	  For GROUP BY, it is incorrect to match the grouping item against
-	 *	  contrary to the spec to use an output column name in preference
+	 *	  targetlist entries: according to SQL92, an identifier in GROUP BY
-	 *	  to an underlying column name in GROUP BY.  DISTINCT ON isn't in
+	 *	  is a reference to a column name exposed by FROM, not to a target
-	 *	  the standard, so we can do what we like there; we choose to make
+	 *	  list column.  However, many implementations (including pre-7.0
-	 *	  it work like GROUP BY.
+	 *	  PostgreSQL) accept this anyway.  So for GROUP BY, we look first
 	 *	  to see if the identifier matches any FROM column name, and only
 	 *	  try for a targetlist name if it doesn't.  This ensures that we
 	 *	  adhere to the spec in the case where the name could be both.
 	 *	  DISTINCT ON isn't in the standard, so we can do what we like there;
 	 *	  we choose to make it work like ORDER BY, on the rather flimsy
 	 *	  grounds that ordinary DISTINCT works on targetlist entries.
 	 *
-	 * 2. ORDER BY/GROUP BY/DISTINCT ON IntegerConstant
+	 * 2. IntegerConstant
 	 *	  This means to use the n'th item in the existing target list.
 	 *	  Note that it would make no sense to order/group/distinct by an
 	 *	  actual constant, so this does not create a conflict with our
 	 *	  extension to order/group by an expression.
-	 *	  I believe that GROUP BY column-number is not sanctioned by SQL92,
+	 *	  GROUP BY column-number is not allowed by SQL92, but since
-	 *	  but since the standard has no other behavior defined for this
+	 *	  the standard has no other behavior defined for this syntax,
-	 *	  syntax, we may as well continue to support our past behavior.
+	 *	  we may as well accept this common extension.
 	 *
-	 * Note that pre-existing resjunk targets must not be used in either case.
+	 * Note that pre-existing resjunk targets must not be used in either case,
 	 * since the user didn't write them in his SELECT list.
 	 *
 	 * If neither special case applies, fall through to treat the item as
 	 * an expression.
 	 *----------
 	 */
-	if (clause == ORDER_CLAUSE &&
+	if (IsA(node, Ident) && ((Ident *) node)->indirection == NIL)
 		IsA(node, Ident) && ((Ident *) node)->indirection == NIL)
 	{
 		char	   *name = ((Ident *) node)->name;
-		foreach(tl, tlist)
+
 		if (clause == GROUP_CLAUSE)
 		{
-			TargetEntry *tle = (TargetEntry *) lfirst(tl);
+			/*
-			Resdom	   *resnode = tle->resdom;
+			 * In GROUP BY, we must prefer a match against a FROM-clause
 			 * column to one against the targetlist.  Look to see if there is
 			 * a matching column.  If so, fall through to let transformExpr()
 			 * do the rest.  NOTE: if name could refer ambiguously to more
 			 * than one column name exposed by FROM, colnameRangeTableEntry
 			 * will elog(ERROR).  That's just what we want here.
 			 */
 			if (colnameRangeTableEntry(pstate, name) != NULL)
 				name = NULL;
 		}
-			if (!resnode->resjunk &&
+		if (name != NULL)
-				strcmp(resnode->resname, name) == 0)
+		{
 			foreach(tl, tlist)
 			{
-				if (target_result != NULL)
+				TargetEntry *tle = (TargetEntry *) lfirst(tl);
 				Resdom	   *resnode = tle->resdom;
 				if (!resnode->resjunk &&
 					strcmp(resnode->resname, name) == 0)
 				{
-					if (! equal(target_result->expr, tle->expr))
+					if (target_result != NULL)
-						elog(ERROR, "%s '%s' is ambiguous",
+					{
-							 clauseText[clause], name);
+						if (! equal(target_result->expr, tle->expr))
 							elog(ERROR, "%s '%s' is ambiguous",
 								 clauseText[clause], name);
 					}
 					else
 						target_result = tle;
 					/* Stay in loop to check for ambiguity */
 				}
 				else
 					target_result = tle;
 				/* Stay in loop to check for ambiguity */
 			}
 			if (target_result != NULL)
 				return target_result; /* return the first match */
 		}
 		if (target_result != NULL)
 			return target_result; /* return the first match */
 	}
 	if (IsA(node, A_Const))
 	{
--- a/src/test/regress/expected/numerology.out
+++ b/src/test/regress/expected/numerology.out
@ -96,14 +96,18 @@ SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
   2 |                    0 | -1.2345678901234e+200
 (2 rows)
-- Postgres used to accept this, but it is clearly against SQL92 to
+-- GROUP BY a result column name is not legal per SQL92, but we accept it
-- interpret GROUP BY arguments as result column names; they should
+-- anyway (if the name is not the name of any column exposed by FROM).
 -- be source column names *only*.  An error is expected.
 SELECT f1 AS two, max(f3) AS max_float, min(f3) AS min_float
  FROM TEMP_GROUP
  GROUP BY two
  ORDER BY two, max_float, min_float;
-ERROR:  Attribute 'two' not found
+ two |      max_float       |       min_float       
 -----+----------------------+-----------------------
   1 | 1.2345678901234e+200 |                     0
   2 |                    0 | -1.2345678901234e+200
 (2 rows)
 SELECT f1 AS two, (max(f3) + 1) AS max_plus_1, (min(f3) - 1) AS min_minus_1
  FROM TEMP_GROUP
  GROUP BY f1
--- a/src/test/regress/sql/numerology.sql
+++ b/src/test/regress/sql/numerology.sql
@ -70,9 +70,8 @@ SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
  GROUP BY f1
  ORDER BY two, max_float, min_float;
-- Postgres used to accept this, but it is clearly against SQL92 to
+-- GROUP BY a result column name is not legal per SQL92, but we accept it
-- interpret GROUP BY arguments as result column names; they should
+-- anyway (if the name is not the name of any column exposed by FROM).
 -- be source column names *only*.  An error is expected.
 SELECT f1 AS two, max(f3) AS max_float, min(f3) AS min_float
  FROM TEMP_GROUP
  GROUP BY two