New file format for COPY BINARY, in accordance with pghackers discussions

of early December 2000. COPY BINARY is now TOAST-safe.
25 years ago · 676cf18c5b
parent 8fd2e269f7
commit 676cf18c5b
2 changed files with 567 additions and 346 deletions
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.18 2000/10/05 19:48:17 momjian Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.19 2001/01/03 20:04:09 tgl Exp $
 Postgres documentation
 -->

@ -49,6 +49,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
       <para>
 	Changes the behavior of field formatting, forcing all data to be
 	stored or read in binary format rather than as text.
+	The DELIMITERS and WITH NULL options are irrelevant for binary format.
       </para>
      </listitem>
     </varlistentry>
@ -66,7 +67,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
      <term>WITH OIDS</term>
      <listitem>
       <para>
-	Copies the internal unique object id (OID) for each row.
+	Specifies copying the internal unique object id (OID) for each row.
       </para>
      </listitem>
     </varlistentry>
@ -84,7 +85,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
      <term><filename>stdin</filename></term>
      <listitem>
       <para>
-	Specifies that input comes from a pipe or terminal.
+	Specifies that input comes from the client application.
       </para>
      </listitem>
     </varlistentry>
@ -93,7 +94,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
      <term><filename>stdout</filename></term>
      <listitem>
       <para>
-	Specifies that output goes to a pipe or terminal.
+	Specifies that output goes to the client application.
       </para>
      </listitem>
     </varlistentry>
@ -102,16 +103,16 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
      <term><replaceable class="parameter">delimiter</replaceable></term>
      <listitem>
       <para>
-	A character that delimits the input or output fields.
+	The character that separates fields within each row (line) of the file.
       </para>
      </listitem>
     </varlistentry>

     <varlistentry>
-      <term><replaceable class="parameter">null print</replaceable></term>
+      <term><replaceable class="parameter">null string</replaceable></term>
      <listitem>
       <para>
-        A string to represent NULL values. The default is
+        The string that represents a NULL value. The default is
        <quote><literal>\N</literal></quote> (backslash-N).
 	You might prefer an empty string, for example.
       </para>
@ -166,7 +167,7 @@ ERROR: <replaceable>reason</replaceable>
 
 <refsect1 id="R1-SQL-COPY-1">
  <refsect1info>
-   <date>1998-09-08</date>
+   <date>2001-01-02</date>
  </refsect1info>
  <title>
   Description
@ -176,17 +177,36 @@ ERROR: <replaceable>reason</replaceable>
   <productname>Postgres</productname> tables and
   standard file-system files.

+   <command>COPY TO</command> copies the entire contents of a table to
+   a file, while <command>COPY FROM</command> copies data from a file to a
+   table (appending the data to whatever is in the table already).
+  </para>
+
+  <para>
   <command>COPY</command> instructs
   the <productname>Postgres</productname> backend
-   to directly read from or write to a file. The file must be directly visible to
-   the backend and the name must be specified from the viewpoint of the backend.
-   If <filename>stdin</filename> or <filename>stdout</filename> are
+   to directly read from or write to a file. If a file name is specified,
+   the file must be accessible to the backend and the name must be specified
+   from the viewpoint of the backend.
+   If <filename>stdin</filename> or <filename>stdout</filename> is
   specified, data flows through the client frontend to  the backend.
-  </para>
+    
+    <tip>
+     <para>
+      Do not confuse <command>COPY</command> with the
+      <application>psql</application> instruction <command>\copy</command>.
+      <command>\copy</command> invokes <command>COPY FROM stdin</command> 
+      or <command>COPY TO stdout</command>, and then fetches/stores the data
+      in a file accessible to the <application>psql</application> client.
+      Thus, file accessibility and access rights depend on the client
+      rather than the backend when <command>\copy</command> is used.
+     </para>
+    </tip>
+   </para>

  <refsect2 id="R2-SQL-COPY-3">
   <refsect2info>
-    <date>1998-09-08</date>
+    <date>2001-01-02</date>
   </refsect2info>
   <title>
    Notes
@ -194,16 +214,19 @@ ERROR: <replaceable>reason</replaceable>
   <para>
    The BINARY keyword will force all data to be
    stored/read as binary format rather than as text.  It is
-    somewhat faster than the normal copy command, but is not
-    generally portable, and the files generated are somewhat larger,
-    although this factor is highly dependent on the data itself.  
+    somewhat faster than the normal copy command, but a binary copy
+    file is not portable across machine architectures.
    </para>
+
    <para>
-    By default, a text copy uses a tab ("\t") character as a delimiter.
-    The delimiter may also be changed to any other single character
-    with the keyword phrase USING DELIMITERS.  Characters
+    By default, a text copy uses a tab ("\t") character as a delimiter
+    between fields.  The field delimiter may be changed to any other single
+    character with the keyword phrase USING DELIMITERS.  Characters
    in data fields which happen to match the delimiter character will
    be backslash quoted.
+    Note that the delimiter is always a single character.
+    If multiple characters are specified in the delimiter string,
+    only the first character is used.
   </para>
   
   <para>
@ -217,67 +240,63 @@ ERROR: <replaceable>reason</replaceable>
   </para>

   <para>
-    The keyword phrase USING DELIMITERS specifies a single character
-    to be used for all delimiters between columns. If multiple characters
-    are specified in the delimiter string,  only the first character is
-    used.
-    
-    <tip>
-     <para>
-      Do not confuse <command>COPY</command> with the
-      <application>psql</application> instruction <command>\copy</command>.
-     </para>
-    </tip>
+    <command>COPY FROM</command> neither invokes rules nor acts on column
+    defaults.  It does invoke triggers and check constraints.
   </para>

-   <para>
-    <command>COPY</command> neither invokes rules nor acts on column defaults.
-    It does invoke triggers, however.
-   </para>
   <para>
    <command>COPY</command> stops operation at the first error.  This
    should not lead to problems in the event of
    a <command>COPY TO</command>, but the
-    target relation will, of course, be partially modified in a
-    <command>COPY FROM</command>.
-    <command>VACUUM</command> should be used to clean up
-    after a failed copy.
-   </para>
-   <para>
-    Because the Postgres backend's current working directory
-    is not usually the same as the user's
-    working directory, the result of copying to a file
-    "<filename>foo</filename>" (without
-    additional path information) may yield unexpected results for the
-    naive user.  In this case, <filename>foo</filename>
-    will wind up in <filename>$PGDATA/foo</filename>.  In
-    general, the full pathname as it would appear to the backend server machine
-    should be used when specifying files to
-    be copied.
+    target relation will already have received earlier rows in a
+    <command>COPY FROM</command>.  These rows will not be visible or
+    accessible, but they still occupy disk space.  This can add up to a
+    considerable amount
+    of wasted disk space if the failure happened well into a large copy
+    operation.  You may wish to invoke <command>VACUUM</command> to recover
+    the wasted space.
   </para>
+
   <para>
-    Files used as arguments to <command>COPY</command>
-    must reside on or be
-    accessible to the database server machine by being either on
-    local disks or on a networked file system.
+    Files named in a <command>COPY</command> command are read or written
+    directly by the backend, not by the client application.  Therefore,
+    they must reside on or be accessible to the database server machine,
+    not the client.  They must be accessible to and readable or writable
+    by the Postgres user (the userid the backend runs as), not the client.
+    <command>COPY</command> naming a file is only allowed to database
+    superusers, since it allows reading or writing any file that the
+    backend has privileges to access.
+    
+    <tip>
+     <para>
+      The
+      <application>psql</application> instruction <command>\copy</command>
+      reads or writes files on the client machine with the client's
+      permissions, so it is not restricted to superusers.
+     </para>
+    </tip>
   </para>
+
   <para>
-    When a TCP/IP connection from one machine to another is used, and a
-    target file is specified, the target file will be written on the
-    machine where the backend is running rather than the user's
-    machine. 
+    It is recommended that the filename used in <command>COPY</command>
+    always be specified as an absolute path.  This is enforced by the backend
+    in the case of <command>COPY TO</command>, but for <command>COPY
+    FROM</command> you do have the option of reading from a file specified
+    by a relative path.  The path will be interpreted relative to the
+    backend's working directory (somewhere below
+    <filename>$PGDATA</filename>), not the client's working directory.
   </para>
  </refsect2>
 </refsect1>
 
 <refsect1 id="R1-SQL-COPY-2">
  <refsect1info>
-   <date>1998-05-04</date>
+   <date>2001-01-02</date>
  </refsect1info>
  <title>File Formats</title>
  <refsect2>
   <refsect2info>
-    <date>1998-05-04</date>
+    <date>2001-01-02</date>
   </refsect2info>
   <title>Text Format</title>
   <para>
@ -293,27 +312,34 @@ ERROR: <replaceable>reason</replaceable>
   <para>
    The actual format for each instance is
    <programlisting>
-&lt;attr1&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr2&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;...&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr<replaceable class="parameter">n</replaceable>&gt;&lt;newline&gt;.
+&lt;attr1&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr2&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;...&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr<replaceable class="parameter">n</replaceable>&gt;&lt;newline&gt;
    </programlisting>
-    The oid is placed on the beginning of the line
-    if WITH OIDS is specified.
+    Note that the end of each row is marked by a Unix-style newline
+    ("\n").  <command>COPY FROM</command> will not behave as desired
+    if given a file containing DOS- or Mac-style newlines.
+   </para>
+   <para>
+    The OID is emitted as the first column if WITH OIDS is specified.
   </para>
   <para>
-    If <command>COPY</command> is sending its output to standard
-    output instead of a file, it will send a backslash("\") and a period
-    (".")  followed immediately by a newline, on a separate line,
-    when it is done.  Similarly, if <command>COPY</command> is reading
+    If <command>COPY TO</command> is sending its output to standard
+    output instead of a file, after the last row it will send a backslash ("\")
+    and a period (".") followed by a newline.
+    Similarly, if <command>COPY FROM</command> is reading
    from standard input, it will expect a backslash ("\") and a period
    (".") followed by a newline, as the first three characters on a
-    line to denote end-of-file.  However, <command>COPY</command>
-    will terminate (followed by the backend itself) if a true EOF is
-    encountered before this special end-of-file pattern is found.
+    line to denote end-of-file.  However, <command>COPY FROM</command>
+    will terminate correctly (followed by the backend itself) if the
+    input connection is closed before this special end-of-file pattern is
+    found.
   </para>
   <para>
    The backslash character has other special meanings.  A literal backslash
    character is represented as two
    consecutive backslashes ("\\").  A literal tab character is represented
-    as a backslash and a tab.  A literal newline character is
+    as a backslash and a tab.  (If you are using something other than tab
+    as the column delimiter, backslash that delimiter character to include
+    it in data.)  A literal newline character is
    represented as a backslash and a newline.  When loading text data
    not generated by <acronym>Postgres</acronym>,
    you will need to convert backslash
@ -324,82 +350,207 @@ ERROR: <replaceable>reason</replaceable>

  <refsect2>
   <refsect2info>
-    <date>1998-05-04</date>
+    <date>2001-01-02</date>
   </refsect2info>
   <title>Binary Format</title>
   <para>
-    In the case of <command>COPY BINARY</command>, the first four
-    bytes in the file will be the number of instances in the file.  If
-    this number is zero, the <command>COPY BINARY</command> command
-    will read until end-of-file is encountered.  Otherwise, it will
-    stop reading when this number of instances has been read.
-    Remaining data in the file will be ignored.
-   </para>
-   <para>
-    The format for each instance in the file is as follows.  Note that
-    this format must be followed <emphasis>exactly</emphasis>.
-    Unsigned four-byte integer quantities are called uint32 in the
-    table below.
-   </para>
-   <table frame="all">
-    <title>Contents of a binary copy file</title>
-    <tgroup cols="2" colsep="1" rowsep="1" align="center">
-     <colspec colname="col1">
-     <colspec colname="col2">
-     <spanspec namest="col1" nameend="col2" spanname="subhead">
-     <tbody>
-      <row>
-       <entry spanname="subhead" align="center">At the start of the file</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>number of tuples</entry>
-      </row>
-      <row>
-       <entry spanname="subhead" align="center">For each tuple</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>total length of tuple data</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>oid (if specified)</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>number of null attributes</entry>
-      </row>
-      <row>
-       <entry>[uint32,...,uint32]</entry>
-       <entry>attribute numbers of attributes, counting from 0</entry>
-      </row>
-      <row>
-       <entry>-</entry>
-       <entry>&lt;tuple data&gt;</entry>
-      </row>
-     </tbody>
-    </tgroup>
-   </table>
-   
-  </refsect2>
-  <refsect2>
-   <refsect2info>
-    <date>1998-05-04</date>
-   </refsect2info>
-   <title>Alignment of Binary Data</title>
-   <para>
-    On Sun-3s, 2-byte attributes are aligned on two-byte boundaries,
-    and all larger attributes are aligned on four-byte boundaries.
-    Character attributes are aligned on single-byte boundaries.  On
-    most other machines, all attributes larger than 1 byte are aligned on
-    four-byte boundaries.  Note that variable length attributes are
-    preceded by the attribute's length; arrays are simply contiguous
-    streams of the array element type.
+    The file format used for <command>COPY BINARY</command> changed in
+    Postgres v7.1.  The new format consists of a file header, zero or more
+    tuples, and a file trailer.
   </para>
+
+   <refsect3>
+    <refsect3info>
+     <date>2001-01-02</date>
+    </refsect3info>
+    <title>
+     File Header
+    </title>
+    <para>
+     The file header consists of 24 bytes of fixed fields, followed
+     by a variable-length header extension area.  The fixed fields are:
+
+    <variablelist>
+     <varlistentry>
+      <term>Signature</term>
+      <listitem>
+       <para>
+12-byte sequence "PGBCOPY\n\377\r\n\0" --- note that the null
+is a required part of the signature.  (The signature is designed to allow
+easy identification of files that have been munged by a non-8-bit-clean
+transfer.  This signature will be changed by newline-translation
+filters, dropped nulls, dropped high bits, or parity changes.)
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>Integer layout field</term>
+      <listitem>
+       <para>
+int32 constant 0x01020304 in source's byte order.
+Potentially, a reader could engage in byte-flipping of subsequent fields
+if the wrong byte order is detected here.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>Flags field</term>
+      <listitem>
+       <para>
+int32 bit mask to denote important aspects of the file
+format.  Bits are numbered from 0 (LSB) to 31 (MSB) --- note that this
+field is stored with source's endianness, as are all subsequent integer
+fields.  Bits 16-31 are reserved to denote critical file format issues;
+a reader should abort if it finds an unexpected bit set in this range.
+Bits 0-15 are reserved to signal backwards-compatible format issues;
+a reader should simply ignore any unexpected bits set in this range.
+Currently only one flag bit is defined, and the rest must be zero:
+        <variablelist>
+         <varlistentry>
+          <term>Bit 16</term>
+          <listitem>
+           <para>
+            if 1, OIDs are included in the dump; if 0, not
+           </para>
+          </listitem>
+         </varlistentry>
+        </variablelist>
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>Header extension area length</term>
+      <listitem>
+       <para>
+int32 length in bytes of remainder of header, not including self.  In
+the initial version this will be zero, and the first tuple follows
+immediately.  Future changes to the format might allow additional data
+to be present in the header.  A reader should silently skip over any header
+extension data it does not know what to do with.
+       </para>
+      </listitem>
+     </varlistentry>
+    </variablelist>
+    </para>
+
+    <para>
+The header extension area is envisioned to contain a sequence of
+self-identifying chunks.  The flags field is not intended to tell readers
+what is in the extension area.  Specific design of header extension contents
+is left for a later release.
+    </para>
+
+    <para>
+     This design allows for both backwards-compatible header additions (add
+     header extension chunks, or set low-order flag bits) and
+     non-backwards-compatible changes (set high-order flag bits to signal such
+     changes, and add supporting data to the extension area if needed).
+    </para>
+   </refsect3>
+
+   <refsect3>
+    <refsect3info>
+     <date>2001-01-02</date>
+    </refsect3info>
+    <title>
+     Tuples
+    </title>
+    <para>
+Each tuple begins with an int16 count of the number of fields in the
+tuple.  (Presently, all tuples in a table will have the same count, but
+that might not always be true.)  Then, repeated for each field in the
+tuple, there is an int16 typlen word possibly followed by field data.
+The typlen field is interpreted thus:
+
+    <variablelist>
+     <varlistentry>
+      <term>Zero</term>
+      <listitem>
+       <para>
+	Field is NULL.  No data follows.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>&gt; 0</term>
+      <listitem>
+       <para>
+        Field is a fixed-length datatype.  Exactly typlen
+	bytes of data follow the typlen word.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>-1</term>
+      <listitem>
+       <para>
+	Field is a varlena datatype.  The next four
+	bytes are the varlena header, which contains
+	the total value length including itself.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>&lt; -1</term>
+      <listitem>
+       <para>
+	Reserved for future use.
+       </para>
+      </listitem>
+     </varlistentry>
+    </variablelist>
+    </para>
+
+    <para>
+For non-NULL fields, the reader can check that the typlen matches the
+expected typlen for the destination column.  This provides a simple
+but very useful check that the data is as expected.
+    </para>
+
+    <para>
+There is no alignment padding or any other extra data between fields.
+Note also that the format does not distinguish whether a datatype is
+pass-by-reference or pass-by-value.  Both of these provisions are
+deliberate: they might help improve portability of the files (although
+of course endianness and floating-point-format issues can still keep
+you from moving a binary file across machines).
+    </para>
+
+    <para>
+If OIDs are included in the dump, the OID field immediately follows the
+field-count word.  It is a normal field except that it's not included
+in the field-count.  In particular it has a typlen --- this will allow
+handling of 4-byte vs 8-byte OIDs without too much pain, and will allow
+OIDs to be shown as NULL if we someday allow OIDs to be optional.
+    </para>
+   </refsect3>
+
+   <refsect3>
+    <refsect3info>
+     <date>2001-01-02</date>
+    </refsect3info>
+    <title>
+     File Trailer
+    </title>
+    <para>
+     The file trailer consists of an int16 word containing -1.  This is
+     easily distinguished from a tuple's field-count word.
+    </para>
+
+    <para>
+     A reader should report an error if a field-count word is neither -1
+     nor the expected number of columns.  This provides an extra
+     check against somehow getting out of sync with the data.
+    </para>
+   </refsect3>
  </refsect2>
 </refsect1>
-
 
 <refsect1 id="R1-SQL-COPY-3">
  <title>
@ -407,7 +558,7 @@ ERROR: <replaceable>reason</replaceable>
  </title>
  <para>
 The following example copies a table to standard output,
- using a pipe (|) as the field
+ using a vertical bar (|) as the field
 delimiter:
  </para>
  <programlisting>
@ -425,36 +576,36 @@ COPY country FROM '/usr1/proj/bray/sql/country_data';
 has the termination sequence on the last line):
  </para>
  <programlisting>
-   AF      AFGHANISTAN
-   AL      ALBANIA
-   DZ      ALGERIA
-   ...
-   ZM      ZAMBIA
-   ZW      ZIMBABWE
-   \.
+AF      AFGHANISTAN
+AL      ALBANIA
+DZ      ALGERIA
+ZM      ZAMBIA
+ZW      ZIMBABWE
+\.
  </programlisting>
  <para>
-   The following is the same data, output in binary format on a Linux/i586 machine.
-   The data is shown after filtering through
- the Unix utility <command>od -c</command>. The table has
-   three fields; the first is <classname>char(2)</classname>
- and the second is <classname>text</classname>. All the
+   Note that the white space on each line is actually a TAB.
+  </para>
+  <para>
+   The following is the same data, output in binary format on a Linux/i586
+   machine. The data is shown after filtering through
+   the Unix utility <command>od -c</command>. The table has
+   three fields; the first is <classname>char(2)</classname>,
+   the second is <classname>text</classname>, and the third is
+   <classname>int4</classname>. All the
   rows have a null value in the third field.
-  Notice how the <classname>char(2)</classname>
-   field is padded with nulls to four bytes and the text field is
-   preceded by its length:
  </para>
  <programlisting>
-   355  \0  \0  \0 027  \0  \0  \0 001  \0  \0  \0 002  \0  \0  \0
-   006  \0  \0  \0   A   F  \0  \0 017  \0  \0  \0   A   F   G   H
-     A   N   I   S   T   A   N 023  \0  \0  \0 001  \0  \0  \0 002
-    \0  \0  \0 006  \0  \0  \0   A   L  \0  \0  \v  \0  \0  \0   A
-     L   B   A   N   I   A 023  \0  \0  \0 001  \0  \0  \0 002  \0
-    \0  \0 006  \0  \0  \0   D   Z  \0  \0  \v  \0  \0  \0   A   L
-     G   E   R   I   A
-   ...              \n  \0  \0  \0   Z   A   M   B   I   A 024  \0
-    \0  \0 001  \0  \0  \0 002  \0  \0  \0 006  \0  \0  \0   Z   W
-    \0  \0  \f  \0  \0  \0   Z   I   M   B   A   B   W   E
+0000000   P   G   B   C   O   P   Y  \n 377  \r  \n  \0 004 003 002 001
+0000020  \0  \0  \0  \0  \0  \0  \0  \0 003  \0 377 377 006  \0  \0  \0
+0000040   A   F 377 377 017  \0  \0  \0   A   F   G   H   A   N   I   S
+0000060   T   A   N  \0  \0 003  \0 377 377 006  \0  \0  \0   A   L 377
+0000100 377  \v  \0  \0  \0   A   L   B   A   N   I   A  \0  \0 003  \0
+0000120 377 377 006  \0  \0  \0   D   Z 377 377  \v  \0  \0  \0   A   L
+0000140   G   E   R   I   A  \0  \0 003  \0 377 377 006  \0  \0  \0   Z
+0000160   M 377 377  \n  \0  \0  \0   Z   A   M   B   I   A  \0  \0 003
+0000200  \0 377 377 006  \0  \0  \0   Z   W 377 377  \f  \0  \0  \0   Z
+0000220   I   M   B   A   B   W   E  \0  \0 377 377
  </programlisting>
 </refsect1>
 
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.126 2000/12/27 23:59:14 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.127 2001/01/03 20:04:10 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -52,7 +52,8 @@ static Oid	GetTypeElement(Oid type);
 static void CopyReadNewline(FILE *fp, int *newline);
 static char *CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_print);
 static void CopyAttributeOut(FILE *fp, char *string, char *delim);
-static int	CountTuples(Relation relation);
+
+static const char BinarySignature[12] = "PGBCOPY\n\377\r\n\0";

 /*
 * Static communication variables ... pretty grotty, but COPY has
@ -387,7 +388,8 @@ DoCopy(char *relname, bool binary, bool oids, bool from, bool pipe,
 * Copy from relation TO file.
 */
 static void
-CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_print)
+CopyTo(Relation rel, bool binary, bool oids, FILE *fp,
+	   char *delim, char *null_print)
 {
 	HeapTuple	tuple;
 	TupleDesc	tupDesc;
@ -398,20 +400,9 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
 	FmgrInfo   *out_functions;
 	Oid		   *elements;
 	bool	   *isvarlena;
-	int32	   *typmod;
-	char	   *nulls;
-
-	/*
-	 * <nulls> is a (dynamically allocated) array with one character per
-	 * attribute in the instance being copied.	nulls[I-1] is 'n' if
-	 * Attribute Number I is null, and ' ' otherwise.
-	 *
-	 * <nulls> is meaningful only if we are doing a binary copy.
-	 */
+	int16		fld_size;
 	char	   *string;

-	scandesc = heap_beginscan(rel, 0, QuerySnapshot, 0, NULL);
-
 	tupDesc = rel->rd_att;
 	attr_count = rel->rd_att->natts;
 	attr = rel->rd_att->attrs;
@ -420,7 +411,6 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
 	out_functions = (FmgrInfo *) palloc(attr_count * sizeof(FmgrInfo));
 	elements = (Oid *) palloc(attr_count * sizeof(Oid));
 	isvarlena = (bool *) palloc(attr_count * sizeof(bool));
-	typmod = (int32 *) palloc(attr_count * sizeof(int32));
 	for (i = 0; i < attr_count; i++)
 	{
 		Oid			out_func_oid;
@ -430,40 +420,62 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
 			elog(ERROR, "COPY: couldn't lookup info for type %u",
 				 attr[i]->atttypid);
 		fmgr_info(out_func_oid, &out_functions[i]);
-		typmod[i] = attr[i]->atttypmod;
 	}

-	if (!binary)
+	if (binary)
 	{
-		nulls = NULL;			/* meaningless, but compiler doesn't know
-								 * that */
+		/* Generate header for a binary copy */
+		int32		tmp;
+
+		/* Signature */
+		CopySendData((char *) BinarySignature, 12, fp);
+		/* Integer layout field */
+		tmp = 0x01020304;
+		CopySendData(&tmp, sizeof(int32), fp);
+		/* Flags field */
+		tmp = 0;
+		if (oids)
+			tmp |= (1 << 16);
+		CopySendData(&tmp, sizeof(int32), fp);
+		/* No header extension */
+		tmp = 0;
+		CopySendData(&tmp, sizeof(int32), fp);
 	}
-	else
-	{
-		int32		ntuples;

-		nulls = (char *) palloc(attr_count);
-		for (i = 0; i < attr_count; i++)
-			nulls[i] = ' ';
-
-		/* XXX expensive */
-
-		ntuples = CountTuples(rel);
-		CopySendData(&ntuples, sizeof(int32), fp);
-	}
+	scandesc = heap_beginscan(rel, 0, QuerySnapshot, 0, NULL);

 	while (HeapTupleIsValid(tuple = heap_getnext(scandesc, 0)))
 	{
+		bool		need_delim = false;
+
 		if (QueryCancel)
 			CancelQuery();

-		if (oids && !binary)
+		if (binary)
+		{
+			/* Binary per-tuple header */
+			int16	fld_count = attr_count;
+
+			CopySendData(&fld_count, sizeof(int16), fp);
+			/* Send OID if wanted --- note fld_count doesn't include it */
+			if (oids)
+			{
+				fld_size = sizeof(Oid);
+				CopySendData(&fld_size, sizeof(int16), fp);
+				CopySendData(&tuple->t_data->t_oid, sizeof(Oid), fp);
+			}
+		}
+		else
 		{
-			string = DatumGetCString(DirectFunctionCall1(oidout,
-									 ObjectIdGetDatum(tuple->t_data->t_oid)));
-			CopySendString(string, fp);
-			CopySendChar(delim[0], fp);
-			pfree(string);
+			/* Text format has no per-tuple header, but send OID if wanted */
+			if (oids)
+			{
+				string = DatumGetCString(DirectFunctionCall1(oidout,
+									ObjectIdGetDatum(tuple->t_data->t_oid)));
+				CopySendString(string, fp);
+				pfree(string);
+				need_delim = true;
+			}
 		}

 		for (i = 0; i < attr_count; i++)
@ -474,18 +486,31 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p

 			origvalue = heap_getattr(tuple, i + 1, tupDesc, &isnull);

+			if (!binary)
+			{
+				if (need_delim)
+					CopySendChar(delim[0], fp);
+				need_delim = true;
+			}
+
 			if (isnull)
 			{
 				if (!binary)
+				{
 					CopySendString(null_print, fp);	/* null indicator */
+				}
 				else
-					nulls[i] = 'n';
+				{
+					fld_size = 0; /* null marker */
+					CopySendData(&fld_size, sizeof(int16), fp);
+				}
 			}
 			else
 			{
 				/*
 				 * If we have a toasted datum, forcibly detoast it to avoid
-				 * memory leakage inside the type's output routine.
+				 * memory leakage inside the type's output routine (or
+				 * for binary case, because we must output untoasted value).
 				 */
 				if (isvarlena[i])
 					value = PointerGetDatum(PG_DETOAST_DATUM(origvalue));
@ -495,75 +520,71 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
 				if (!binary)
 				{
 					string = DatumGetCString(FunctionCall3(&out_functions[i],
-												value,
-												ObjectIdGetDatum(elements[i]),
-												Int32GetDatum(typmod[i])));
+										value,
+										ObjectIdGetDatum(elements[i]),
+										Int32GetDatum(attr[i]->atttypmod)));
 					CopyAttributeOut(fp, string, delim);
 					pfree(string);
 				}
+				else
+				{
+					fld_size = attr[i]->attlen;
+					CopySendData(&fld_size, sizeof(int16), fp);
+					if (isvarlena[i])
+					{
+						/* varlena */
+						Assert(fld_size == -1);
+						CopySendData(DatumGetPointer(value),
+									 VARSIZE(value),
+									 fp);
+					}
+					else if (!attr[i]->attbyval)
+					{
+						/* fixed-length pass-by-reference */
+						Assert(fld_size > 0);
+						CopySendData(DatumGetPointer(value),
+									 fld_size,
+									 fp);
+					}
+					else
+					{
+						/* pass-by-value */
+						Datum		datumBuf;
+
+						/*
+						 * We need this horsing around because we don't know
+						 * how shorter data values are aligned within a Datum.
+						 */
+						store_att_byval(&datumBuf, value, fld_size);
+						CopySendData(&datumBuf,
+									 fld_size,
+									 fp);
+					}
+				}

 				/* Clean up detoasted copy, if any */
 				if (value != origvalue)
 					pfree(DatumGetPointer(value));
 			}
-
-			if (!binary)
-			{
-				if (i == attr_count - 1)
-					CopySendChar('\n', fp);
-				else
-				{
-
-					/*
-					 * when copying out, only use the first char of the
-					 * delim string
-					 */
-					CopySendChar(delim[0], fp);
-				}
-			}
 		}

-		if (binary)
-		{
-			int32		null_ct = 0,
-						length;
+		if (!binary)
+			CopySendChar('\n', fp);
+	}

-			for (i = 0; i < attr_count; i++)
-			{
-				if (nulls[i] == 'n')
-					null_ct++;
-			}
+	heap_endscan(scandesc);

-			length = tuple->t_len - tuple->t_data->t_hoff;
-			CopySendData(&length, sizeof(int32), fp);
-			if (oids)
-				CopySendData((char *) &tuple->t_data->t_oid, sizeof(int32), fp);
+	if (binary)
+	{
+		/* Generate trailer for a binary copy */
+		int16	fld_count = -1;

-			CopySendData(&null_ct, sizeof(int32), fp);
-			if (null_ct > 0)
-			{
-				for (i = 0; i < attr_count; i++)
-				{
-					if (nulls[i] == 'n')
-					{
-						CopySendData(&i, sizeof(int32), fp);
-						nulls[i] = ' ';
-					}
-				}
-			}
-			CopySendData((char *) tuple->t_data + tuple->t_data->t_hoff,
-						 length, fp);
-		}
+		CopySendData(&fld_count, sizeof(int16), fp);
 	}

-	heap_endscan(scandesc);
-
 	pfree(out_functions);
 	pfree(elements);
 	pfree(isvarlena);
-	pfree(typmod);
-	if (binary)
-		pfree(nulls);
 }


@ -580,27 +601,20 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 	AttrNumber	attr_count;
 	FmgrInfo   *in_functions;
 	Oid		   *elements;
-	int32	   *typmod;
 	int			i;
 	Oid			in_func_oid;
 	Datum	   *values;
 	char	   *nulls;
 	bool		isnull;
 	int			done = 0;
-	char	   *string = NULL,
-			   *ptr;
-	int32		len,
-				null_ct,
-				null_id;
-	int32		ntuples,
-				tuples_read = 0;
-	bool		reading_to_eof = true;
+	char	   *string;
 	ResultRelInfo *resultRelInfo;
 	EState	   *estate = CreateExecutorState();	/* for ExecConstraints() */
 	TupleTable	tupleTable;
 	TupleTableSlot *slot;
 	Oid			loaded_oid = InvalidOid;
 	bool		skip_tuple = false;
+	bool		file_has_oids;

 	tupDesc = RelationGetDescr(rel);
 	attr = tupDesc->attrs;
@ -630,31 +644,58 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 	{
 		in_functions = (FmgrInfo *) palloc(attr_count * sizeof(FmgrInfo));
 		elements = (Oid *) palloc(attr_count * sizeof(Oid));
-		typmod = (int32 *) palloc(attr_count * sizeof(int32));
 		for (i = 0; i < attr_count; i++)
 		{
 			in_func_oid = (Oid) GetInputFunction(attr[i]->atttypid);
 			fmgr_info(in_func_oid, &in_functions[i]);
 			elements[i] = GetTypeElement(attr[i]->atttypid);
-			typmod[i] = attr[i]->atttypmod;
 		}
+		file_has_oids = oids;	/* must rely on user to tell us this... */
 	}
 	else
 	{
+		/* Read and verify binary header */
+		char		readSig[12];
+		int32		tmp;
+
+		/* Signature */
+		CopyGetData(readSig, 12, fp);
+		if (CopyGetEof(fp) ||
+			memcmp(readSig, BinarySignature, 12) != 0)
+			elog(ERROR, "COPY BINARY: file signature not recognized");
+		/* Integer layout field */
+		CopyGetData(&tmp, sizeof(int32), fp);
+		if (CopyGetEof(fp) ||
+			tmp != 0x01020304)
+			elog(ERROR, "COPY BINARY: incompatible integer layout");
+		/* Flags field */
+		CopyGetData(&tmp, sizeof(int32), fp);
+		if (CopyGetEof(fp))
+			elog(ERROR, "COPY BINARY: bogus file header (missing flags)");
+		file_has_oids = (tmp & (1 << 16)) != 0;
+		tmp &= ~ (1 << 16);
+		if ((tmp >> 16) != 0)
+			elog(ERROR, "COPY BINARY: unrecognized critical flags in header");
+		/* Header extension length */
+		CopyGetData(&tmp, sizeof(int32), fp);
+		if (CopyGetEof(fp) ||
+			tmp < 0)
+			elog(ERROR, "COPY BINARY: bogus file header (missing length)");
+		/* Skip extension header, if present */
+		while (tmp-- > 0)
+		{
+			CopyGetData(readSig, 1, fp);
+			if (CopyGetEof(fp))
+				elog(ERROR, "COPY BINARY: bogus file header (wrong length)");
+		}
+
 		in_functions = NULL;
 		elements = NULL;
-		typmod = NULL;
-		CopyGetData(&ntuples, sizeof(int32), fp);
-		if (ntuples != 0)
-			reading_to_eof = false;
 	}

 	values = (Datum *) palloc(attr_count * sizeof(Datum));
 	nulls = (char *) palloc(attr_count * sizeof(char));

-	for (i = 0; i < attr_count; i++)
-		nulls[i] = ' ';
-
 	lineno = 0;
 	fe_eof = false;

@ -668,15 +709,22 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,

 		lineno++;

+		/* Initialize all values for row to NULL */
+		MemSet(values, 0, attr_count * sizeof(Datum));
+		MemSet(nulls, 'n', attr_count * sizeof(char));
+
 		if (!binary)
 		{
 			int			newline = 0;

-			if (oids)
+			if (file_has_oids)
 			{
-				string = CopyReadAttribute(fp, &isnull, delim, &newline, null_print);
-				if (string == NULL)
-					done = 1;
+				string = CopyReadAttribute(fp, &isnull, delim,
+										   &newline, null_print);
+				if (isnull)
+					elog(ERROR, "COPY TEXT: NULL Oid");
+				else if (string == NULL)
+					done = 1;	/* end of file */
 				else
 				{
 					loaded_oid = DatumGetObjectId(DirectFunctionCall1(oidin,
@ -685,22 +733,24 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 						elog(ERROR, "COPY TEXT: Invalid Oid");
 				}
 			}
+
 			for (i = 0; i < attr_count && !done; i++)
 			{
-				string = CopyReadAttribute(fp, &isnull, delim, &newline, null_print);
+				string = CopyReadAttribute(fp, &isnull, delim,
+										   &newline, null_print);
 				if (isnull)
 				{
-					values[i] = PointerGetDatum(NULL);
-					nulls[i] = 'n';
+					/* already set values[i] and nulls[i] */
 				}
 				else if (string == NULL)
-					done = 1;
+					done = 1;	/* end of file */
 				else
 				{
 					values[i] = FunctionCall3(&in_functions[i],
 											  CStringGetDatum(string),
 											  ObjectIdGetDatum(elements[i]),
-											  Int32GetDatum(typmod[i]));
+											  Int32GetDatum(attr[i]->atttypmod));
+					nulls[i] = ' ';
 				}
 			}
 			if (!done)
@ -708,47 +758,103 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 		}
 		else
 		{						/* binary */
-			CopyGetData(&len, sizeof(int32), fp);
-			if (CopyGetEof(fp))
+			int16	fld_count,
+					fld_size;
+
+			CopyGetData(&fld_count, sizeof(int16), fp);
+			if (CopyGetEof(fp) ||
+				fld_count == -1)
 				done = 1;
 			else
 			{
-				if (oids)
+				if (fld_count <= 0 || fld_count > attr_count)
+					elog(ERROR, "COPY BINARY: tuple field count is %d, expected %d",
+						 (int) fld_count, attr_count);
+
+				if (file_has_oids)
 				{
-					CopyGetData(&loaded_oid, sizeof(int32), fp);
+					CopyGetData(&fld_size, sizeof(int16), fp);
+					if (CopyGetEof(fp))
+						elog(ERROR, "COPY BINARY: unexpected EOF");
+					if (fld_size != (int16) sizeof(Oid))
+						elog(ERROR, "COPY BINARY: sizeof(Oid) is %d, expected %d",
+							 (int) fld_size, (int) sizeof(Oid));
+					CopyGetData(&loaded_oid, sizeof(Oid), fp);
+					if (CopyGetEof(fp))
+						elog(ERROR, "COPY BINARY: unexpected EOF");
 					if (loaded_oid == InvalidOid)
 						elog(ERROR, "COPY BINARY: Invalid Oid");
 				}
-				CopyGetData(&null_ct, sizeof(int32), fp);
-				if (null_ct > 0)
+
+				for (i = 0; i < (int) fld_count; i++)
 				{
-					for (i = 0; i < null_ct; i++)
+					CopyGetData(&fld_size, sizeof(int16), fp);
+					if (CopyGetEof(fp))
+						elog(ERROR, "COPY BINARY: unexpected EOF");
+					if (fld_size == 0)
+						continue; /* it's NULL; nulls[i] already set */
+					if (fld_size != attr[i]->attlen)
+						elog(ERROR, "COPY BINARY: sizeof(field %d) is %d, expected %d",
+							 i+1, (int) fld_size, (int) attr[i]->attlen);
+					if (fld_size == -1)
 					{
-						CopyGetData(&null_id, sizeof(int32), fp);
-						nulls[null_id] = 'n';
+						/* varlena field */
+						int32	varlena_size;
+						Pointer	varlena_ptr;
+
+						CopyGetData(&varlena_size, sizeof(int32), fp);
+						if (CopyGetEof(fp))
+							elog(ERROR, "COPY BINARY: unexpected EOF");
+						if (varlena_size < (int32) sizeof(int32))
+							elog(ERROR, "COPY BINARY: bogus varlena length");
+						varlena_ptr = (Pointer) palloc(varlena_size);
+						VARATT_SIZEP(varlena_ptr) = varlena_size;
+						CopyGetData(VARDATA(varlena_ptr),
+									varlena_size - sizeof(int32),
+									fp);
+						if (CopyGetEof(fp))
+							elog(ERROR, "COPY BINARY: unexpected EOF");
+						values[i] = PointerGetDatum(varlena_ptr);
+					}
+					else if (!attr[i]->attbyval)
+					{
+						/* fixed-length pass-by-reference */
+						Pointer	refval_ptr;
+
+						Assert(fld_size > 0);
+						refval_ptr = (Pointer) palloc(fld_size);
+						CopyGetData(refval_ptr, fld_size, fp);
+						if (CopyGetEof(fp))
+							elog(ERROR, "COPY BINARY: unexpected EOF");
+						values[i] = PointerGetDatum(refval_ptr);
+					}
+					else
+					{
+						/* pass-by-value */
+						Datum		datumBuf;
+
+						/*
+						 * We need this horsing around because we don't know
+						 * how shorter data values are aligned within a Datum.
+						 */
+						Assert(fld_size > 0 && fld_size <= sizeof(Datum));
+						CopyGetData(&datumBuf, fld_size, fp);
+						if (CopyGetEof(fp))
+							elog(ERROR, "COPY BINARY: unexpected EOF");
+						values[i] = fetch_att(&datumBuf, true, fld_size);
 					}
-				}
-
-				string = (char *) palloc(len);
-				CopyGetData(string, len, fp);
-
-				ptr = string;

-				for (i = 0; i < attr_count; i++)
-				{
-					if (nulls[i] == 'n')
-						continue;
-					ptr = (char *) att_align((long) ptr, attr[i]->attlen, attr[i]->attalign);
-					values[i] = fetchatt(attr[i], ptr);
-					ptr = att_addlength(ptr, attr[i]->attlen, ptr);
+					nulls[i] = ' ';
 				}
 			}
 		}
+
 		if (done)
-			continue;
+			break;

 		tuple = heap_formtuple(tupDesc, values, nulls);
-		if (oids)
+
+		if (oids && file_has_oids)
 			tuple->t_data->t_oid = loaded_oid;

 		skip_tuple = false;
@ -796,25 +902,13 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 				ExecARInsertTriggers(rel, tuple);
 		}

-		if (binary)
-			pfree(string);
-
 		for (i = 0; i < attr_count; i++)
 		{
 			if (!attr[i]->attbyval && nulls[i] != 'n')
-			{
-				if (!binary)
-					pfree((void *) values[i]);
-			}
-			/* reset nulls[] array for next time */
-			nulls[i] = ' ';
+				pfree(DatumGetPointer(values[i]));
 		}

 		heap_freetuple(tuple);
-		tuples_read++;
-
-		if (!reading_to_eof && ntuples == tuples_read)
-			done = true;
 	}

 	/*
@ -829,7 +923,6 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 	{
 		pfree(in_functions);
 		pfree(elements);
-		pfree(typmod);
 	}

 	ExecDropTupleTable(tupleTable, true);
@ -1099,26 +1192,3 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
 		pfree(string_start);	/* pfree pg_server_to_client result */
 #endif
 }
-
-/*
- * Returns the number of tuples in a relation.	Unfortunately, currently
- * must do a scan of the entire relation to determine this.
- *
- * relation is expected to be an open relation descriptor.
- */
-static int
-CountTuples(Relation relation)
-{
-	HeapScanDesc scandesc;
-	HeapTuple	tuple;
-
-	int			i;
-
-	scandesc = heap_beginscan(relation, 0, QuerySnapshot, 0, NULL);
-
-	i = 0;
-	while (HeapTupleIsValid(tuple = heap_getnext(scandesc, 0)))
-		i++;
-	heap_endscan(scandesc);
-	return i;
-}