Add some tests for encoding conversion in COPY TO/FROM

This adds a couple of tests that trigger encoding conversion in COPY
FROM/TO when the input and server encodings do not match, or when
need_transcoding is set to true in the COPY state data.  These tests
rely on UTF8 <-> LATIN1 for the valid cases as LATIN1 accepts any bytes,
and UTF8 <-> EUC_JP for some of the invalid cases where a character
cannot be understood, causing a conversion failure.

Both ENCODING and client_encoding are covered.  Test suggested by Andres
Freund.

Author: Sutou Kouhei
Discussion: https://postgr.es/m/20240206222445.hzq22pb2nye7rm67@awork3.anarazel.de
pull/194/head
Michael Paquier 9 months ago
parent bf9165bb0c
commit 3ad8b840ce
  1. 46
      src/test/regress/expected/copyencoding.out
  2. 8
      src/test/regress/expected/copyencoding_1.out
  3. 2
      src/test/regress/parallel_schedule
  4. 53
      src/test/regress/sql/copyencoding.sql

@ -0,0 +1,46 @@
--
-- Test cases for encoding with COPY commands
--
-- skip test if not UTF8 server encoding
SELECT getdatabaseencoding() <> 'UTF8'
AS skip_test \gset
\if :skip_test
\quit
\endif
-- directory paths are passed to us in environment variables
\getenv abs_builddir PG_ABS_BUILDDIR
\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
CREATE TABLE copy_encoding_tab (t text);
-- Valid cases
-- Use ENCODING option
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
-- Read UTF8 data as LATIN1: no error
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
-- Use client_encoding
SET client_encoding TO UTF8;
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
-- Read UTF8 data as LATIN1: no error
SET client_encoding TO LATIN1;
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
RESET client_encoding;
-- Invalid cases
-- Use ENCODING explicitly
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
-- Read UTF8 data as EUC_JP: invalid byte sequence error
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
CONTEXT:  COPY copy_encoding_tab, line 1
-- Use client_encoding
SET client_encoding TO UTF8;
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
-- Read UTF8 data as EUC_JP: invalid byte sequence error
SET client_encoding TO EUC_JP;
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
CONTEXT:  COPY copy_encoding_tab, line 1
RESET client_encoding;
DROP TABLE copy_encoding_tab;

@ -0,0 +1,8 @@
--
-- Test cases for encoding with COPY commands
--
-- skip test if not UTF8 server encoding
SELECT getdatabaseencoding() <> 'UTF8'
AS skip_test \gset
\if :skip_test
\quit

@ -36,7 +36,7 @@ test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comment
# execute two copy tests in parallel, to check that copy itself
# is concurrent safe.
# ----------
test: copy copyselect copydml insert insert_conflict
test: copy copyselect copydml copyencoding insert insert_conflict
# ----------
# More groups of parallel tests

@ -0,0 +1,53 @@
--
-- Test cases for encoding with COPY commands
--
-- skip test if not UTF8 server encoding
SELECT getdatabaseencoding() <> 'UTF8'
AS skip_test \gset
\if :skip_test
\quit
\endif
-- directory paths are passed to us in environment variables
\getenv abs_builddir PG_ABS_BUILDDIR
\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
CREATE TABLE copy_encoding_tab (t text);
-- Valid cases
-- Use ENCODING option
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
-- Read UTF8 data as LATIN1: no error
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
-- Use client_encoding
SET client_encoding TO UTF8;
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
-- Read UTF8 data as LATIN1: no error
SET client_encoding TO LATIN1;
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
RESET client_encoding;
-- Invalid cases
-- Use ENCODING explicitly
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
-- Read UTF8 data as EUC_JP: invalid byte sequence error
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
-- Use client_encoding
SET client_encoding TO UTF8;
-- U+3042 HIRAGANA LETTER A
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
-- Read UTF8 data as EUC_JP: invalid byte sequence error
SET client_encoding TO EUC_JP;
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
RESET client_encoding;
DROP TABLE copy_encoding_tab;
Loading…
Cancel
Save