Skip to content

Commit c5a298d

Browse files
committed
Fix COPY FROM encoding error double-counting and enable SREH for transcoding
COPY FROM with SEGMENT REJECT LIMIT had two bugs when encountering invalid multi-byte encoding sequences:

1. Encoding errors were double-counted: HandleCopyError() incremented rejectcount, then RemoveInvalidDataInBuf() incremented it again for the same error. This caused the reject limit to be reached twice as fast as expected.

2. SREH (Single Row Error Handling) was completely disabled when transcoding was required (file encoding != database encoding). Any encoding error during transcoding would raise an ERROR instead of skipping the bad row.

Fix by removing the duplicate rejectcount++ from RemoveInvalidDataInBuf(), removing the !need_transcoding guard that blocked SREH for transcoding, and adding proper buffer cleanup for the transcoding case (advance raw_buf past the bad line using FindEolInUnverifyRawBuf).

Add regression tests covering both non-transcoding (invalid UTF-8) and transcoding (invalid EUC_CN to UTF-8) cases with various reject limits.

Fixes #1425
1 parent cce58a8 commit c5a298d

10 files changed

Lines changed: 318 additions & 15 deletions

src/backend/commands/copyfromparse.c

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -649,8 +649,7 @@ CopyLoadInputBuf(CopyFromState cstate)
649649
*/
650650
if (cstate->input_reached_error)
651651
{
652-
/* so far, we only support no transcoding conversion error handling */
653-
if (cstate->cdbsreh && !cstate->need_transcoding)
652+
if (cstate->cdbsreh)
654653
{
655654
MemoryContext oldcontext = CurrentMemoryContext;
656655
PG_TRY();
@@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
17881787
void
17891788
RemoveInvalidDataInBuf(CopyFromState cstate)
17901789
{
1791-
int nbytes;
17921790
int scanidx;
17931791

17941792
if (cstate->errMode == ALL_OR_NOTHING)
@@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
18001798

18011799
if (!cstate->need_transcoding)
18021800
{
1801+
int nbytes;
1802+
18031803
/*
18041804
* According to `BeginCopyFrom`, if not need_transcoding these two
18051805
* pointer share one memory space.
@@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
18261826
/* leave a hint to identify find eol after next raw page read */
18271827
cstate->find_eol_with_rawreading = true;
18281828
}
1829-
1830-
/* reset input buf, so we can redo conversion/verification */
1831-
cstate->input_reached_error = false;
1832-
cstate->input_buf_index = 0;
1833-
cstate->input_buf_len = 0;
1834-
1835-
/* reset line_buf */
1836-
resetStringInfo(&cstate->line_buf);
1837-
cstate->line_buf_valid = false;
1838-
cstate->cdbsreh->rejectcount++;
18391829
}
18401830
else
18411831
{
1842-
ereport(ERROR, (errmsg("Data validation error: since the source data "
1843-
"need transcoding sreh can not handle yet.")));
1832+
/*
1833+
* Transcoding case: raw_buf and input_buf are separate buffers.
1834+
* Skip the bad line in raw_buf by finding the next EOL. No need to
1835+
* memmove raw_buf here; CopyLoadRawBuf() will compact it when more
1836+
* raw data is needed.
1837+
*/
1838+
if (FindEolInUnverifyRawBuf(cstate, &scanidx))
1839+
{
1840+
cstate->raw_buf_index += scanidx;
1841+
}
1842+
else
1843+
{
1844+
/* Current page can not find eol, to skip current raw buffer */
1845+
cstate->raw_buf_len = 0;
1846+
cstate->raw_buf_index = 0;
1847+
1848+
/* leave a hint to identify find eol after next raw page read */
1849+
cstate->find_eol_with_rawreading = true;
1850+
}
18441851
}
1852+
1853+
/* reset input buf, so we can redo conversion/verification */
1854+
cstate->input_reached_error = false;
1855+
cstate->input_buf_index = 0;
1856+
cstate->input_buf_len = 0;
1857+
1858+
/* reset line_buf */
1859+
resetStringInfo(&cstate->line_buf);
1860+
cstate->line_buf_valid = false;
18451861
}
18461862

18471863
static bool
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
1|good1
2+
2|bad�
3+
3|good3
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
1|good1
2+
2|bad�
3+
3|good3
4+
4|bad�
5+
5|good5
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
1|good1
2+
2|bad�
3+
3|good3
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
1|good1
2+
2|bad�
3+
3|good3
4+
4|bad�
5+
5|good5

src/test/regress/expected/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,4 @@
6969
/tag.out
7070
/ao_unique_index_partition.out
7171
/bfv_copy.out
72+
/copy_encoding_error.out

src/test/regress/greenplum_schedule

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ test: gp_dispatch_keepalives
3535
# copy command
3636
# copy from a file with different EOL
3737
test: copy_eol
38+
test: copy_encoding_error
3839

3940
test: dedupset
4041

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
--
2+
-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
3+
--
4+
-- Regression test for https://github.com/apache/cloudberry/issues/1425
5+
-- COPY FROM should correctly count encoding errors as single rejected rows,
6+
-- not double-count them. Also, encoding error SREH should work when
7+
-- transcoding is required.
8+
--
9+
10+
-- ===================================================================
11+
-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
12+
--
13+
-- The file has 3 lines:
14+
-- line 1: valid
15+
-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
16+
-- line 3: valid
17+
--
18+
-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
19+
-- and 1 < 2. Before the fix, the error was double-counted (counted as 2),
20+
-- which would cause the reject limit to be reached on the next error check.
21+
-- ===================================================================
22+
23+
CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
24+
25+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
26+
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
27+
28+
-- Verify that valid rows (lines 1 and 3) were imported.
29+
SELECT * FROM copy_enc_err ORDER BY a;
30+
31+
-- Verify that exactly 1 error was logged (not 2).
32+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
33+
34+
SELECT gp_truncate_error_log('copy_enc_err');
35+
TRUNCATE copy_enc_err;
36+
37+
-- ===================================================================
38+
-- Test 2: Non-transcoding with multiple bad lines
39+
--
40+
-- The file has 5 lines: lines 2 and 4 are bad.
41+
-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
42+
-- ===================================================================
43+
44+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
45+
LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
46+
47+
-- All 3 valid rows should be imported.
48+
SELECT * FROM copy_enc_err ORDER BY a;
49+
50+
-- Exactly 2 errors should be logged.
51+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
52+
53+
SELECT gp_truncate_error_log('copy_enc_err');
54+
TRUNCATE copy_enc_err;
55+
56+
-- ===================================================================
57+
-- Test 3: Non-transcoding, reject limit reached correctly
58+
--
59+
-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
60+
-- rejectcount (2) >= rejectlimit (2).
61+
-- ===================================================================
62+
63+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
64+
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
65+
66+
SELECT gp_truncate_error_log('copy_enc_err');
67+
68+
-- ===================================================================
69+
-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
70+
--
71+
-- The file has 3 lines with data that claims to be EUC_CN:
72+
-- line 1: valid ASCII (valid in EUC_CN)
73+
-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
74+
-- line 3: valid ASCII (valid in EUC_CN)
75+
--
76+
-- Before the fix, this would error with:
77+
-- "Data validation error: since the source data need transcoding
78+
-- sreh can not handle yet."
79+
-- After the fix, it should skip line 2 and import lines 1 and 3.
80+
-- ===================================================================
81+
82+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
83+
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
84+
85+
-- Valid rows should be imported.
86+
SELECT * FROM copy_enc_err ORDER BY a;
87+
88+
-- Exactly 1 error should be logged.
89+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
90+
91+
SELECT gp_truncate_error_log('copy_enc_err');
92+
TRUNCATE copy_enc_err;
93+
94+
-- ===================================================================
95+
-- Test 5: Transcoding with multiple bad lines
96+
-- ===================================================================
97+
98+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
99+
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
100+
101+
SELECT * FROM copy_enc_err ORDER BY a;
102+
103+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
104+
105+
-- Cleanup
106+
SELECT gp_truncate_error_log('copy_enc_err');
107+
DROP TABLE copy_enc_err;
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
--
2+
-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
3+
--
4+
-- Regression test for https://github.com/apache/cloudberry/issues/1425
5+
-- COPY FROM should correctly count encoding errors as single rejected rows,
6+
-- not double-count them. Also, encoding error SREH should work when
7+
-- transcoding is required.
8+
--
9+
-- ===================================================================
10+
-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
11+
--
12+
-- The file has 3 lines:
13+
-- line 1: valid
14+
-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
15+
-- line 3: valid
16+
--
17+
-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
18+
-- and 1 < 2. Before the fix, the error was double-counted (counted as 2),
19+
-- which would cause the reject limit to be reached on the next error check.
20+
-- ===================================================================
21+
CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
22+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
23+
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
24+
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
25+
-- Verify that valid rows (lines 1 and 3) were imported.
26+
SELECT * FROM copy_enc_err ORDER BY a;
27+
a | b
28+
---+-------
29+
1 | good1
30+
3 | good3
31+
(2 rows)
32+
33+
-- Verify that exactly 1 error was logged (not 2).
34+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
35+
error_count
36+
-------------
37+
1
38+
(1 row)
39+
40+
SELECT gp_truncate_error_log('copy_enc_err');
41+
gp_truncate_error_log
42+
-----------------------
43+
t
44+
(1 row)
45+
46+
TRUNCATE copy_enc_err;
47+
-- ===================================================================
48+
-- Test 2: Non-transcoding with multiple bad lines
49+
--
50+
-- The file has 5 lines: lines 2 and 4 are bad.
51+
-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
52+
-- ===================================================================
53+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
54+
LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
55+
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
56+
-- All 3 valid rows should be imported.
57+
SELECT * FROM copy_enc_err ORDER BY a;
58+
a | b
59+
---+-------
60+
1 | good1
61+
3 | good3
62+
5 | good5
63+
(3 rows)
64+
65+
-- Exactly 2 errors should be logged.
66+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
67+
error_count
68+
-------------
69+
2
70+
(1 row)
71+
72+
SELECT gp_truncate_error_log('copy_enc_err');
73+
gp_truncate_error_log
74+
-----------------------
75+
t
76+
(1 row)
77+
78+
TRUNCATE copy_enc_err;
79+
-- ===================================================================
80+
-- Test 3: Non-transcoding, reject limit reached correctly
81+
--
82+
-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
83+
-- rejectcount (2) >= rejectlimit (2).
84+
-- ===================================================================
85+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
86+
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
87+
ERROR: segment reject limit reached, aborting operation
88+
DETAIL: Last error was: invalid byte sequence for encoding "UTF8": 0xfe
89+
CONTEXT: COPY copy_enc_err, line 3
90+
SELECT gp_truncate_error_log('copy_enc_err');
91+
gp_truncate_error_log
92+
-----------------------
93+
t
94+
(1 row)
95+
96+
-- ===================================================================
97+
-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
98+
--
99+
-- The file has 3 lines with data that claims to be EUC_CN:
100+
-- line 1: valid ASCII (valid in EUC_CN)
101+
-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
102+
-- line 3: valid ASCII (valid in EUC_CN)
103+
--
104+
-- Before the fix, this would error with:
105+
-- "Data validation error: since the source data need transcoding
106+
-- sreh can not handle yet."
107+
-- After the fix, it should skip line 2 and import lines 1 and 3.
108+
-- ===================================================================
109+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
110+
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
111+
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
112+
-- Valid rows should be imported.
113+
SELECT * FROM copy_enc_err ORDER BY a;
114+
a | b
115+
---+-------
116+
1 | good1
117+
3 | good3
118+
(2 rows)
119+
120+
-- Exactly 1 error should be logged.
121+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
122+
error_count
123+
-------------
124+
1
125+
(1 row)
126+
127+
SELECT gp_truncate_error_log('copy_enc_err');
128+
gp_truncate_error_log
129+
-----------------------
130+
t
131+
(1 row)
132+
133+
TRUNCATE copy_enc_err;
134+
-- ===================================================================
135+
-- Test 5: Transcoding with multiple bad lines
136+
-- ===================================================================
137+
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
138+
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
139+
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
140+
SELECT * FROM copy_enc_err ORDER BY a;
141+
a | b
142+
---+-------
143+
1 | good1
144+
3 | good3
145+
5 | good5
146+
(3 rows)
147+
148+
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
149+
error_count
150+
-------------
151+
2
152+
(1 row)
153+
154+
-- Cleanup
155+
SELECT gp_truncate_error_log('copy_enc_err');
156+
gp_truncate_error_log
157+
-----------------------
158+
t
159+
(1 row)
160+
161+
DROP TABLE copy_enc_err;

src/test/regress/sql/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,4 @@
6363
/tag.sql
6464
/ao_unique_index_partition.sql
6565
/bfv_copy.sql
66+
/copy_encoding_error.sql

0 commit comments

Comments
 (0)