Fix COPY FROM encoding error double-counting and enable SREH for transcoding

avamingli · avamingli · commit c5a298db737f · 2026-03-03T18:56:58.000+08:00
COPY FROM with SEGMENT REJECT LIMIT had two bugs when encountering invalid multi-byte encoding sequences: 1. Encoding errors were double-counted: HandleCopyError() incremented rejectcount, then RemoveInvalidDataInBuf() incremented it again for the same error. This caused the reject limit to be reached twice as fast as expected. 2. SREH (Single Row Error Handling) was completely disabled when transcoding was required (file encoding != database encoding). Any encoding error during transcoding would raise an ERROR instead of skipping the bad row. Fix by removing the duplicate rejectcount++ from RemoveInvalidDataInBuf(), removing the !need_transcoding guard that blocked SREH for transcoding, and adding proper buffer cleanup for the transcoding case (advance raw_buf past the bad line using FindEolInUnverifyRawBuf). Add regression tests covering both non-transcoding (invalid UTF-8) and transcoding (invalid EUC_CN to UTF-8) cases with various reject limits. Fixes #1425
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
@@ -649,8 +649,7 @@ CopyLoadInputBuf(CopyFromState cstate)
 		 */
 		if (cstate->input_reached_error)
 		{
-			/* so far, we only support no transcoding conversion error handling */
-			if (cstate->cdbsreh && !cstate->need_transcoding)
+			if (cstate->cdbsreh)
 			{
 				MemoryContext oldcontext = CurrentMemoryContext;
 				PG_TRY();
@@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 void
 RemoveInvalidDataInBuf(CopyFromState cstate)
 {
-	int nbytes;
 	int scanidx;
 
 	if (cstate->errMode == ALL_OR_NOTHING)
@@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 
 	if (!cstate->need_transcoding)
 	{
+		int nbytes;
+
 		/*
 		 * According to `BeginCopyFrom`, if not need_transcoding these two
 		 * pointer share one memory space.
@@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 			/* leave a hint to identify find eol after next raw page read */
 			cstate->find_eol_with_rawreading = true;
 		}
-
-		/* reset input buf, so we can redo conversion/verification */
-		cstate->input_reached_error = false;
-		cstate->input_buf_index = 0;
-		cstate->input_buf_len = 0;
-
-		/* reset line_buf */
-		resetStringInfo(&cstate->line_buf);
-		cstate->line_buf_valid = false;
-		cstate->cdbsreh->rejectcount++;
 	}
 	else
 	{
-		ereport(ERROR, (errmsg("Data validation error: since the source data "
-							   "need transcoding sreh can not handle yet.")));
+		/*
+		 * Transcoding case: raw_buf and input_buf are separate buffers.
+		 * Skip the bad line in raw_buf by finding the next EOL.  No need to
+		 * memmove raw_buf here; CopyLoadRawBuf() will compact it when more
+		 * raw data is needed.
+		 */
+		if (FindEolInUnverifyRawBuf(cstate, &scanidx))
+		{
+			cstate->raw_buf_index += scanidx;
+		}
+		else
+		{
+			/* Current page can not find eol, to skip current raw buffer */
+			cstate->raw_buf_len = 0;
+			cstate->raw_buf_index = 0;
+
+			/* leave a hint to identify find eol after next raw page read */
+			cstate->find_eol_with_rawreading = true;
+		}
 	}
+
+	/* reset input buf, so we can redo conversion/verification */
+	cstate->input_reached_error = false;
+	cstate->input_buf_index = 0;
+	cstate->input_buf_len = 0;
+
+	/* reset line_buf */
+	resetStringInfo(&cstate->line_buf);
+	cstate->line_buf_valid = false;
 }
 
 static bool
diff --git a/src/test/regress/data/copy_enc_err_euccn.data b/src/test/regress/data/copy_enc_err_euccn.data
@@ -0,0 +1,3 @@
+1|good1
+2|bad�
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_euccn_multi.data b/src/test/regress/data/copy_enc_err_euccn_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|bad�
+3|good3
+4|bad�
+5|good5
diff --git a/src/test/regress/data/copy_enc_err_utf8.data b/src/test/regress/data/copy_enc_err_utf8.data
@@ -0,0 +1,3 @@
+1|good1
+2|bad�
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_utf8_multi.data b/src/test/regress/data/copy_enc_err_utf8_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|bad�
+3|good3
+4|bad�
+5|good5
diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore
@@ -69,3 +69,4 @@
 /tag.out
 /ao_unique_index_partition.out
 /bfv_copy.out
+/copy_encoding_error.out
diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule
@@ -35,6 +35,7 @@ test: gp_dispatch_keepalives
 # copy command
 # copy form a file with different EOL
 test: copy_eol
+test: copy_encoding_error
 
 test: dedupset
 
diff --git a/src/test/regress/input/copy_encoding_error.source b/src/test/regress/input/copy_encoding_error.source
@@ -0,0 +1,107 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+SELECT gp_truncate_error_log('copy_enc_err');
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+SELECT * FROM copy_enc_err ORDER BY a;
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/output/copy_encoding_error.source b/src/test/regress/output/copy_encoding_error.source
@@ -0,0 +1,161 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected related input data
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected related input data
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+ERROR:  segment reject limit reached, aborting operation
+DETAIL:  Last error was: invalid byte sequence for encoding "UTF8": 0xfe
+CONTEXT:  COPY copy_enc_err, line 3
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected related input data
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected related input data
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore
@@ -63,3 +63,4 @@
 /tag.sql
 /ao_unique_index_partition.sql
 /bfv_copy.sql
+/copy_encoding_error.sql

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +1|good1
 +2|bad�
 +3|good3
 +4|bad�
 +5|good5