|
| 1 | +-- |
| 2 | +-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT. |
| 3 | +-- |
| 4 | +-- Regression test for https://github.com/apache/cloudberry/issues/1425 |
| 5 | +-- COPY FROM should correctly count encoding errors as single rejected rows, |
| 6 | +-- not double-count them. Also, encoding error SREH should work when |
| 7 | +-- transcoding is required. |
| 8 | +-- |
| 9 | +-- =================================================================== |
| 10 | +-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database) |
| 11 | +-- |
| 12 | +-- The file has 3 lines: |
| 13 | +-- line 1: valid |
| 14 | +-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline) |
| 15 | +-- line 3: valid |
| 16 | +-- |
| 17 | +-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row, |
| 18 | +-- and 1 < 2. Before the fix, the error was double-counted (counted as 2), |
| 19 | +-- which would cause the reject limit to be reached on the next error check. |
| 20 | +-- =================================================================== |
| 21 | +CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a); |
| 22 | +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|' |
| 23 | + LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; |
| 24 | +NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data |
| 25 | +-- Verify that valid rows (lines 1 and 3) were imported. |
| 26 | +SELECT * FROM copy_enc_err ORDER BY a; |
| 27 | + a | b |
| 28 | +---+------- |
| 29 | + 1 | good1 |
| 30 | + 3 | good3 |
| 31 | +(2 rows) |
| 32 | + |
| 33 | +-- Verify that exactly 1 error was logged (not 2). |
| 34 | +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); |
| 35 | + error_count |
| 36 | +------------- |
| 37 | + 1 |
| 38 | +(1 row) |
| 39 | + |
| 40 | +SELECT gp_truncate_error_log('copy_enc_err'); |
| 41 | + gp_truncate_error_log |
| 42 | +----------------------- |
| 43 | + t |
| 44 | +(1 row) |
| 45 | + |
| 46 | +TRUNCATE copy_enc_err; |
| 47 | +-- =================================================================== |
| 48 | +-- Test 2: Non-transcoding with multiple bad lines |
| 49 | +-- |
| 50 | +-- The file has 5 lines: lines 2 and 4 are bad. |
| 51 | +-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors. |
| 52 | +-- =================================================================== |
| 53 | +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|' |
| 54 | + LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS; |
| 55 | +NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data |
| 56 | +-- All 3 valid rows should be imported. |
| 57 | +SELECT * FROM copy_enc_err ORDER BY a; |
| 58 | + a | b |
| 59 | +---+------- |
| 60 | + 1 | good1 |
| 61 | + 3 | good3 |
| 62 | + 5 | good5 |
| 63 | +(3 rows) |
| 64 | + |
| 65 | +-- Exactly 2 errors should be logged. |
| 66 | +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); |
| 67 | + error_count |
| 68 | +------------- |
| 69 | + 2 |
| 70 | +(1 row) |
| 71 | + |
| 72 | +SELECT gp_truncate_error_log('copy_enc_err'); |
| 73 | + gp_truncate_error_log |
| 74 | +----------------------- |
| 75 | + t |
| 76 | +(1 row) |
| 77 | + |
| 78 | +TRUNCATE copy_enc_err; |
| 79 | +-- =================================================================== |
| 80 | +-- Test 3: Non-transcoding, reject limit reached correctly |
| 81 | +-- |
| 82 | +-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because |
| 83 | +-- rejectcount (2) >= rejectlimit (2). |
| 84 | +-- =================================================================== |
| 85 | +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|' |
| 86 | + LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; |
| 87 | +ERROR: segment reject limit reached, aborting operation |
| 88 | +DETAIL: Last error was: invalid byte sequence for encoding "UTF8": 0xfe |
| 89 | +CONTEXT: COPY copy_enc_err, line 3 |
| 90 | +SELECT gp_truncate_error_log('copy_enc_err'); |
| 91 | + gp_truncate_error_log |
| 92 | +----------------------- |
| 93 | + t |
| 94 | +(1 row) |
| 95 | + |
| 96 | +-- =================================================================== |
| 97 | +-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database) |
| 98 | +-- |
| 99 | +-- The file has 3 lines with data that claims to be EUC_CN: |
| 100 | +-- line 1: valid ASCII (valid in EUC_CN) |
| 101 | +-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows) |
| 102 | +-- line 3: valid ASCII (valid in EUC_CN) |
| 103 | +-- |
| 104 | +-- Before the fix, this would error with: |
| 105 | +-- "Data validation error: since the source data need transcoding |
| 106 | +-- sreh can not handle yet." |
| 107 | +-- After the fix, it should skip line 2 and import lines 1 and 3. |
| 108 | +-- =================================================================== |
| 109 | +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|' |
| 110 | + ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; |
| 111 | +NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data |
| 112 | +-- Valid rows should be imported. |
| 113 | +SELECT * FROM copy_enc_err ORDER BY a; |
| 114 | + a | b |
| 115 | +---+------- |
| 116 | + 1 | good1 |
| 117 | + 3 | good3 |
| 118 | +(2 rows) |
| 119 | + |
| 120 | +-- Exactly 1 error should be logged. |
| 121 | +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); |
| 122 | + error_count |
| 123 | +------------- |
| 124 | + 1 |
| 125 | +(1 row) |
| 126 | + |
| 127 | +SELECT gp_truncate_error_log('copy_enc_err'); |
| 128 | + gp_truncate_error_log |
| 129 | +----------------------- |
| 130 | + t |
| 131 | +(1 row) |
| 132 | + |
| 133 | +TRUNCATE copy_enc_err; |
| 134 | +-- =================================================================== |
| 135 | +-- Test 5: Transcoding with multiple bad lines |
| 136 | +-- =================================================================== |
| 137 | +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|' |
| 138 | + ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS; |
| 139 | +NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data |
| 140 | +SELECT * FROM copy_enc_err ORDER BY a; |
| 141 | + a | b |
| 142 | +---+------- |
| 143 | + 1 | good1 |
| 144 | + 3 | good3 |
| 145 | + 5 | good5 |
| 146 | +(3 rows) |
| 147 | + |
| 148 | +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); |
| 149 | + error_count |
| 150 | +------------- |
| 151 | + 2 |
| 152 | +(1 row) |
| 153 | + |
| 154 | +-- Cleanup |
| 155 | +SELECT gp_truncate_error_log('copy_enc_err'); |
| 156 | + gp_truncate_error_log |
| 157 | +----------------------- |
| 158 | + t |
| 159 | +(1 row) |
| 160 | + |
| 161 | +DROP TABLE copy_enc_err; |
0 commit comments