Skip to content
This repository was archived by the owner on Aug 8, 2025. It is now read-only.

Commit 1dc9cf5

Browse files
committed
Update validations to use the term "field" instead of "column"
1 parent a423065 commit 1dc9cf5

3 files changed

Lines changed: 68 additions & 58 deletions

File tree

docker/data/custom/validation/all/all-validation.yaml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,66 +15,66 @@ dataSources:
1515
errorThreshold: 200
1616
- preFilterExpr: "name == 'peter'"
1717
expr: "amount > 50"
18-
- groupByCols: ["account_id"]
18+
- groupByFields: ["account_id"]
1919
aggType: "count"
2020
aggExpr: "count == 1"
21-
- columnNameType: "columnCountEqual"
21+
- fieldNameType: "columnCountEqual"
2222
count: "3"
23-
- columnNameType: "columnCountBetween"
23+
- fieldNameType: "columnCountBetween"
2424
minCount: "1"
2525
maxCount: "2"
26-
- columnNameType: "columnNameMatchOrder"
26+
- fieldNameType: "columnNameMatchOrder"
2727
names: ["account_id", "amount", "name"]
28-
- columnNameType: "columnNameMatchSet"
28+
- fieldNameType: "columnNameMatchSet"
2929
names: ["account_id", "my_name"]
3030
- upstreamDataSource: "my_first_json"
3131
upstreamReadOptions: {}
32-
joinColumns: ["account_id"]
32+
joinFields: ["account_id"]
3333
joinType: "outer"
3434
validation:
3535
expr: "my_first_json_customer_details.name == name"
3636
- upstreamDataSource: "my_first_json"
3737
upstreamReadOptions: {}
38-
joinColumns: ["account_id"]
38+
joinFields: ["account_id"]
3939
joinType: "outer"
4040
validation:
4141
expr: "amount != my_first_json_balance"
4242
- upstreamDataSource: "my_first_json"
4343
upstreamReadOptions: {}
44-
joinColumns: ["expr:account_id == my_first_json_account_id"]
44+
joinFields: ["expr:account_id == my_first_json_account_id"]
4545
joinType: "outer"
4646
validation:
4747
groupByFields: ["account_id", "my_first_json_balance"]
4848
aggExpr: "sum(amount) BETWEEN my_first_json_balance * 0.8 AND my_first_json_balance * 1.2"
4949
- upstreamDataSource: "my_first_json"
5050
upstreamReadOptions: {}
51-
joinColumns: ["account_id"]
51+
joinFields: ["account_id"]
5252
joinType: "outer"
5353
validation:
5454
groupByFields: ["account_id", "my_first_json_balance"]
5555
aggExpr: "sum(amount) BETWEEN my_first_json_balance * 0.8 AND my_first_json_balance * 1.2"
5656
- upstreamDataSource: "my_first_json"
5757
upstreamReadOptions: {}
58-
joinColumns: ["account_id"]
58+
joinFields: ["account_id"]
5959
joinType: "anti"
6060
validation:
6161
aggType: "count"
6262
aggExpr: "count == 0"
6363
- upstreamDataSource: "my_first_json"
6464
upstreamReadOptions: {}
65-
joinColumns: ["account_id"]
65+
joinFields: ["account_id"]
6666
joinType: "outer"
6767
validation:
6868
aggType: "count"
6969
aggExpr: "count == 30"
7070
- upstreamDataSource: "my_first_json"
7171
upstreamReadOptions: {}
72-
joinColumns: ["account_id"]
72+
joinFields: ["account_id"]
7373
joinType: "outer"
7474
validation:
7575
upstreamDataSource: "my_third_json"
7676
upstreamReadOptions: {}
77-
joinColumns: ["account_id"]
77+
joinFields: ["account_id"]
7878
joinType: "outer"
7979
validation:
8080
aggType: "count"

misc/script/validate-yaml-with-json-schema.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,16 @@ json_schema_version=${JSON_SCHEMA_VERSION:-latest}
1111
num_failed_validation=0
1212

1313
echo "Checking if $json_schema_version JSON schema is valid"
14-
if ajv compile --spec=draft2019 -c ajv-formats -s "${script_dir}/../../schema/data-caterer-task-${json_schema_version}.json"; then
14+
if ajv compile --spec=draft2019 -c ajv-formats -s "${script_dir}/../../schema/data-caterer-${json_schema_version}.json"; then
1515
echo -e "${GREEN}Valid JSON schema${NC}"
1616
else
1717
echo -e "${RED}Invalid JSON schema, exiting${NC}"
1818
exit 1
1919
fi
2020

2121
echo -e "Validating YAML files based on ${json_schema_version} JSON schema"
22-
for file in docker/data/custom/task/*/*/*.yaml docker/data/custom/task/*/*.yaml; do
23-
if ajv validate --all-errors --spec=draft2019 -c ajv-formats -s "${script_dir}/../../schema/data-caterer-task-${json_schema_version}.json" -d "${script_dir}/../../${file}"; then
22+
for file in docker/data/custom/task/*/*/*.yaml docker/data/custom/task/*/*.yaml docker/data/custom/validation/*.yaml docker/data/custom/validation/*/*.yaml; do
23+
if ajv validate --all-errors --spec=draft2019 -c ajv-formats -s "${script_dir}/../../schema/data-caterer-${json_schema_version}.json" -d "${script_dir}/../../${file}"; then
2424
echo -e "${GREEN}Passed validation, file=${file}${NC}"
2525
else
2626
num_failed_validation=$((num_failed_validation+1))

schema/data-caterer-latest.json

Lines changed: 52 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
"$ref": "#/$defs/DataCatererValidation"
1515
}
1616
],
17-
"additionalProperties": false,
1817
"$defs": {
1918
"DataCatererPlan": {
2019
"type": "object",
@@ -76,7 +75,9 @@
7675
"$ref": "#/$defs/ForeignKeyRelation"
7776
}
7877
}
79-
}
78+
},
79+
"required": ["source"],
80+
"additionalProperties": false
8081
}
8182
},
8283
"seed": {
@@ -156,22 +157,22 @@
156157
},
157158
"perField": {
158159
"type": "object",
159-
"description": "Generate records per unique set of column values. For example, generate 5 transactions for each 'account_number'.",
160+
"description": "Generate records per unique set of field values. For example, generate 5 transactions for each 'account_number'.",
160161
"properties": {
161162
"fieldNames": {
162163
"type": "array",
163-
"description": "Array of column names to generate records for each set of unique values.",
164+
"description": "Array of field names to generate records for each set of unique values.",
164165
"items": {
165166
"type": "string",
166-
"description": "Column name found within schema fields."
167+
"description": "Field name found within schema fields."
167168
}
168169
},
169170
"count": {
170171
"type": "integer",
171-
"description": "Static number of records per unique set of column values."
172+
"description": "Static number of records per unique set of field values."
172173
},
173174
"options": {
174-
"description": "Random number of records per unique set of column values.",
175+
"description": "Random number of records per unique set of field values.",
175176
"$ref": "#/$defs/CountGeneratorOptions"
176177
}
177178
},
@@ -272,7 +273,7 @@
272273
},
273274
"partitionBy": {
274275
"type": "string",
275-
"description": "Column name(s) to partition dataset by (comma separated)."
276+
"description": "Field name(s) to partition dataset by (comma separated)."
276277
}
277278
},
278279
"required": ["path"]
@@ -539,8 +540,7 @@
539540
"unevaluatedProperties": false
540541
}
541542
}
542-
},
543-
"additionalProperties": false
543+
}
544544
},
545545
"DataCatererValidation": {
546546
"type": "object",
@@ -589,7 +589,7 @@
589589
"$ref": "#/$defs/UpstreamValidation"
590590
},
591591
{
592-
"$ref": "#/$defs/ColumnNameValidation"
592+
"$ref": "#/$defs/FieldNameValidation"
593593
}
594594
]
595595
}
@@ -612,7 +612,8 @@
612612
"type": "array",
613613
"description": "Name of the field(s) to relate with."
614614
}
615-
}
615+
},
616+
"additionalProperties": false
616617
},
617618
"EnvironmentVariables": {
618619
"type": "object",
@@ -714,6 +715,15 @@
714715
"type": "number",
715716
"description": "Mean of generated values.",
716717
"default": 500.0
718+
},
719+
"distribution": {
720+
"type": "string",
721+
"description": "Type of distribution values follow.",
722+
"enum": ["normal", "exponential"]
723+
},
724+
"distributionRateParam": {
725+
"type": "number",
726+
"description": "If distribution is `exponential`, rate parameter to adjust exponential distribution."
717727
}
718728
},
719729
"allOf": [
@@ -1097,27 +1107,27 @@
10971107
},
10981108
"ValidationSelectExpression": {
10991109
"const": "selectExpr",
1100-
"description": "SQL expression used to apply to columns before running validations."
1110+
"description": "SQL expression used to apply to fields before running validations."
11011111
},
11021112
"ValidationAggregateExpression": {
11031113
"const": "aggExpr",
11041114
"description": "SQL expression to define validation after aggregation. Check https://spark.apache.org/docs/latest/api/sql for reference."
11051115
},
1106-
"ValidationAggregateColumn": {
1107-
"const": "aggCol",
1108-
"description": "Column name to run aggregation on."
1116+
"ValidationAggregateField": {
1117+
"const": "aggField",
1118+
"description": "Field name to run aggregation on."
11091119
},
11101120
"ValidationAggregateType": {
11111121
"const": "aggType",
11121122
"description": "Type of aggregation to run. Can be either: sum, avg, max, min, mean, stddev, count."
11131123
},
1114-
"ValidationAggregateGroupByColumns": {
1115-
"const": "groupByCols",
1116-
"description": "Column names to group by for aggregation calculation."
1124+
"ValidationAggregateGroupByFields": {
1125+
"const": "groupByFields",
1126+
"description": "Field names to group by for aggregation calculation."
11171127
},
1118-
"ValidationUpstreamJoinColumns": {
1119-
"const": "joinColumns",
1120-
"description": "Column names to run join operation on."
1128+
"ValidationUpstreamJoinFields": {
1129+
"const": "joinFields",
1130+
"description": "Field names to run join operation on."
11211131
},
11221132
"ValidationUpstreamJoinType": {
11231133
"const": "joinType",
@@ -1135,25 +1145,25 @@
11351145
"const": "validation",
11361146
"description": "Validation to run after join with upstream data source."
11371147
},
1138-
"ValidationColumnNameType": {
1139-
"const": "columnNameType",
1140-
"description": "Type of column name validation to run."
1148+
"ValidationFieldNameType": {
1149+
"const": "fieldNameType",
1150+
"description": "Type of field name validation to run."
11411151
},
1142-
"ValidationColumnNameCount": {
1152+
"ValidationFieldNameCount": {
11431153
"const": "count",
1144-
"description": "Expected number of column names."
1154+
"description": "Expected number of field names."
11451155
},
1146-
"ValidationColumnNameMaxCount": {
1156+
"ValidationFieldNameMaxCount": {
11471157
"const": "maxCount",
1148-
"description": "Maximum number of column names."
1158+
"description": "Maximum number of field names."
11491159
},
1150-
"ValidationColumnNameMinCount": {
1160+
"ValidationFieldNameMinCount": {
11511161
"const": "minCount",
1152-
"description": "Minimum number of column names."
1162+
"description": "Minimum number of field names."
11531163
},
1154-
"ValidationColumnNames": {
1164+
"ValidationFieldNames": {
11551165
"const": "names",
1156-
"description": "Expected column names."
1166+
"description": "Expected field names."
11571167
},
11581168
"BasicValidation": {
11591169
"type": "object",
@@ -1191,13 +1201,13 @@
11911201
"$ref": "#/$defs/ValidationPreFilterExpression"
11921202
},
11931203
{
1194-
"$ref": "#/$defs/ValidationAggregateColumn"
1204+
"$ref": "#/$defs/ValidationAggregateField"
11951205
},
11961206
{
11971207
"$ref": "#/$defs/ValidationAggregateExpression"
11981208
},
11991209
{
1200-
"$ref": "#/$defs/ValidationAggregateGroupByColumns"
1210+
"$ref": "#/$defs/ValidationAggregateGroupByFields"
12011211
},
12021212
{
12031213
"$ref": "#/$defs/ValidationAggregateType"
@@ -1222,7 +1232,7 @@
12221232
"$ref": "#/$defs/ValidationUpstreamDataSource"
12231233
},
12241234
{
1225-
"$ref": "#/$defs/ValidationUpstreamJoinColumns"
1235+
"$ref": "#/$defs/ValidationUpstreamJoinFields"
12261236
},
12271237
{
12281238
"$ref": "#/$defs/ValidationUpstreamJoinType"
@@ -1236,7 +1246,7 @@
12361246
]
12371247
}
12381248
},
1239-
"ColumnNameValidation": {
1249+
"FieldNameValidation": {
12401250
"type": "object",
12411251
"propertyNames": {
12421252
"anyOf": [
@@ -1250,19 +1260,19 @@
12501260
"$ref": "#/$defs/ValidationPreFilterExpression"
12511261
},
12521262
{
1253-
"$ref": "#/$defs/ValidationColumnNameCount"
1263+
"$ref": "#/$defs/ValidationFieldNameCount"
12541264
},
12551265
{
1256-
"$ref": "#/$defs/ValidationColumnNameMaxCount"
1266+
"$ref": "#/$defs/ValidationFieldNameMaxCount"
12571267
},
12581268
{
1259-
"$ref": "#/$defs/ValidationColumnNameMinCount"
1269+
"$ref": "#/$defs/ValidationFieldNameMinCount"
12601270
},
12611271
{
1262-
"$ref": "#/$defs/ValidationColumnNames"
1272+
"$ref": "#/$defs/ValidationFieldNames"
12631273
},
12641274
{
1265-
"$ref": "#/$defs/ValidationColumnNameType"
1275+
"$ref": "#/$defs/ValidationFieldNameType"
12661276
}
12671277
]
12681278
}

0 commit comments

Comments
 (0)