Skip to content

Commit 8cb7e01

Browse files
authored
Merge pull request JanKaul#163 from splitgraph/remove-distinct-counts
fix: remove `distinct_counts` manifest field
2 parents 4025cbc + d225517 commit 8cb7e01

3 files changed

Lines changed: 1 addition & 76 deletions

File tree

datafusion_iceberg/src/statistics.rs

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -126,12 +126,7 @@ fn column_statistics<'a>(
126126
))
127127
})
128128
.unwrap_or(Precision::Absent),
129-
distinct_count: data_file
130-
.distinct_counts()
131-
.as_ref()
132-
.and_then(|x| x.get(&id))
133-
.map(|x| Precision::Exact(*x as usize))
134-
.unwrap_or(Precision::Absent),
129+
distinct_count: Precision::Absent,
135130
sum_value: Precision::Absent,
136131
}
137132
})

iceberg-rust-spec/src/spec/manifest.rs

Lines changed: 0 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -539,8 +539,6 @@ pub struct DataFile {
539539
null_value_counts: Option<AvroMap<i64>>,
540540
/// Map from column id to number of NaN values
541541
nan_value_counts: Option<AvroMap<i64>>,
542-
/// Map from column id to number of distinct values in the column.
543-
distinct_counts: Option<AvroMap<i64>>,
544542
/// Map from column id to lower bound in the column
545543
lower_bounds: Option<HashMap<i32, Value>>,
546544
/// Map from column id to upper bound in the column
@@ -584,7 +582,6 @@ impl DataFile {
584582
value_counts: value.value_counts,
585583
null_value_counts: value.null_value_counts,
586584
nan_value_counts: value.nan_value_counts,
587-
distinct_counts: value.distinct_counts,
588585
lower_bounds: value
589586
.lower_bounds
590587
.map(|map| map.into_value_map(schema.fields()))
@@ -618,7 +615,6 @@ impl DataFile {
618615
value_counts: value.value_counts,
619616
null_value_counts: value.null_value_counts,
620617
nan_value_counts: value.nan_value_counts,
621-
distinct_counts: value.distinct_counts,
622618
lower_bounds: value
623619
.lower_bounds
624620
.map(|map| map.into_value_map(schema.fields()))
@@ -658,8 +654,6 @@ pub struct DataFileV2 {
658654
pub null_value_counts: Option<AvroMap<i64>>,
659655
/// Map from column id to number of NaN values
660656
pub nan_value_counts: Option<AvroMap<i64>>,
661-
/// Map from column id to number of distinct values in the column.
662-
pub distinct_counts: Option<AvroMap<i64>>,
663657
/// Map from column id to lower bound in the column
664658
pub lower_bounds: Option<AvroMap<ByteBuf>>,
665659
/// Map from column id to upper bound in the column
@@ -701,8 +695,6 @@ pub struct DataFileV1 {
701695
pub null_value_counts: Option<AvroMap<i64>>,
702696
/// Map from column id to number of NaN values
703697
pub nan_value_counts: Option<AvroMap<i64>>,
704-
/// Map from column id to number of distinct values in the column.
705-
pub distinct_counts: Option<AvroMap<i64>>,
706698
/// Map from column id to lower bound in the column
707699
pub lower_bounds: Option<AvroMap<ByteBuf>>,
708700
/// Map from column id to upper bound in the column
@@ -728,7 +720,6 @@ impl From<DataFile> for DataFileV2 {
728720
value_counts: value.value_counts,
729721
null_value_counts: value.null_value_counts,
730722
nan_value_counts: value.nan_value_counts,
731-
distinct_counts: value.distinct_counts,
732723
lower_bounds: value.lower_bounds.map(Into::into),
733724
upper_bounds: value.upper_bounds.map(Into::into),
734725
key_metadata: value.key_metadata,
@@ -751,7 +742,6 @@ impl From<DataFile> for DataFileV1 {
751742
value_counts: value.value_counts,
752743
null_value_counts: value.null_value_counts,
753744
nan_value_counts: value.nan_value_counts,
754-
distinct_counts: value.distinct_counts,
755745
lower_bounds: value.lower_bounds.map(Into::into),
756746
upper_bounds: value.upper_bounds.map(Into::into),
757747
key_metadata: value.key_metadata,
@@ -777,7 +767,6 @@ impl From<DataFileV1> for DataFileV2 {
777767
value_counts: v1.value_counts,
778768
null_value_counts: v1.null_value_counts,
779769
nan_value_counts: v1.nan_value_counts,
780-
distinct_counts: v1.distinct_counts,
781770
lower_bounds: v1.lower_bounds,
782771
upper_bounds: v1.upper_bounds,
783772
key_metadata: v1.key_metadata,
@@ -962,34 +951,6 @@ impl DataFileV1 {
962951
"default": null,
963952
"field-id": 137
964953
},
965-
{
966-
"name": "distinct_counts",
967-
"type": [
968-
"null",
969-
{
970-
"type": "array",
971-
"logicalType": "map",
972-
"items": {
973-
"type": "record",
974-
"name": "k123_v124",
975-
"fields": [
976-
{
977-
"name": "key",
978-
"type": "int",
979-
"field-id": 123
980-
},
981-
{
982-
"name": "value",
983-
"type": "long",
984-
"field-id": 124
985-
}
986-
]
987-
}
988-
}
989-
],
990-
"default": null,
991-
"field-id": 111
992-
},
993954
{
994955
"name": "lower_bounds",
995956
"type": [
@@ -1234,34 +1195,6 @@ impl DataFileV2 {
12341195
"default": null,
12351196
"field-id": 137
12361197
},
1237-
{
1238-
"name": "distinct_counts",
1239-
"type": [
1240-
"null",
1241-
{
1242-
"type": "array",
1243-
"logicalType": "map",
1244-
"items": {
1245-
"type": "record",
1246-
"name": "k123_v124",
1247-
"fields": [
1248-
{
1249-
"name": "key",
1250-
"type": "int",
1251-
"field-id": 123
1252-
},
1253-
{
1254-
"name": "value",
1255-
"type": "long",
1256-
"field-id": 124
1257-
}
1258-
]
1259-
}
1260-
}
1261-
],
1262-
"default": null,
1263-
"field-id": 111
1264-
},
12651198
{
12661199
"name": "lower_bounds",
12671200
"type": [
@@ -1424,7 +1357,6 @@ mod tests {
14241357
value_counts: None,
14251358
null_value_counts: None,
14261359
nan_value_counts: None,
1427-
distinct_counts: None,
14281360
lower_bounds: Some(HashMap::from_iter(vec![(0, Value::Date(0))])),
14291361
upper_bounds: None,
14301362
key_metadata: None,
@@ -1545,7 +1477,6 @@ mod tests {
15451477
value_counts: None,
15461478
null_value_counts: None,
15471479
nan_value_counts: None,
1548-
distinct_counts: None,
15491480
lower_bounds: Some(HashMap::from_iter(vec![(0, Value::Date(0))])),
15501481
upper_bounds: None,
15511482
key_metadata: None,

iceberg-rust/src/file_format/parquet.rs

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -250,7 +250,6 @@ pub fn parquet_to_datafile(
250250
.with_value_counts(Some(value_counts))
251251
.with_null_value_counts(Some(null_value_counts))
252252
.with_nan_value_counts(None)
253-
.with_distinct_counts(Some(distinct_counts))
254253
.with_lower_bounds(Some(lower_bounds))
255254
.with_upper_bounds(Some(upper_bounds));
256255

0 commit comments

Comments
 (0)