Skip to content

Commit 3245120

Browse files
author
haileyajohnson
authored
Merge pull request #886 from haileyajohnson/zarr-filters
[5.x] add filters for zarr
2 parents 36ed523 + ed807ac commit 3245120

8 files changed

Lines changed: 174 additions & 72 deletions

File tree

cdm/core/src/main/java/ucar/nc2/filter/Checksum32.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ public Checksum32(CType type, ByteOrder bo) {
3333
this.byteOrder = bo;
3434
}
3535

36+
3637
public Checksum32(CType type) {
37-
this(type, ByteOrder.BIG_ENDIAN);
38+
// TODO: can we do this better?
39+
this(type, ByteOrder.LITTLE_ENDIAN);
3840
}
3941

4042
@Override

cdm/core/src/main/java/ucar/nc2/iosp/hdf5/DataBTree.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ DataChunk next() throws IOException {
297297
public class DataChunk {
298298
public final int size; // size of chunk in bytes; need storage layout dimensions to interpret
299299
public final int filterMask; // bitfield indicating which filters have been skipped for this chunk
300-
public final int[] offset; // offset index of this chunk, reletive to entire array
300+
public final int[] offset; // offset index of this chunk, relative to entire array
301301
public final long filePos; // filePos of a single raw data chunk, already shifted by the offset if needed
302302

303303
DataChunk(int ndim, boolean last) throws IOException {

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZArray.java

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
1010
import com.fasterxml.jackson.databind.node.ArrayNode;
1111
import ucar.ma2.DataType;
12+
import ucar.nc2.filter.Filter;
13+
import ucar.nc2.filter.Filters;
14+
import ucar.nc2.filter.UnknownFilterException;
1215

1316
import java.io.IOException;
1417
import java.nio.ByteOrder;
@@ -63,14 +66,14 @@ public enum Order {
6366
private final Object fillValue;
6467
private final DataType datatype;
6568
private final String dtype;
66-
private final ZarrFilter compressor;
69+
private final Filter compressor;
6770
private final ByteOrder byteOrder;
6871
private final Order order;
69-
private final List<ZarrFilter> filters;
72+
private final List<Filter> filters;
7073
private final String separator;
7174

72-
public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, ZarrFilter compressor, String order,
73-
List<ZarrFilter> filters, String separator) throws ZarrFormatException {
75+
public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter compressor, String order,
76+
List<Filter> filters, String separator) throws ZarrFormatException {
7477
this.shape = shape;
7578
this.chunks = chunks;
7679
this.fillValue = fill_value;
@@ -91,11 +94,11 @@ public int[] getChunks() {
9194
return this.chunks;
9295
}
9396

94-
public ZarrFilter getCompressor() {
97+
public Filter getCompressor() {
9598
return this.compressor;
9699
}
97100

98-
public List<ZarrFilter> getFilters() {
101+
public List<Filter> getFilters() {
99102
return this.filters;
100103
}
101104

@@ -192,21 +195,24 @@ public ZArray deserialize(JsonParser p, DeserializationContext ctxt) throws IOEx
192195
TreeNode dim_sep = root.path(ZarrKeys.DIMENSION_SEPARATOR);
193196
String delimiter = dim_sep.isMissingNode() ? DEFAULT_SEPARATOR : ((JsonNode) dim_sep).asText();
194197

195-
Map<String, Object> compBean = codec.readValue(root.path(ZarrKeys.COMPRESSOR).traverse(codec), HashMap.class);
198+
// Filters and compressor
199+
try {
200+
Map<String, Object> compBean = codec.readValue(root.path(ZarrKeys.COMPRESSOR).traverse(codec), HashMap.class);
196201

197-
ZarrFilter compressor;
198-
if (compBean == null) {
199-
compressor = null;
200-
} else {
201-
compressor = null; // TODO: implement compressors
202-
}
202+
Filter compressor = Filters.getFilterByName(compBean);
203203

204-
// TODO: filters
205-
List<ZarrFilter> filters = null;
204+
List<Filter> filters = new ArrayList<>();
206205

207-
try {
206+
Map<String, Object>[] filtersBean =
207+
codec.readValue(root.path(ZarrKeys.FILTERS).traverse(codec), HashMap[].class);
208+
209+
if (filtersBean != null) {
210+
for (Map<String, Object> bean : filtersBean) {
211+
filters.add(Filters.getFilterByName(bean));
212+
}
213+
}
208214
return new ZArray(shape, chunks, fill, dtype, compressor, order, filters, delimiter);
209-
} catch (ZarrFormatException ex) {
215+
} catch (UnknownFilterException | ZarrFormatException ex) {
210216
throw new IOException(ex.getMessage(), ex.getCause());
211217
}
212218
}

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrFilter.java

Lines changed: 0 additions & 13 deletions
This file was deleted.

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrHeader.java

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import ucar.nc2.Dimension;
1111
import ucar.nc2.Group;
1212
import ucar.nc2.Variable;
13+
import ucar.nc2.filter.Filter;
1314
import ucar.unidata.io.RandomAccessFile;
1415
import ucar.unidata.io.zarr.RandomAccessDirectory;
1516
import ucar.unidata.io.zarr.RandomAccessDirectoryItem;
@@ -42,7 +43,7 @@ public ZarrHeader(RandomAccessDirectory raf, Group.Builder rootGroup) {
4243
private class DelayedVarMaker {
4344
private RandomAccessDirectoryItem var;
4445
private ZArray zarray;
45-
private Set<Integer> initializedChunks; // track any uninitialized chunks for var
46+
private Map<Integer, Long> initializedChunks; // initialized chunks for var, mapped to the length of each chunk item
4647
private List<Attribute> attrs; // list of variable attributes
4748
private long dataOffset; // byte position where data starts
4849

@@ -53,7 +54,7 @@ void setAttrs(List<Attribute> attrs) {
5354
void setVar(RandomAccessDirectoryItem var) {
5455
this.var = var;
5556
this.attrs = null;
56-
this.initializedChunks = new HashSet<>();
57+
this.initializedChunks = new HashMap<>();
5758
this.dataOffset = -1;
5859
if (var != null) {
5960
try {
@@ -90,7 +91,7 @@ void processItem(RandomAccessDirectoryItem item) {
9091
ZarrIosp.logger.error(new ZarrFormatException().getMessage());
9192
this.var = null; // skip rest of var if unrecognized files are found
9293
}
93-
this.initializedChunks.add(index);
94+
this.initializedChunks.put(index, item.length());
9495
// if data offset is uninitialized, set here
9596
if (this.dataOffset < 0) {
9697
this.dataOffset = item.startIndex();
@@ -179,7 +180,7 @@ private void makeGroup(RandomAccessDirectoryItem item, List<Attribute> attrs) {
179180
}
180181

181182
private void makeVariable(RandomAccessDirectoryItem item, long dataOffset, ZArray zarray,
182-
Set<Integer> initializedChunks, List<Attribute> attrs) throws ZarrFormatException {
183+
Map<Integer, Long> initializedChunks, List<Attribute> attrs) throws ZarrFormatException {
183184
// make new Variable
184185
Variable.Builder var = Variable.builder();
185186
String location = ZarrUtils.trimLocation(item.getLocation());
@@ -300,16 +301,16 @@ private Group.Builder findGroup(String location) throws ZarrFormatException {
300301
class VInfo {
301302
private final int[] chunks;
302303
private final Object fillValue;
303-
private final ZarrFilter compressor;
304+
private final Filter compressor;
304305
private final ByteOrder byteOrder;
305306
private final ZArray.Order order;
306307
private final String separator;
307-
private final List<ZarrFilter> filters;
308+
private final List<Filter> filters;
308309
private final long offset;
309-
private final Set<Integer> initializedChunks;
310+
private final Map<Integer, Long> initializedChunks;
310311

311-
VInfo(int[] chunks, Object fillValue, ZarrFilter compressor, ByteOrder byteOrder, ZArray.Order order,
312-
String separator, List<ZarrFilter> filters, long offset, Set<Integer> initializedChunks) {
312+
VInfo(int[] chunks, Object fillValue, Filter compressor, ByteOrder byteOrder, ZArray.Order order, String separator,
313+
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks) {
313314
this.chunks = chunks;
314315
this.fillValue = fillValue;
315316
this.byteOrder = byteOrder;
@@ -329,7 +330,7 @@ public Object getFillValue() {
329330
return this.fillValue;
330331
}
331332

332-
public ZarrFilter getCompressor() {
333+
public Filter getCompressor() {
333334
return this.compressor;
334335
}
335336

@@ -345,15 +346,15 @@ public String getSeparator() {
345346
return this.separator;
346347
}
347348

348-
public List<ZarrFilter> getFilters() {
349+
public List<Filter> getFilters() {
349350
return this.filters;
350351
}
351352

352353
public long getOffset() {
353354
return this.offset;
354355
}
355356

356-
public Set<Integer> getInitializedChunks() {
357+
public Map<Integer, Long> getInitializedChunks() {
357358
return this.initializedChunks;
358359
}
359360

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrLayoutBB.java

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import ucar.ma2.Section;
55
import ucar.nc2.Dimension;
66
import ucar.nc2.Variable;
7+
import ucar.nc2.filter.Filter;
78
import ucar.nc2.iosp.LayoutBB;
89
import ucar.nc2.iosp.LayoutBBTiled;
910
import ucar.unidata.io.RandomAccessFile;
@@ -12,10 +13,11 @@
1213
import java.nio.*;
1314
import java.util.ArrayList;
1415
import java.util.List;
16+
import java.util.Map;
1517
import java.util.Set;
1618

1719
/**
18-
* A tiled layout for Zarr formats that accommodates uncompressing anf filtering data before returning
20+
* A tiled layout for Zarr formats that accommodates uncompressing and filtering data before returning
1921
*/
2022
public class ZarrLayoutBB implements LayoutBB {
2123

@@ -27,39 +29,35 @@ public class ZarrLayoutBB implements LayoutBB {
2729
private final Section want;
2830

2931
private int[] chunkSize; // number of elements per chunks
30-
private int elemSize; // size of eelements in bytes
32+
private int elemSize; // size of elements in bytes
3133
private int nChunks[]; // number of chunks per dimension
32-
private int nBytes; // number of bytes per chunk
3334
private int totalNChunks; // total number of chunks
34-
private int totalChunkSize; // total number of elements per chunk
3535
private boolean F_order = false; // F order storage?
36-
private Set<Integer> initializedChunks; // set of chunks that exist as files
37-
38-
// offset to start of data
39-
private static final int ZARR_COMPRESSOR_OFFSET = 16;
40-
private int data_bytes_offset;
36+
private Map<Integer, Long> initializedChunks; // set of chunks that exist as files and their compressed size
37+
private Filter compressor;
38+
private List<Filter> filters;
4139

4240
public ZarrLayoutBB(Variable v2, Section wantSection, RandomAccessFile raf) {
4341
// var data info
4442
this.raf = raf;
4543
ZarrHeader.VInfo vinfo = (ZarrHeader.VInfo) v2.getSPobject();
4644
this.byteOrder = vinfo.getByteOrder();
4745
this.varOffset = vinfo.getOffset();
48-
this.data_bytes_offset = vinfo.getCompressor() == null ? 0 : ZARR_COMPRESSOR_OFFSET;
46+
this.compressor = vinfo.getCompressor();
47+
this.filters = vinfo.getFilters();
48+
4949

5050
// fill in chunk info
5151
this.chunkSize = vinfo.getChunks();
5252
int ndims = this.chunkSize.length;
5353
this.initializedChunks = vinfo.getInitializedChunks();
5454
this.nChunks = new int[ndims];
5555
this.totalNChunks = 1;
56-
this.totalChunkSize = 1;
5756
for (int i = 0; i < ndims; i++) {
5857
Dimension dim = v2.getDimension(i);
5958
// round up nchunks if not evenly divisible by chunk size
6059
this.nChunks[i] = (int) Math.ceil(dim.getLength() / this.chunkSize[i]);
6160
this.totalNChunks *= nChunks[i];
62-
this.totalChunkSize *= chunkSize[i];
6361
}
6462

6563
// transpose wantsSection and chunk shape if F order
@@ -79,7 +77,6 @@ public ZarrLayoutBB(Variable v2, Section wantSection, RandomAccessFile raf) {
7977
}
8078

8179
this.elemSize = v2.getDataType().getSize();
82-
this.nBytes = totalChunkSize * elemSize;
8380

8481
// create delegate and chunk iterator
8582
ZarrLayoutBB.DataChunkIterator iter = new ZarrLayoutBB.DataChunkIterator();
@@ -110,18 +107,21 @@ private class DataChunkIterator implements LayoutBBTiled.DataChunkIterator {
110107

111108
private int[] currChunk; // current chunk in subscript coords
112109
private int chunkNum; // current chunk as flat index
110+
private long currOffset; // byte position of current chunk
111+
113112

114113
DataChunkIterator() {
115114
this.currChunk = new int[chunkSize.length];
116115
this.chunkNum = 0;
116+
this.currOffset = varOffset; // start at start of variable data
117117
}
118118

119119
public boolean hasNext() {
120120
return this.chunkNum < totalNChunks;
121121
}
122122

123123
public LayoutBBTiled.DataChunk next() {
124-
DataChunk chunk = new ZarrLayoutBB.DataChunk(this.currChunk, this.chunkNum);
124+
DataChunk chunk = new ZarrLayoutBB.DataChunk(this.currChunk, this.chunkNum, this.currOffset);
125125
incrementChunk();
126126
return chunk;
127127
}
@@ -134,6 +134,7 @@ private void incrementChunk() {
134134
i--;
135135
}
136136
this.currChunk[i]++;
137+
this.currOffset += initializedChunks.getOrDefault(this.chunkNum, (long) 0);
137138
this.chunkNum = ZarrUtils.subscriptsToIndex(this.currChunk, nChunks);
138139
}
139140
}
@@ -144,9 +145,9 @@ private class DataChunk implements LayoutBBTiled.DataChunk {
144145
private long rafOffset; // start position of chunk in bytes
145146
private int chunkNum;
146147

147-
DataChunk(int[] index, int chunkNum) {
148+
DataChunk(int[] index, int chunkNum, long rafOffset) {
149+
this.rafOffset = rafOffset;
148150
this.offset = new int[index.length];
149-
this.rafOffset = varOffset + (chunkNum * (nBytes + data_bytes_offset)) + data_bytes_offset;
150151
for (int i = 0; i < index.length; i++) {
151152
int j = F_order ? index.length - i - 1 : i;
152153
this.offset[i] = index[j] * chunkSize[i];
@@ -162,14 +163,23 @@ public ByteBuffer getByteBuffer() throws IOException {
162163
// read the data
163164
byte[] data;
164165
// if chunk does not exist as file, return empty buffer
165-
if (!initializedChunks.contains(chunkNum)) {
166-
data = new byte[0];
167-
} else {
168-
data = new byte[nBytes];
169-
raf.seek(this.rafOffset);
170-
raf.readFully(data);
171-
172-
// TODO: apply filters in reverse order
166+
long dataLength = initializedChunks.getOrDefault(chunkNum, (long) 0);
167+
if (dataLength == 0) {
168+
ByteBuffer result = ByteBuffer.wrap(new byte[0]);
169+
result.order(byteOrder);
170+
return result;
171+
}
172+
173+
data = new byte[(int) dataLength];
174+
raf.seek(this.rafOffset);
175+
// raf.read(data, 0, (int)dataLength);
176+
raf.readFully(data);
177+
178+
// apply compressor
179+
data = compressor.decode(data);
180+
// apply filters in reverse order
181+
for (int i = filters.size() - 1; i >= 0; i--) {
182+
data = filters.get(i).decode(data);
173183
}
174184

175185
ByteBuffer result = ByteBuffer.wrap(data);

0 commit comments

Comments
 (0)