diff --git a/src/Arrow.dom.ts b/src/Arrow.dom.ts index 777b8599..8ef06431 100644 --- a/src/Arrow.dom.ts +++ b/src/Arrow.dom.ts @@ -57,6 +57,7 @@ export { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, List, + LargeList, Struct, StructRow, Union, DenseUnion, SparseUnion, Dictionary, @@ -100,6 +101,7 @@ export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder, IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder, ListBuilder, + LargeListBuilder, MapBuilder, NullBuilder, StructBuilder, diff --git a/src/Arrow.ts b/src/Arrow.ts index 848156d8..f7d65add 100644 --- a/src/Arrow.ts +++ b/src/Arrow.ts @@ -45,6 +45,7 @@ export { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, List, + LargeList, Struct, Union, DenseUnion, SparseUnion, Dictionary, @@ -85,6 +86,7 @@ export { BinaryBuilder } from './builder/binary.js'; export { BinaryViewBuilder } from './builder/binaryview.js'; export { LargeBinaryBuilder } from './builder/largebinary.js'; export { ListBuilder } from './builder/list.js'; +export { LargeListBuilder } from './builder/largelist.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; export { MapBuilder } from './builder/map.js'; export { StructBuilder } from './builder/struct.js'; diff --git a/src/builder.ts b/src/builder.ts index 5ae43a88..3516ca9f 100644 --- a/src/builder.ts +++ b/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, LargeUtf8, Binary, LargeBinary, List, Map_, + Utf8, LargeUtf8, Binary, LargeBinary, List, LargeList, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -357,7 +357,7 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; diff --git a/src/builder/largelist.ts b/src/builder/largelist.ts new file mode 100644 index 00000000..2133a55a --- /dev/null +++ b/src/builder/largelist.ts @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Field } from '../schema.js'; +import { DataType, LargeList } from '../type.js'; +import { OffsetsBufferBuilder } from './buffer.js'; +import { bigIntToNumber } from '../util/bigint.js'; +import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js'; + +/** @ignore */ +export class LargeListBuilder extends VariableWidthBuilder, TNull> { + protected _offsets: OffsetsBufferBuilder>; + constructor(opts: BuilderOptions, TNull>) { + super(opts); + this._offsets = new OffsetsBufferBuilder(opts.type); + } + public addChild(child: Builder, name = '0') { + if (this.numChildren > 0) { + throw new Error('LargeListBuilder can only have one child.'); + } + this.children[this.numChildren] = child; + this.type = new LargeList(new Field(name, child.type, true)); + return this.numChildren - 1; + } + protected _flushPending(pending: Map) { + const offsets = this._offsets; + const [child] = this.children; + for (const [index, value] of pending) { + if (typeof value === 'undefined') { + offsets.set(index, BigInt(0)); + } else { + const v = value as T['TValue']; + const n = v.length; + const start = bigIntToNumber(offsets.set(index, BigInt(n)).buffer[index]); + for (let i = -1; ++i < n;) { + child.set(start + i, v[i]); + } + } + } + } +} diff --git a/src/data.ts b/src/data.ts index 46080e68..b13efdbf 100644 --- a/src/data.ts +++ b/src/data.ts @@ -316,7 +316,7 @@ Object.defineProperty(Data, Symbol.hasInstance, { import { Dictionary, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -455,6 +455,13 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap], [child]); } + public visitLargeList(props: LargeListDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap], [child]); + } public visitStruct(props: StructDataProps) { const { ['type']: type, ['offset']: offset = 0, ['children']: children = [] } = props; const nullBitmap = toUint8Array(props['nullBitmap']); @@ -539,6 +546,7 @@ interface Utf8DataProps extends DataProps_ { valueOffsets: Va interface Utf8ViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } +interface LargeListDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } interface Map_DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } @@ -566,6 +574,7 @@ export type DataProps = ( T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends Utf8View /* */ ? Utf8ViewDataProps : T extends List /* */ ? ListDataProps : + T extends LargeList /* */ ? LargeListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : T extends Map_ /* */ ? Map_DataProps : @@ -596,6 +605,7 @@ export function makeData(props: Utf8DataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: Utf8ViewDataProps): Data; export function makeData(props: ListDataProps): Data; +export function makeData(props: LargeListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; export function makeData(props: Map_DataProps): Data; diff --git a/src/enum.ts b/src/enum.ts index 514a8168..a44bf6e1 100644 --- a/src/enum.ts +++ b/src/enum.ts @@ -70,6 +70,7 @@ export enum Type { Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ + LargeList = 21, /** A list of some logical data type with 64-bit offsets */ BinaryView = 23, /** Variable-length binary values backed by inline-or-referenced views */ Utf8View = 24, /** Variable-length UTF8 string values backed by inline-or-referenced views */ diff --git a/src/interfaces.ts b/src/interfaces.ts index eea88bd4..93f9f9b2 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -37,6 +37,7 @@ import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { LargeBinaryBuilder } from './builder/largebinary.js'; import type { ListBuilder } from './builder/list.js'; +import type { LargeListBuilder } from './builder/largelist.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; import type { MapBuilder } from './builder/map.js'; import type { StructBuilder } from './builder/struct.js'; @@ -242,6 +243,7 @@ export type TypeToDataType = { [Type.DurationNanosecond]: type.DurationNanosecond; [Type.Map]: type.Map_; [Type.List]: type.List; + [Type.LargeList]: type.LargeList; [Type.Struct]: type.Struct; [Type.Dictionary]: type.Dictionary; [Type.FixedSizeList]: type.FixedSizeList; @@ -300,6 +302,7 @@ type TypeToBuilder = { [Type.DurationNanosecond]: DurationNanosecondBuilder; [Type.Map]: MapBuilder; [Type.List]: ListBuilder; + [Type.LargeList]: LargeListBuilder; [Type.Struct]: StructBuilder; [Type.Dictionary]: DictionaryBuilder; [Type.FixedSizeList]: FixedSizeListBuilder; @@ -358,6 +361,7 @@ type DataTypeToBuilder = { [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; + [Type.LargeList]: T extends type.LargeList ? LargeListBuilder : never; [Type.Struct]: T extends type.Struct ? StructBuilder : never; [Type.Dictionary]: T extends type.Dictionary ? DictionaryBuilder : never; [Type.FixedSizeList]: T extends type.FixedSizeList ? FixedSizeListBuilder : never; diff --git a/src/ipc/metadata/json.ts b/src/ipc/metadata/json.ts index 8aed54ec..e862738c 100644 --- a/src/ipc/metadata/json.ts +++ b/src/ipc/metadata/json.ts @@ -19,7 +19,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, - List, FixedSizeList, Map_, Struct, Union, + List, LargeList, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -172,6 +172,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'utf8view': return new Utf8View(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); + case 'largelist': return new LargeList((children || [])[0]); case 'struct': return new Struct(children || []); case 'struct_': return new Struct(children || []); } diff --git a/src/ipc/metadata/message.ts b/src/ipc/metadata/message.ts index 347b7c93..71adb8bd 100644 --- a/src/ipc/metadata/message.ts +++ b/src/ipc/metadata/message.ts @@ -58,7 +58,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, - List, FixedSizeList, Map_, Struct, Union, + List, LargeList, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -521,6 +521,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Utf8View']: return new Utf8View(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); + case Type['LargeList']: return new LargeList((children || [])[0]); case Type['Struct_']: return new Struct(children || []); } diff --git a/src/type.ts b/src/type.ts index 7bb9a077..81daefac 100644 --- a/src/type.ts +++ b/src/type.ts @@ -90,6 +90,7 @@ export abstract class DataType extends DataType extends DataType { + TArray: Array; + TValue: Vector; + TOffsetArray: BigInt64Array; + OffsetArrayType: BigIntArrayConstructor; +} + +/** + * Like `List`, but with 64-bit value offsets — use this when a list column's + * cumulative child-element count would exceed `List`'s 32-bit offset ceiling + * of `2^31 - 1`. Otherwise prefer `List`. + * + * Values round-trip losslessly through Arrow IPC at any 64-bit offset magnitude. + * In-process JavaScript APIs (`.get(i)`, slicing, builders) narrow offsets to a + * JS `number` and are therefore bounded by `Number.MAX_SAFE_INTEGER` + * (`2^53 - 1`, about 9×10^15) cumulative child elements per column. If an + * operation would cross that ceiling, it throws a `TypeError` rather than + * silently returning data at a truncated index. This matches the behavior of + * `LargeUtf8` and `LargeBinary`. + * + * @ignore + */ +export class LargeList extends DataType { + constructor(child: Field) { + super(Type.LargeList); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `LargeList<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: LargeList) => { + (proto).children = null; + (proto).OffsetArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'LargeList'; + })(LargeList.prototype); +} + /** @ignore */ export interface Struct extends DataType { TArray: Array>; diff --git a/src/util/buffer.ts b/src/util/buffer.ts index 152336ca..b4b13247 100644 --- a/src/util/buffer.ts +++ b/src/util/buffer.ts @@ -218,8 +218,9 @@ export function rebaseValueOffsets(offset: number, length: number, valueOffsets: // shifted by the start offset, such that the new start offset is 0 if (offset !== 0) { valueOffsets = valueOffsets.slice(0, length); + const delta = typeof valueOffsets[0] === 'bigint' ? BigInt(offset) : offset; for (let i = -1, n = valueOffsets.length; ++i < n;) { - valueOffsets[i] += offset; + valueOffsets[i] += delta; } } return valueOffsets.subarray(0, length); diff --git a/src/visitor.ts b/src/visitor.ts index a6d27a76..fe0cd8c2 100644 --- a/src/visitor.ts +++ b/src/visitor.ts @@ -47,6 +47,7 @@ export abstract class Visitor { public visitTime(_node: any, ..._args: any[]): any { return null; } public visitDecimal(_node: any, ..._args: any[]): any { return null; } public visitList(_node: any, ..._args: any[]): any { return null; } + public visitLargeList(_node: any, ..._args: any[]): any { return null; } public visitStruct(_node: any, ..._args: any[]): any { return null; } public visitUnion(_node: any, ..._args: any[]): any { return null; } public visitDictionary(_node: any, ..._args: any[]): any { return null; } @@ -114,6 +115,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.TimeNanosecond: fn = visitor.visitTimeNanosecond || visitor.visitTime; break; case Type.Decimal: fn = visitor.visitDecimal; break; case Type.List: fn = visitor.visitList; break; + case Type.LargeList: fn = visitor.visitLargeList; break; case Type.Struct: fn = visitor.visitStruct; break; case Type.Union: fn = visitor.visitUnion; break; case Type.DenseUnion: fn = visitor.visitDenseUnion || visitor.visitUnion; break; @@ -211,6 +213,7 @@ function inferDType(type: T): Type { return Type.Duration; case Type.Map: return Type.Map; case Type.List: return Type.List; + case Type.LargeList: return Type.LargeList; case Type.Struct: return Type.Struct; case Type.Union: switch ((type as any as Union).mode) { diff --git a/src/visitor/builderctor.ts b/src/visitor/builderctor.ts index ca7669a8..7a1f1115 100644 --- a/src/visitor/builderctor.ts +++ b/src/visitor/builderctor.ts @@ -34,6 +34,7 @@ import { IntervalBuilder, IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder, I import { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from '../builder/duration.js'; import { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from '../builder/int.js'; import { ListBuilder } from '../builder/list.js'; +import { LargeListBuilder } from '../builder/largelist.js'; import { MapBuilder } from '../builder/map.js'; import { NullBuilder } from '../builder/null.js'; import { StructBuilder } from '../builder/struct.js'; @@ -90,6 +91,7 @@ export class GetBuilderCtor extends Visitor { public visitTimeNanosecond() { return TimeNanosecondBuilder; } public visitDecimal() { return DecimalBuilder; } public visitList() { return ListBuilder; } + public visitLargeList() { return LargeListBuilder; } public visitStruct() { return StructBuilder; } public visitUnion() { return UnionBuilder; } public visitDenseUnion() { return DenseUnionBuilder; } diff --git a/src/visitor/get.ts b/src/visitor/get.ts index b914624e..10c2c75b 100644 --- a/src/visitor/get.ts +++ b/src/visitor/get.ts @@ -28,7 +28,7 @@ import { uint16ToFloat64 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -83,6 +83,7 @@ export interface GetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number): T['TValue'] | null; visitDecimal(data: Data, index: number): T['TValue'] | null; visitList(data: Data, index: number): T['TValue'] | null; + visitLargeList(data: Data, index: number): T['TValue'] | null; visitStruct(data: Data, index: number): T['TValue'] | null; visitUnion(data: Data, index: number): T['TValue'] | null; visitDenseUnion(data: Data, index: number): T['TValue'] | null; @@ -261,9 +262,10 @@ const getTime = (data: Data, index: number): T['TValue'] => { const getDecimal = ({ values, stride }: Data, index: number): T['TValue'] => BN.decimal(values.subarray(stride * index, stride * (index + 1))); /** @ignore */ -const getList = (data: Data, index: number): T['TValue'] => { +const getList = (data: Data, index: number): T['TValue'] => { const { valueOffsets, stride, children } = data; - const { [index * stride]: begin, [index * stride + 1]: end } = valueOffsets; + const begin = bigIntToNumber(valueOffsets[index * stride]); + const end = bigIntToNumber(valueOffsets[index * stride + 1]); const child: Data = children[0]; const slice = child.slice(begin, end - begin); return new Vector([slice]) as T['TValue']; @@ -399,6 +401,7 @@ GetVisitor.prototype.visitTimeMicrosecond = wrapGet(getTimeMicrosecond); GetVisitor.prototype.visitTimeNanosecond = wrapGet(getTimeNanosecond); GetVisitor.prototype.visitDecimal = wrapGet(getDecimal); GetVisitor.prototype.visitList = wrapGet(getList); +GetVisitor.prototype.visitLargeList = wrapGet(getList); GetVisitor.prototype.visitStruct = wrapGet(getStruct); GetVisitor.prototype.visitUnion = wrapGet(getUnion); GetVisitor.prototype.visitDenseUnion = wrapGet(getDenseUnion); diff --git a/src/visitor/indexof.ts b/src/visitor/indexof.ts index 6881f99f..ea3869c8 100644 --- a/src/visitor/indexof.ts +++ b/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -79,6 +79,7 @@ export interface IndexOfVisitor extends Visitor { visitTimeNanosecond(data: Data, value: T['TValue'] | null, index?: number): number; visitDecimal(data: Data, value: T['TValue'] | null, index?: number): number; visitList(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeList(data: Data, value: T['TValue'] | null, index?: number): number; visitStruct(data: Data, value: T['TValue'] | null, index?: number): number; visitUnion(data: Data, value: T['TValue'] | null, index?: number): number; visitDenseUnion(data: Data, value: T['TValue'] | null, index?: number): number; @@ -199,6 +200,7 @@ IndexOfVisitor.prototype.visitTimeMicrosecond = indexOfValue; IndexOfVisitor.prototype.visitTimeNanosecond = indexOfValue; IndexOfVisitor.prototype.visitDecimal = indexOfValue; IndexOfVisitor.prototype.visitList = indexOfValue; +IndexOfVisitor.prototype.visitLargeList = indexOfValue; IndexOfVisitor.prototype.visitStruct = indexOfValue; IndexOfVisitor.prototype.visitUnion = indexOfValue; IndexOfVisitor.prototype.visitDenseUnion = indexOfUnion; diff --git a/src/visitor/iterator.ts b/src/visitor/iterator.ts index ef54504c..da3ce0f9 100644 --- a/src/visitor/iterator.ts +++ b/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -77,6 +77,7 @@ export interface IteratorVisitor extends Visitor { visitTimeNanosecond(vector: Vector): IterableIterator; visitDecimal(vector: Vector): IterableIterator; visitList(vector: Vector): IterableIterator; + visitLargeList(vector: Vector): IterableIterator; visitStruct(vector: Vector): IterableIterator; visitUnion(vector: Vector): IterableIterator; visitDenseUnion(vector: Vector): IterableIterator; @@ -186,6 +187,7 @@ IteratorVisitor.prototype.visitTimeMicrosecond = vectorIterator; IteratorVisitor.prototype.visitTimeNanosecond = vectorIterator; IteratorVisitor.prototype.visitDecimal = vectorIterator; IteratorVisitor.prototype.visitList = vectorIterator; +IteratorVisitor.prototype.visitLargeList = vectorIterator; IteratorVisitor.prototype.visitStruct = vectorIterator; IteratorVisitor.prototype.visitUnion = vectorIterator; IteratorVisitor.prototype.visitDenseUnion = vectorIterator; diff --git a/src/visitor/jsontypeassembler.ts b/src/visitor/jsontypeassembler.ts index cf110038..5ab45d9a 100644 --- a/src/visitor/jsontypeassembler.ts +++ b/src/visitor/jsontypeassembler.ts @@ -81,6 +81,9 @@ export class JSONTypeAssembler extends Visitor { public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeList({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitStruct({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index 2f4973ad..f89dbdda 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -28,7 +28,7 @@ import { toIntervalDayTimeObjects, toIntervalMonthDayNanoObjects } from '../util import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, IntArray, } from '../type.js'; /** @ignore */ @@ -54,6 +54,7 @@ export interface JSONVectorAssembler extends Visitor { visitTime(data: Data): { DATA: number[] }; visitDecimal(data: Data): { DATA: string[] }; visitList(data: Data): { children: any[]; OFFSET: number[] }; + visitLargeList(data: Data): { children: any[]; OFFSET: string[] }; visitStruct(data: Data): { children: any[] }; visitUnion(data: Data): { children: any[]; TYPE_ID: number[] }; visitInterval(data: Data): { DATA: number[] }; @@ -152,6 +153,12 @@ export class JSONVectorAssembler extends Visitor { 'children': this.visitMany(data.type.children, data.children) }; } + public visitLargeList(data: Data) { + return { + 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)], + 'children': this.visitMany(data.type.children, data.children) + }; + } public visitStruct(data: Data) { return { 'children': this.visitMany(data.type.children, data.children) diff --git a/src/visitor/set.ts b/src/visitor/set.ts index 90501937..dab83696 100644 --- a/src/visitor/set.ts +++ b/src/visitor/set.ts @@ -26,7 +26,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -81,6 +81,7 @@ export interface SetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number, value: T['TValue']): void; visitDecimal(data: Data, index: number, value: T['TValue']): void; visitList(data: Data, index: number, value: T['TValue']): void; + visitLargeList(data: Data, index: number, value: T['TValue']): void; visitStruct(data: Data, index: number, value: T['TValue']): void; visitUnion(data: Data, index: number, value: T['TValue']): void; visitDenseUnion(data: Data, index: number, value: T['TValue']): void; @@ -264,16 +265,18 @@ export const setTime = (data: Data, index: number, value: T[' export const setDecimal = ({ values, stride }: Data, index: number, value: T['TValue']): void => { values.set(value.subarray(0, stride), stride * index); }; /** @ignore */ -const setList = (data: Data, index: number, value: T['TValue']): void => { +const setList = (data: Data, index: number, value: T['TValue']): void => { const values = data.children[0]; const valueOffsets = data.valueOffsets; const set = instance.getVisitFn(values); + const begin = bigIntToNumber(valueOffsets[index]); + const end = bigIntToNumber(valueOffsets[index + 1]); if (Array.isArray(value)) { - for (let idx = -1, itr = valueOffsets[index], end = valueOffsets[index + 1]; itr < end;) { + for (let idx = -1, itr = begin; itr < end;) { set(values, itr++, value[++idx]); } } else { - for (let idx = -1, itr = valueOffsets[index], end = valueOffsets[index + 1]; itr < end;) { + for (let idx = -1, itr = begin; itr < end;) { set(values, itr++, value.get(++idx)); } } @@ -437,6 +440,7 @@ SetVisitor.prototype.visitTimeMicrosecond = wrapSet(setTimeMicrosecond); SetVisitor.prototype.visitTimeNanosecond = wrapSet(setTimeNanosecond); SetVisitor.prototype.visitDecimal = wrapSet(setDecimal); SetVisitor.prototype.visitList = wrapSet(setList); +SetVisitor.prototype.visitLargeList = wrapSet(setList); SetVisitor.prototype.visitStruct = wrapSet(setStruct); SetVisitor.prototype.visitUnion = wrapSet(setUnion); SetVisitor.prototype.visitDenseUnion = wrapSet(setDenseUnion); diff --git a/src/visitor/typeassembler.ts b/src/visitor/typeassembler.ts index d997f6cf..2e454438 100644 --- a/src/visitor/typeassembler.ts +++ b/src/visitor/typeassembler.ts @@ -38,6 +38,7 @@ import { Timestamp } from '../fb/timestamp.js'; import { Interval } from '../fb/interval.js'; import { Duration } from '../fb/duration.js'; import { List } from '../fb/list.js'; +import { LargeList } from '../fb/large-list.js'; import { Struct_ as Struct } from '../fb/struct-.js'; import { Union } from '../fb/union.js'; import { DictionaryEncoding } from '../fb/dictionary-encoding.js'; @@ -139,6 +140,10 @@ export class TypeAssembler extends Visitor { List.startList(b); return List.endList(b); } + public visitLargeList(_node: T, b: Builder) { + LargeList.startLargeList(b); + return LargeList.endLargeList(b); + } public visitStruct(_node: T, b: Builder) { Struct.startStruct_(b); return Struct.endStruct_(b); diff --git a/src/visitor/typecomparator.ts b/src/visitor/typecomparator.ts index 5c1d60a9..96c0e4de 100644 --- a/src/visitor/typecomparator.ts +++ b/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -75,6 +75,7 @@ export interface TypeComparator extends Visitor { visitTimeNanosecond(type: T, other?: DataType | null): other is T; visitDecimal(type: T, other?: DataType | null): other is T; visitList(type: T, other?: DataType | null): other is T; + visitLargeList(type: T, other?: DataType | null): other is T; visitStruct(type: T, other?: DataType | null): other is T; visitUnion(type: T, other?: DataType | null): other is T; visitDenseUnion(type: T, other?: DataType | null): other is T; @@ -172,7 +173,7 @@ function compareTime(type: T, other?: DataType | null): other is ); } -function compareList(type: T, other?: DataType | null): other is T { +function compareList(type: T, other?: DataType | null): other is T { return (type === other) || ( compareConstructor(type, other) && type.children.length === other.children.length && @@ -276,6 +277,7 @@ TypeComparator.prototype.visitTimeMicrosecond = compareTime; TypeComparator.prototype.visitTimeNanosecond = compareTime; TypeComparator.prototype.visitDecimal = compareAny; TypeComparator.prototype.visitList = compareList; +TypeComparator.prototype.visitLargeList = compareList; TypeComparator.prototype.visitStruct = compareStruct; TypeComparator.prototype.visitUnion = compareUnion; TypeComparator.prototype.visitDenseUnion = compareUnion; diff --git a/src/visitor/typector.ts b/src/visitor/typector.ts index 2aab6d3d..e9eabd77 100644 --- a/src/visitor/typector.ts +++ b/src/visitor/typector.ts @@ -68,6 +68,7 @@ export class GetDataTypeConstructor extends Visitor { public visitTimeNanosecond() { return type.TimeNanosecond; } public visitDecimal() { return type.Decimal; } public visitList() { return type.List; } + public visitLargeList() { return type.LargeList; } public visitStruct() { return type.Struct; } public visitUnion() { return type.Union; } public visitDenseUnion() { return type.DenseUnion; } diff --git a/src/visitor/vectorassembler.ts b/src/visitor/vectorassembler.ts index ae4b712d..2d7837dd 100644 --- a/src/visitor/vectorassembler.ts +++ b/src/visitor/vectorassembler.ts @@ -27,7 +27,7 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -51,6 +51,7 @@ export interface VectorAssembler extends Visitor { visitTime(data: Data): this; visitDecimal(data: Data): this; visitList(data: Data): this; + visitLargeList(data: Data): this; visitStruct(data: Data): this; visitUnion(data: Data): this; visitInterval(data: Data): this; @@ -234,11 +235,12 @@ function assembleBinaryViewVector(this: VectorA } /** @ignore */ -function assembleListVector(this: VectorAssembler, data: Data) { +function assembleListVector(this: VectorAssembler, data: Data) { const { length, valueOffsets } = data; - // If we have valueOffsets (MapVector, ListVector), push that buffer first + // If we have valueOffsets (MapVector, ListVector, LargeListVector), push that buffer first if (valueOffsets) { - const { [0]: begin, [length]: end } = valueOffsets; + const begin = bigIntToNumber(valueOffsets[0]); + const end = bigIntToNumber(valueOffsets[length]); addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // Then insert the List's values child return this.visit(data.children[0].slice(begin, end - begin)); @@ -267,6 +269,7 @@ VectorAssembler.prototype.visitTimestamp = assembleFlatVector; VectorAssembler.prototype.visitTime = assembleFlatVector; VectorAssembler.prototype.visitDecimal = assembleFlatVector; VectorAssembler.prototype.visitList = assembleListVector; +VectorAssembler.prototype.visitLargeList = assembleListVector; VectorAssembler.prototype.visitStruct = assembleNestedVector; VectorAssembler.prototype.visitUnion = assembleUnion; VectorAssembler.prototype.visitInterval = assembleFlatVector; diff --git a/src/visitor/vectorloader.ts b/src/visitor/vectorloader.ts index a6892b28..4329cf10 100644 --- a/src/visitor/vectorloader.ts +++ b/src/visitor/vectorloader.ts @@ -128,6 +128,9 @@ export class VectorLoader extends Visitor { public visitList(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), 'child': this.visit(type.children[0]) }); } + public visitLargeList(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), 'child': this.visit(type.children[0]) }); + } public visitStruct(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), children: this.visitMany(type.children) }); } diff --git a/test/data/tables.ts b/test/data/tables.ts index 80950a5e..e219fead 100644 --- a/test/data/tables.ts +++ b/test/data/tables.ts @@ -22,7 +22,7 @@ import * as generate from '../generate-test-data.js'; import { Schema, Field, Dictionary } from 'apache-arrow'; -const listVectorGeneratorNames = ['list', 'fixedSizeList']; +const listVectorGeneratorNames = ['list', 'largeList', 'fixedSizeList']; const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map']; const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ diff --git a/test/generate-test-data.ts b/test/generate-test-data.ts index f173633b..379bfd0f 100644 --- a/test/generate-test-data.ts +++ b/test/generate-test-data.ts @@ -30,6 +30,7 @@ import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, List, + LargeList, Struct, Union, DenseUnion, SparseUnion, Dictionary, @@ -64,6 +65,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; + visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, dictionary?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, children?: Vector[]): GeneratedVector; @@ -89,6 +91,7 @@ interface TestDataVectorGenerator extends Visitor { visitTime: typeof generateTime; visitDecimal: typeof generateDecimal; visitList: typeof generateList; + visitLargeList: typeof generateLargeList; visitStruct: typeof generateStruct; visitUnion: typeof generateUnion; visitDictionary: typeof generateDictionary; @@ -118,6 +121,7 @@ TestDataVectorGenerator.prototype.visitTimestamp = generateTimestamp; TestDataVectorGenerator.prototype.visitTime = generateTime; TestDataVectorGenerator.prototype.visitDecimal = generateDecimal; TestDataVectorGenerator.prototype.visitList = generateList; +TestDataVectorGenerator.prototype.visitLargeList = generateLargeList; TestDataVectorGenerator.prototype.visitStruct = generateStruct; TestDataVectorGenerator.prototype.visitUnion = generateUnion; TestDataVectorGenerator.prototype.visitDictionary = generateDictionary; @@ -243,6 +247,7 @@ export const timeMicrosecond = (length = 100, nullCount = Math.trunc(length * 0. export const timeNanosecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new TimeNanosecond(), length, nullCount); export const decimal = (length = 100, nullCount = Math.trunc(length * 0.2), scale = 2, precision = 9, bitWidth = 128) => vectorGenerator.visit(new Decimal(scale, precision, bitWidth), length, nullCount); export const list = (length = 100, nullCount = Math.trunc(length * 0.2), child = defaultListChild) => vectorGenerator.visit(new List(child), length, nullCount); +export const largeList = (length = 100, nullCount = Math.trunc(length * 0.2), child = defaultListChild) => vectorGenerator.visit(new LargeList(child), length, nullCount); export const struct = (length = 100, nullCount = Math.trunc(length * 0.2), children: Field[] = defaultStructChildren()) => vectorGenerator.visit(new Struct(children), length, nullCount); export const denseUnion = (length = 100, nullCount = Math.trunc(length * 0.2), children: Field[] = defaultUnionChildren()) => vectorGenerator.visit(new DenseUnion(children.map((f) => f.typeId), children), length, nullCount); export const sparseUnion = (length = 100, nullCount = Math.trunc(length * 0.2), children: Field[] = defaultUnionChildren()) => vectorGenerator.visit(new SparseUnion(children.map((f) => f.typeId), children), length, nullCount); @@ -258,7 +263,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, utf8View, binary, largeBinary, binaryView, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, utf8View, binary, largeBinary, binaryView, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, largeList, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -498,19 +503,30 @@ function generateDuration(this: TestDataVectorGenerator, typ return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) }; } -function generateList(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2), child = this.visit(type.children[0].type, length * 3, nullCount * 3)): GeneratedVector { +function generateListLike( + type: T, length: number, nullCount: number, child: GeneratedVector, + createOffsets: (length: number, nullBitmap: Uint8Array, stride: number, stride2: number) => Int32Array | BigInt64Array +): GeneratedVector { const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); + const valueOffsets = createOffsets(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues = child.values(); const values: (T['valueType'] | null)[] = [...valueOffsets.slice(1)] .map((offset, i) => isValid(nullBitmap, i) ? offset : null) - .map((o, i) => o == null ? null : childValues.slice(valueOffsets[i], o)); + .map((o, i) => o == null ? null : childValues.slice(Number(valueOffsets[i]), Number(o))); return values; }); - return { values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, child: childVec.data[0] })]) }; + return { values, vector: new Vector([makeData({ type: type as any, length, nullCount, nullBitmap, valueOffsets, child: childVec.data[0] })]) } as GeneratedVector; +} + +function generateList(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2), child = this.visit(type.children[0].type, length * 3, nullCount * 3)): GeneratedVector { + return generateListLike(type, length, nullCount, child, createVariableWidthOffsets32); +} + +function generateLargeList(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2), child = this.visit(type.children[0].type, length * 3, nullCount * 3)): GeneratedVector { + return generateListLike(type, length, nullCount, child, createVariableWidthOffsets64); } function generateFixedSizeList(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2), child = this.visit(type.children[0].type, length * type.listSize, nullCount * type.listSize)): GeneratedVector { @@ -734,6 +750,8 @@ function createVariableWidthOffsets32(length: number, nullBitmap: Uint8Array, mi } function createVariableWidthOffsets64(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { + min = Math.trunc(min); + max = Math.trunc(max); const offsets = new BigInt64Array(length + 1); iterateBitmap(length, nullBitmap, (i, valid) => { if (!valid) { diff --git a/test/unit/builders/builder-tests.ts b/test/unit/builders/builder-tests.ts index e8684010..b1d9be69 100644 --- a/test/unit/builders/builder-tests.ts +++ b/test/unit/builders/builder-tests.ts @@ -60,6 +60,7 @@ describe('Generated Test Data', () => { describe('TimeNanosecondBuilder', () => { validateBuilder(generate.timeNanosecond); }); describe('DecimalBuilder', () => { validateBuilder(generate.decimal); }); describe('ListBuilder', () => { validateBuilder(generate.list); }); + describe('LargeListBuilder', () => { validateBuilder(generate.largeList); }); describe('StructBuilder', () => { validateBuilder(generate.struct); }); describe('DenseUnionBuilder', () => { validateBuilder(generate.denseUnion); }); describe('SparseUnionBuilder', () => { validateBuilder(generate.sparseUnion); }); diff --git a/test/unit/generated-data-tests.ts b/test/unit/generated-data-tests.ts index 0d3a760e..e293fe97 100644 --- a/test/unit/generated-data-tests.ts +++ b/test/unit/generated-data-tests.ts @@ -56,6 +56,7 @@ describe('Generated Test Data', () => { describe('TimeNanosecond', () => { validateVector(generate.timeNanosecond()); }); describe('Decimal', () => { validateVector(generate.decimal()); }); describe('List', () => { validateVector(generate.list()); }); + describe('LargeList', () => { validateVector(generate.largeList()); }); describe('Struct', () => { validateVector(generate.struct()); }); describe('DenseUnion', () => { validateVector(generate.denseUnion()); }); describe('SparseUnion', () => { validateVector(generate.sparseUnion()); }); diff --git a/test/unit/large-list-tests.ts b/test/unit/large-list-tests.ts new file mode 100644 index 00000000..4aac4d26 --- /dev/null +++ b/test/unit/large-list-tests.ts @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { + Field, Int32, LargeList, Vector, makeData, +} from 'apache-arrow'; + +describe('LargeList overflow semantics', () => { + + const buildChild = (length: number) => { + const values = new Int32Array(length); + for (let i = 0; i < length; i++) values[i] = i; + return makeData({ type: new Int32(), length, data: values }); + }; + + test(`.get() throws when an offset exceeds Number.MAX_SAFE_INTEGER`, () => { + // Hand-build a LargeList Data whose second offset overflows the JS safe-integer range. + // We can't actually allocate that much child data, so we synthesize a Data with a + // small child but offsets that point past 2^53 — this exercises the bigIntToNumber + // guard inside getList, which is the boundary where bigint offsets are narrowed. + const child = buildChild(8); + const type = new LargeList(new Field('item', new Int32(), true)); + const valueOffsets = BigInt64Array.from([0n, BigInt(Number.MAX_SAFE_INTEGER) + 1n]); + const data = makeData({ type, length: 1, nullCount: 0, valueOffsets, child }); + const vec = new Vector([data]); + expect(() => vec.get(0)).toThrow(TypeError); + }); + + test(`.get() works at the Number.MAX_SAFE_INTEGER boundary`, () => { + // Offset exactly at MAX_SAFE_INTEGER must not throw — only past it. + const child = buildChild(8); + const type = new LargeList(new Field('item', new Int32(), true)); + const safeMax = BigInt(Number.MAX_SAFE_INTEGER); + const valueOffsets = BigInt64Array.from([0n, safeMax]); + const data = makeData({ type, length: 1, nullCount: 0, valueOffsets, child }); + const vec = new Vector([data]); + // The conversion itself must succeed; the resulting slice is degenerate + // because the child is small, but that's fine — we're verifying no throw. + expect(() => vec.get(0)).not.toThrow(); + }); +}); diff --git a/test/unit/visitor-tests.ts b/test/unit/visitor-tests.ts index 6ecb6cca..2b3f6605 100644 --- a/test/unit/visitor-tests.ts +++ b/test/unit/visitor-tests.ts @@ -18,7 +18,7 @@ import { Field, Visitor, DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -45,6 +45,7 @@ class BasicVisitor extends Visitor { public visitTime(type: T) { return (this.type = type); } public visitDecimal(type: T) { return (this.type = type); } public visitList(type: T) { return (this.type = type); } + public visitLargeList(type: T) { return (this.type = type); } public visitStruct(type: T) { return (this.type = type); } public visitUnion(type: T) { return (this.type = type); } public visitDictionary(type: T) { return (this.type = type); } @@ -86,6 +87,7 @@ class FeatureVisitor extends Visitor { public visitTimeNanosecond(type: T) { return (this.type = type); } public visitDecimal(type: T) { return (this.type = type); } public visitList(type: T) { return (this.type = type); } + public visitLargeList(type: T) { return (this.type = type); } public visitStruct(type: T) { return (this.type = type); } public visitDenseUnion(type: T) { return (this.type = type); } public visitSparseUnion(type: T) { return (this.type = type); } @@ -117,6 +119,7 @@ describe('Visitor', () => { test(`visits Time types`, () => validateBasicVisitor(new Time(0, 64))); test(`visits Decimal types`, () => validateBasicVisitor(new Decimal(2, 9, 128))); test(`visits List types`, () => validateBasicVisitor(new List(null as any))); + test(`visits LargeList types`, () => validateBasicVisitor(new LargeList(null as any))); test(`visits Struct types`, () => validateBasicVisitor(new Struct([] as any[]))); test(`visits Union types`, () => validateBasicVisitor(new Union(0, [] as any[], [] as any[]))); test(`visits Dictionary types`, () => validateBasicVisitor(new Dictionary(null as any, null as any))); @@ -166,6 +169,7 @@ describe('Visitor', () => { test(`visits TimeNanosecond types`, () => validateFeatureVisitor(new TimeNanosecond())); test(`visits Decimal types`, () => validateFeatureVisitor(new Decimal(2, 9, 128))); test(`visits List types`, () => validateFeatureVisitor(new List(null as any))); + test(`visits LargeList types`, () => validateFeatureVisitor(new LargeList(null as any))); test(`visits Struct types`, () => validateFeatureVisitor(new Struct([] as any[]))); test(`visits DenseUnion types`, () => validateFeatureVisitor(new DenseUnion([] as any[], [] as any[]))); test(`visits SparseUnion types`, () => validateFeatureVisitor(new SparseUnion([] as any[], [] as any[])));