Skip to content

Commit f5b366e

Browse files
PECO-1054 Expose Arrow batches to users, part two (#163)
Updated the FetchableItems interface to return an instance of OutputType instead of a slice of OutputType. Created interfaces SparkArrowBatch and SparkArrowRecord and implementations of each. This also changed the 1:1 ratio of batch instances to arrow records: a SparkArrowBatch can now contain multiple arrow records. Created the BatchIterator interface and implementation and switched arrowRowScanner to use BatchIterator instead of BatchLoader. Created the RowValues interface and implementation as a container for the currently loaded values for a set of rows. Updated the behaviour of fetchable items cloudURL and localBatch to de-serialize the arrow records as part of fetching, rather than carry around the raw bytes for later de-serialization. Also eliminated the cloud fetch code that was de-serializing the arrow batch then serializing each record individually to create one batch instance per record. Removed chunkedByteReader and replaced it with io.MultiReader. Normalized use of row number so that there is no need to track the index of the row in the current batch.
2 parents f7c0286 + cfceb51 commit f5b366e

15 files changed

Lines changed: 719 additions & 585 deletions

internal/fetcher/fetcher.go

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ import (
99
)
1010

1111
type FetchableItems[OutputType any] interface {
12-
Fetch(ctx context.Context) ([]OutputType, error)
12+
Fetch(ctx context.Context) (OutputType, error)
1313
}
1414

1515
type Fetcher[OutputType any] interface {
@@ -151,10 +151,7 @@ func work[I FetchableItems[O], O any](f *concurrentFetcher[I, O], workerIndex in
151151
return
152152
} else {
153153
f.logger().Debug().Msgf("concurrent fetcher worker %d item loaded", workerIndex)
154-
for i := range result {
155-
r := result[i]
156-
f.outChan <- r
157-
}
154+
f.outChan <- result
158155
}
159156
} else {
160157
f.logger().Debug().Msgf("concurrent fetcher ending %d", workerIndex)

internal/fetcher/fetcher_test.go

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,13 +30,13 @@ func (m *mockFetchableItem) Fetch(ctx context.Context) ([]*mockOutput, error) {
3030
return outputs, nil
3131
}
3232

33-
var _ FetchableItems[*mockOutput] = (*mockFetchableItem)(nil)
33+
var _ FetchableItems[[]*mockOutput] = (*mockFetchableItem)(nil)
3434

3535
func TestConcurrentFetcher(t *testing.T) {
3636
t.Run("Comprehensively tests the concurrent fetcher", func(t *testing.T) {
3737
ctx := context.Background()
3838

39-
inputChan := make(chan FetchableItems[*mockOutput], 10)
39+
inputChan := make(chan FetchableItems[[]*mockOutput], 10)
4040
for i := 0; i < 10; i++ {
4141
item := mockFetchableItem{item: i, wait: 1 * time.Second}
4242
inputChan <- &item
@@ -57,7 +57,7 @@ func TestConcurrentFetcher(t *testing.T) {
5757

5858
var results []*mockOutput
5959
for result := range outChan {
60-
results = append(results, result)
60+
results = append(results, result...)
6161
}
6262

6363
// Check if the fetcher returned the expected results
@@ -87,7 +87,7 @@ func TestConcurrentFetcher(t *testing.T) {
8787
defer cancel()
8888

8989
// Create an input channel
90-
inputChan := make(chan FetchableItems[*mockOutput], 3)
90+
inputChan := make(chan FetchableItems[[]*mockOutput], 3)
9191
for i := 0; i < 3; i++ {
9292
item := mockFetchableItem{item: i, wait: 1 * time.Second}
9393
inputChan <- &item

0 commit comments

Comments
 (0)