Skip to content

Commit 176df8a

Browse files
committed
arena ToT: single-pass DistArray construction (drop buffering)
The arena-ToT construction paths pre-walked their cells twice: the two-pass make_nested_tile invoked its source once to size each cell and again to fill it, so callers with a single-pass source materialized the whole outer tile into a temporary vector first. ArenaToTBuilder makes a single ascending pass possible everywhere.

- make_nested_tile (arena_kernels.h): rebuilt on ArenaToTBuilder -- inner_range_fn and inner_fill_fn are now interleaved per cell instead of two full passes; no separate all-ranges walk. Cells stay zero-initialized so the no-op-fill (shape-only) path is unchanged.
- DistArray::make_arena_nested_tile: rebuilt on ArenaToTBuilder; cell_source is invoked exactly once per cell in ascending order.
- DistArray::init_elements (arena branch): drops the std::vector<R> that collected every inner tensor of the outer tile before building.
- DistArray::set(i, InIter) (arena branch): drops the std::vector that buffered the single-pass iterator; it now feeds straight through.
- ArrayImpl retile (arena-ToT branch): builds each target tile with ArenaToTBuilder, one source-cell lookup per cell instead of two.

Eliminates a peak-memory doubling during construction (the temporary held the whole tile's data alongside the arena slab).

foreach / make_array were also reviewed: both are tile-type-agnostic (the result tile is default-constructed and the user op populates it) -- no two-pass machinery there, nothing to relax.
1 parent 3372118 commit 176df8a

3 files changed

Lines changed: 69 additions & 77 deletions

File tree

src/TiledArray/array_impl.h

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,7 +1001,6 @@ std::shared_ptr<ArrayImpl<Tile, Policy>> make_with_new_trange(
10011001
// target cell at global outer element `e` takes its inner range and data
10021002
// from the source cell at `e` (elements outside the source range, e.g. a
10031003
// retile that grows the element range, yield null cells).
1004-
using inner_range_type = typename Tile::value_type::range_type;
10051004
const auto& source_elements = source_array.trange().elements_range();
10061005
std::map<std::size_t, Tile> src_tile_cache;
10071006
auto source_cell_at =
@@ -1024,21 +1023,20 @@ std::shared_ptr<ArrayImpl<Tile, Policy>> make_with_new_trange(
10241023
};
10251024
for (const auto target_ord : *target_array.pmap()) {
10261025
if (target_array.is_zero(target_ord)) continue;
1027-
Tile tile = make_nested_tile<Tile>(
1028-
target_trange.make_tile_range(target_ord),
1029-
[&](const auto& e) -> inner_range_type {
1030-
const auto* sc = source_cell_at(e);
1031-
return (sc && !sc->empty()) ? sc->range() : inner_range_type{};
1032-
},
1033-
[&](auto& cell, const auto& e) {
1034-
const auto* sc = source_cell_at(e);
1035-
if (sc && !sc->empty()) {
1036-
const auto* s = sc->data();
1037-
auto* d = cell.data();
1038-
for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p];
1039-
}
1040-
});
1041-
target_array.set(target_ord, std::move(tile));
1026+
// build each target tile in one pass: a single source lookup per cell
1027+
// sizes it and fills it together (no separate all-ranges walk).
1028+
const auto outer_range = target_trange.make_tile_range(target_ord);
1029+
ArenaToTBuilder<Tile> builder(outer_range);
1030+
const std::size_t n = outer_range.volume();
1031+
for (std::size_t o = 0; o < n; ++o) {
1032+
const auto* sc = source_cell_at(outer_range.idx(o));
1033+
if (!sc || sc->empty()) continue; // leaves a deliberately-null cell
1034+
auto& cell = builder.emplace(o, sc->range());
1035+
const auto* s = sc->data();
1036+
auto* d = cell.data();
1037+
for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p];
1038+
}
1039+
target_array.set(target_ord, std::move(builder).finish());
10421040
}
10431041
target_array.world().gop.fence();
10441042
} else {

src/TiledArray/dist_array.h

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -811,18 +811,17 @@ class DistArray : public madness::archive::ParallelSerializableObject {
811811
check_index(i);
812812
if constexpr (detail::is_tensor_of_tensor_v<value_type> &&
813813
is_arena_tensor_v<element_type>) {
814-
// arena ToT: the iterated inner tiles carry the ranges needed to size
815-
// the slab; buffer them (the iterator is single-pass) and build.
814+
// arena ToT: each iterated inner tile carries the range that sizes its
815+
// cell. make_arena_nested_tile pulls the source once per cell in
816+
// ascending order, so the single-pass iterator feeds straight through.
816817
const auto outer_range = pimpl_->trange().make_tile_range(i);
817818
using SrcTile = std::decay_t<decltype(*first)>;
818-
std::vector<SrcTile> buf;
819-
buf.reserve(outer_range.volume());
820-
for (std::size_t k = 0; k < outer_range.volume(); ++k, ++first)
821-
buf.emplace_back(*first);
822-
pimpl_->set(i, make_arena_nested_tile(
823-
outer_range, [&buf](std::size_t k) -> const SrcTile& {
824-
return buf[k];
825-
}));
819+
pimpl_->set(i, make_arena_nested_tile(outer_range,
820+
[&first](std::size_t) -> SrcTile {
821+
SrcTile t = *first;
822+
++first;
823+
return t;
824+
}));
826825
} else {
827826
pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first));
828827
}
@@ -1165,15 +1164,11 @@ class DistArray : public madness::archive::ParallelSerializableObject {
11651164
std::is_assignable_v<element_type&, const R&>,
11661165
"DistArray::init_elements: op must return a freestanding "
11671166
"tensor assignable to the inner tile type");
1168-
// pass 1: collect op's freestanding inner tensors; pass 2:
1169-
// make_arena_nested_tile sizes the slab and deep-copies them in
1170-
std::vector<R> collected;
1171-
collected.reserve(outer_range.volume());
1172-
for (std::size_t o = 0; o < outer_range.volume(); ++o)
1173-
collected.emplace_back(op(outer_range.idx(o)));
1167+
// single pass: make_arena_nested_tile pulls each cell once, in
1168+
// ascending order, so op runs once per cell with no buffer
11741169
return make_arena_nested_tile(
1175-
outer_range, [&collected](std::size_t k) -> const R& {
1176-
return collected[k];
1170+
outer_range, [&op, &outer_range](std::size_t k) -> R {
1171+
return op(outer_range.idx(k));
11771172
});
11781173
},
11791174
skip_set);
@@ -1927,29 +1922,31 @@ class DistArray : public madness::archive::ParallelSerializableObject {
19271922

19281923
/// Engine behind the arena-ToT paths of \c init_elements and \c set:
19291924
/// \p cell_source(ordinal) returns a freestanding tensor whose range sizes
1930-
/// inner cell \p ordinal and whose data fills it. The slab is allocated by
1931-
/// \c detail::make_nested_tile and each cell deep-copies its source.
1925+
/// inner cell \p ordinal and whose data fills it. Built in one pass with
1926+
/// \c detail::ArenaToTBuilder; \p cell_source is invoked exactly once per
1927+
/// cell, in ascending ordinal order, so a single-pass source (a generator
1928+
/// op or an input iterator) can be fed straight through without buffering.
19321929
/// \param[in] outer_range the outer tile's range
19331930
/// \param[in] cell_source maps a cell ordinal to its source tensor
19341931
template <typename CellSource>
19351932
static value_type make_arena_nested_tile(const TiledArray::Range& outer_range,
19361933
CellSource&& cell_source) {
19371934
using InnerRange = typename element_type::range_type;
1938-
return detail::make_nested_tile<value_type>(
1939-
outer_range,
1940-
[&](const auto& idx) -> InnerRange {
1941-
// the inner-cell range type is built from an extent list -- it is
1942-
// not constructible from a foreign range type
1943-
const auto& src = cell_source(outer_range.ordinal(idx)).range();
1944-
const auto& src_ext = src.extent();
1945-
std::vector<std::size_t> ext(src.rank());
1946-
for (std::size_t d = 0; d < src.rank(); ++d)
1947-
ext[d] = static_cast<std::size_t>(src_ext[d]);
1948-
return InnerRange(ext);
1949-
},
1950-
[&](auto& cell, const auto& idx) {
1951-
cell = cell_source(outer_range.ordinal(idx));
1952-
});
1935+
detail::ArenaToTBuilder<value_type> builder(outer_range);
1936+
const std::size_t n = outer_range.volume();
1937+
for (std::size_t k = 0; k < n; ++k) {
1938+
const auto& src = cell_source(k);
1939+
// the inner-cell range type is built from an extent list -- it is not
1940+
// constructible from a foreign range type
1941+
const auto& src_range = src.range();
1942+
const auto& src_ext = src_range.extent();
1943+
std::vector<std::size_t> ext(src_range.rank());
1944+
for (std::size_t d = 0; d < src_range.rank(); ++d)
1945+
ext[d] = static_cast<std::size_t>(src_ext[d]);
1946+
auto& cell = builder.emplace(k, InnerRange(ext));
1947+
if (!cell.empty()) cell = src; // deep copy into the bound arena cell
1948+
}
1949+
return std::move(builder).finish();
19531950
}
19541951

19551952
/// Code factorization of the actual assert for the other overloads

src/TiledArray/tensor/arena_kernels.h

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -148,32 +148,6 @@ struct nested_fill_noop {
148148
void operator()(Cell&, const Index&) const noexcept {}
149149
};
150150

151-
/// Build one ToT outer tile over `outer_range`, two-pass:
152-
/// pass 1: `inner_range_fn(outer_element_index)` -> inner `range_type`
153-
/// sizes every inner cell (zero-volume -> deliberately-null cell);
154-
/// pass 2: `inner_fill_fn(inner_cell&, outer_element_index)` fills each
155-
/// non-null cell. The default fill leaves storage zero-initialized.
156-
/// Dispatches internally on the inner-tile type (see `arena_outer_init`).
157-
template <typename OuterTensor, typename InnerRangeFn,
158-
typename InnerFillFn = nested_fill_noop>
159-
OuterTensor make_nested_tile(
160-
const typename OuterTensor::range_type& outer_range,
161-
InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) {
162-
// arena_outer_init keys ranges on the cell ordinal; user code keys on the
163-
// (global) outer element index -- translate via the outer range.
164-
auto cell_range_fn = [&](std::size_t ord) {
165-
return inner_range_fn(outer_range.idx(ord));
166-
};
167-
OuterTensor result =
168-
arena_outer_init<OuterTensor>(outer_range, 1, cell_range_fn);
169-
const std::size_t N = outer_range.volume();
170-
for (std::size_t ord = 0; ord < N; ++ord) {
171-
auto& cell = result.data()[ord];
172-
if (!cell.empty()) inner_fill_fn(cell, outer_range.idx(ord));
173-
}
174-
return result;
175-
}
176-
177151
/// One-pass incremental builder for an arena-backed ToT outer tile.
178152
///
179153
/// `arena_outer_init` pre-walks every inner range before
@@ -261,6 +235,29 @@ class ArenaToTBuilder {
261235
std::shared_ptr<inner_t[]> data_;
262236
};
263237

238+
/// Build one ToT outer tile over `outer_range` in a single pass: each inner
239+
/// cell is sized by `inner_range_fn(outer_element_index)` and immediately
240+
/// filled by `inner_fill_fn(inner_cell&, outer_element_index)` before moving
241+
/// to the next -- no separate all-ranges walk. A zero-volume inner range
242+
/// yields a deliberately-null cell, which `inner_fill_fn` is not invoked on.
243+
/// Cells are zero-initialized, so the default no-op fill still leaves zeroed
244+
/// storage. Backed by `ArenaToTBuilder`.
245+
template <typename OuterTensor, typename InnerRangeFn,
246+
typename InnerFillFn = nested_fill_noop>
247+
OuterTensor make_nested_tile(
248+
const typename OuterTensor::range_type& outer_range,
249+
InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) {
250+
ArenaToTBuilder<OuterTensor> builder(outer_range, /*batch_sz=*/1,
251+
/*zero_init=*/true);
252+
const std::size_t N = outer_range.volume();
253+
for (std::size_t ord = 0; ord < N; ++ord) {
254+
const auto idx = outer_range.idx(ord);
255+
auto& cell = builder.emplace(ord, inner_range_fn(idx));
256+
if (!cell.empty()) inner_fill_fn(cell, idx);
257+
}
258+
return std::move(builder).finish();
259+
}
260+
264261
/// Apply a unary fill op while preserving each source inner range.
265262
/// `fill_op(dst_data, src_data, n_elements)` writes the result cell.
266263
template <typename OuterTensor, typename SrcOuterTensor, typename FillOp>

0 commit comments

Comments
 (0)