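Extended allowable cases for the miniexpr matmul path (operands no longer need the same chunks, blocks and shape as the result)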

lshaw8317 · lshaw8317 · commit 637ce8dfc940 · 2026-03-22T15:22:36.000+01:00
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
@@ -2182,12 +2182,13 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
     cdef int blocknitems[2]
     cdef int startA, startB, expected_blocknitems
     cdef blosc2_context* dctx
-    cdef int i, j, block_i, block_j, ncols, block_ncols, Bblock_ncols, Bncols
+    cdef int i, j, block_i, block_j, chunk_i, chunk_j, ncols, block_ncols, Bblock_ncols, Bncols, Ablock_ncols, Ancols
     cdef int nchunkA = 0, nchunkB = 0, nblockA = 0, nblockB = 0, offsetA = 0, offsetB = 0, offset = 0
     out_arr = udata.array
     cdef int ndim = out_arr.ndim
     cdef int nchunk_ = nchunk
     cdef int coord, batch, batch_, batches = 1
+    cdef int out_chunk_nrows, out_chunk_ncols, out_block_nrows, out_block_ncols
 
     # batches = sum(strides[i]*elcoords[i])
     for i in range(ndim - 2):
@@ -2201,12 +2202,10 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
         nchunkB += coord * udata.chunks_strides[2][i]
 
     ncols = udata.chunks_strides[0][ndim - 2]
+    Ancols = udata.chunks_strides[1][ndim - 2]
     Bncols = udata.chunks_strides[2][ndim - 2]
-
-    i = nchunk_ // ncols # ncols * i + j
-    j = nchunk_ % ncols
-    chunk_startA = nchunkA + i * ncols
-    chunk_startB = nchunkB + j
+    out_chunk_nrows = out_arr.chunkshape[ndim - 2]
+    out_chunk_ncols = out_arr.chunkshape[ndim - 1]
 
     # nblock = sum(strides[i]*blockcoords[i])
     cdef int nblock_ = nblock
@@ -2217,18 +2216,14 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
         nblockB += coord * udata.blocks_strides[2][i]
 
     block_ncols = udata.blocks_strides[0][ndim - 2]
+    Ablock_ncols = udata.blocks_strides[1][ndim - 2]
     Bblock_ncols = udata.blocks_strides[2][ndim - 2]
-
-    block_i = nblock_ // block_ncols
-    block_j = nblock_ % block_ncols
-    block_startA = nblockA + block_i * block_ncols
-    block_startB = nblockB + block_j
+    out_block_nrows = out_arr.blockshape[ndim - 2]
+    out_block_ncols = out_arr.blockshape[ndim - 1]
 
     dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
 
     first_run = True
-    nchunkA = chunk_startA
-    nchunkB = chunk_startB
     while True: # chunk loop
         for i in range(2):
             chunk_idx = nchunkA if i == 0 else nchunkB
@@ -2244,16 +2239,28 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
                 if i == 0:
                     q = ndarr.blockshape[ndim - 1]
                     p = ndarr.blockshape[ndim - 2]
+                    # nchunk_ = chunks_in_row * chunk_row + chunk_col
+                    # convert the output chunk's row index into an element row offset (chunk_i)
+                    chunk_i = nchunk_ // ncols * out_chunk_nrows
+                    chunk_startA = nchunkA + chunk_i // ndarr.chunkshape[ndim - 2] * Ancols
+                    nchunkA = chunk_startA
+                    # nblock_ = blocks_in_chunkrow * block_row + block_col
+                    # convert the output block's row index into an element row offset (block_i)
+                    block_i = nblock_ // block_ncols * out_block_nrows
+                    block_startA = nblockA + block_i // p * Ablock_ncols
                 else: # i = 1
                     r = ndarr.blockshape[ndim - 1]
+                    # convert the output chunk's column index into an element column offset (chunk_j)
+                    chunk_j = nchunk_ % ncols * out_chunk_ncols
+                    chunk_startB = nchunkB + chunk_j // ndarr.chunkshape[ndim - 1]
+                    nchunkB = chunk_startB
+                    # convert the output block's column index into an element column offset (block_j)
+                    block_j = nblock_ % block_ncols * out_block_ncols
+                    block_startB = nblockB + block_j // r
                 input_buffers[i] = malloc(block_nbytes[i])
             if input_buffers[i] == NULL:
                 raise MemoryError("miniexpr: cannot allocate input block buffer")
             blocknitems[i] = block_nbytes[i] // <int> ndarr.sc.typesize
-            if i == 0:
-                expected_blocknitems = blocknitems[i]
-            elif blocknitems[i] != expected_blocknitems:
-                raise ValueError("miniexpr: inconsistent block element counts across inputs")
 
         first_run = False
         nblockA = block_startA
@@ -2297,11 +2304,11 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
                 batch += 1
             nblockA += 1
             nblockB += Bblock_ncols
-            if (nblockA % block_ncols == 0):
+            if (nblockA % Ablock_ncols == 0):
                 break
         nchunkA += 1
         nchunkB += Bncols
-        if (nchunkA % ncols == 0):
+        if (nchunkA % Ancols == 0):
             break
 
 
@@ -3280,7 +3287,7 @@ cdef class NDArray:
             cstrides = bstrides = estrides = 1
             for idx in range(2, self.array.ndim + 1):
                 i = inp.ndim - idx
-                if inp.shape[i + 1] == 1 or i < 0:
+                if (inp.shape[i + 1] == 1 and i < inp.ndim - 3) or i < 0:
                     udata.chunks_strides[j][i] = 0
                     udata.blocks_strides[j][i] = 0
                     udata.el_strides[j][i] = 0
diff --git a/src/blosc2/linalg.py b/src/blosc2/linalg.py
@@ -125,14 +125,13 @@ def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArra
             if any(op.dtype != ops[0].dtype for op in ops):  # TODO: Remove this condition
                 use_miniexpr = False
 
+            # TODO: This could be relaxed further to load inputs according to the result blockshape alone, but that is difficult.
             # Just force same chunk/block shapes
-            same_chunks = all(op.chunks == result.chunks for op in (x1, x2))
-            same_blocks = all(op.blocks == result.blocks for op in (x1, x2))
-            same_shape = all(op.shape == result.shape for op in (x1, x2))
-
-            use_miniexpr &= same_blocks & same_chunks & same_shape
+            # same_chunks = all(op.chunks == result.chunks for op in (x1, x2))
+            # same_blocks = all(op.blocks == result.blocks for op in (x1, x2))
+            # same_shape = all(op.shape == result.shape for op in (x1, x2))
 
-            # TODO: We can relax this to even just load according to result blockshape, but that's difficult.
+            # use_miniexpr &= same_blocks & same_chunks & same_shape
             # Two easier cases are presented below
             # Case 1: Might want to restrict loading across chunk boundaries, in which case would require:
             # x1.chunks[-2] % result.blocks[-2] == 0
@@ -146,18 +145,18 @@ def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArra
             # (M, K) x (K, N) = (M, N)
             # so can load block-by-block for inputs and calculate block of output
             # Also need to avoid loading across chunk boundaries
-            # chunks_aligned = x1.chunks[-2] % x1.blocks[-2] == 0
-            # chunks_aligned &= x2.chunks[-1] % x2.blocks[-1] == 0
-            # chunks_aligned &= x2.chunks[-2] % x1.blocks[-1] == 0
-            # same_blocks = x2.blocks[-2] == x1.blocks[-1]
-            # same_blocks &= x2.blocks[-1] == result.blocks[-1]
-            # same_blocks &= result.blocks[-2] == x1.blocks[-2]
-            # try:
-            #     result_blocks = np.broadcast_shapes(x1.blocks, x2.blocks)
-            #     if not (same_blocks and chunks_aligned and result_blocks[:-2] == result.blocks[:-2]):
-            #         use_miniexpr = False
-            # except ValueError:
-            #     use_miniexpr = False
+            chunks_aligned = x1.chunks[-2] % x1.blocks[-2] == 0
+            chunks_aligned &= x2.chunks[-1] % x2.blocks[-1] == 0
+            chunks_aligned &= x2.chunks[-2] % x1.blocks[-1] == 0
+            same_blocks = x2.blocks[-2] == x1.blocks[-1]
+            same_blocks &= x2.blocks[-1] == result.blocks[-1]
+            same_blocks &= result.blocks[-2] == x1.blocks[-2]
+            try:
+                result_blocks = np.broadcast_shapes(x1.blocks, x2.blocks)
+                if not (same_blocks and chunks_aligned and result_blocks[:-2] == result.blocks[:-2]):
+                    use_miniexpr = False
+            except ValueError:
+                use_miniexpr = False
 
             use_miniexpr &= x1.dtype.kind in ("i", "f")
             use_miniexpr &= x2.dtype.kind in ("i", "f")