@@ -234,16 +234,18 @@ def get_sme_transpose_interleave_2svlx2svl_fp32_intrin(cols, rows):
234234 the contents of sub-tile 1 and 2 are stored in opposite locations - see the diagram
235235 below.
236236
237- A: Accumulator tile: A_t:
238- 2SVL 2SVL 2SVL
239- +----------------+ +-----------------+ +-------------------+
240- | --0a-- --1a-- | | | | | | | | |
241- | --0b-- --1b-- | | 0 1 | | 0a 0b .. 2a 2b .. |
242- | ... ... | ld1w.horiz | | st1w.vert | | | | | |
243- 2SVL | --2a-- --3a-- | ====> 2SVL | | ====> 2SVL | | | | | |
244- | --2a-- --3b-- | | 2 3 | | 1a 1b .. 3a 3b .. |
245- | ... ... | | | | | | | | |
246- +----------------+ +-----------------+ +-------------------+
237+ ::
238+
239+ A: Accumulator tile: A_t:
240+ 2SVL 2SVL 2SVL
241+ +----------------+ +-----------------+ +-------------------+
242+ | --0a-- --1a-- | | | | | | | | |
243+ | --0b-- --1b-- | | 0 1 | | 0a 0b .. 2a 2b .. |
244+ | ... ... | ld1w.horiz | | st1w.vert | | | | | |
245+ 2SVL | --2a-- --3a-- | ====> 2SVL | | ====> 2SVL | | | | | |
246+ | --2b-- --3b-- | | 2 3 | | 1a 1b .. 3a 3b .. |
247+ | ... ... | | | | | | | | |
248+ +----------------+ +-----------------+ +-------------------+
247249
248250 Returns
249251 -------
@@ -521,24 +523,26 @@ def get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(M, K, in_dtype):
521523 Diagram showing outer-product performed on each of the accumulator sub-tiles
522524 for the fp32 datatype:
523525
524- SVL SVL
525- +----------------------------+
526- | l | h | K
527- K +----------------------------+
528- +---+ +----------------------------+
529- | | | 0: 1: |-+
530- | | | mopa(l, l) mopa(l, h) | |-+
531- l | | | | | |
532- | | | | | |
533- |---| | | | |
534- | | | 2: 3: | | |
535- h | | | mopa(h, l) mopa(h, h) | | |
536- | | | | | |
537- | | | | | |
538- +---+ +----------------------------+ | |
539- +----------------------------+ |
540- +---------------------------+
541- (accumulate K times)
526+ ::
527+
528+ SVL SVL
529+ +----------------------------+
530+ | l | h | K
531+ K +----------------------------+
532+ +---+ +----------------------------+
533+ | | | 0: 1: |-+
534+ | | | mopa(l, l) mopa(l, h) | |-+
535+ l | | | | | |
536+ | | | | | |
537+ |---| | | | |
538+ | | | 2: 3: | | |
539+ h | | | mopa(h, l) mopa(h, h) | | |
540+ | | | | | |
541+ | | | | | |
542+ +---+ +----------------------------+ | |
543+ +----------------------------+ |
544+ +---------------------------+
545+ (accumulate K times)
542546
543547 Pseudo code computing 2SVL x 2SVL GEMM for fp32 inputs:
544548
@@ -572,6 +576,7 @@ def get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(M, K, in_dtype):
572576 }
573577
574578 Notes:
579+
575580 - Recall that A has been transposed beforehand such that each column is now accessed
576581 by row.
577582 - 'sme.zero' resets the accumulator tile to contain all zeros.
0 commit comments