# Positional popcount loop over a byte buffer, at most 8 bytes per pass.
# Syntax/target: GNU-as style, bare register numbers; appears to be
# Power ISA with Libre-SOC SVP64 ("sv.") extensions -- confirm toolchain.
#
# Register use visible in this fragment:
#   r3        in:  byte count; copied to CTR, then overwritten with VL
#   r4        i/o: load address, advanced by the updating byte load
#   r6        tmp: loaded bytes packed into a single 64-bit register
#   r8        tmp: bit-transposed copy of r6
#   r16..     acc: running per-element sums; only added to here --
#                  presumably zeroed by the caller, TODO confirm
#   r24..     tmp: per-element popcount results
#   CTR, VL/MVL state and (via setvl Rc=1) a CR field are modified.
#
# NOTE(review): one instruction per line is required. '#' comments run to
# end of line, so fusing these lines would leave only the mtspr assembled.

    mtspr 9, 3                  # move r3 to CTR

# ---- loop top: target of the sv.bc at the bottom ----
    setvl 3,0,8,0,1,1           # MVL=8, VL=r3=MIN(MVL,CTR)

# load VL bytes (update r4 addr) at width=8 (dw=8)
    addi 6, 0, 0                # set all 64-bits of r6=0, so bytes not
                                # loaded when VL<8 contribute nothing
    sv.lbzu/pi/dw=8 *6, 1(4)
    gbbd 8,6                    # gbbd performs the transpose

# now bits are turned around, popcnt and sum them
    setvl 0,0,8,0,1,1           # set MVL=VL=8
    sv.popcntd/sw=8 *24,*8      # do (transposed) popcnt
    sv.add *16,*16,*24          # accumulate in results

# branch back if CTR non-zero. works even when VL=8
# NOTE(review): -0x28 is a hand-computed displacement back to the first
# setvl; it assumes 8 bytes per sv.-prefixed instruction and 4 bytes for
# the others (4+4+8+4+4+8+8 = 0x28). Recompute if anything is inserted
# or removed between the first setvl and this branch.
    sv.bc/all 16, *0, -0x28     # reduces CTR by VL