-# load VL bytes (update r4 addr) but width=8 (dw=8)
-addi 6, 0, 0 # set 64-bits of r6=0
-sv.lbzu/pi/dw=8 *6, 1(4)
-# gather performs transpose (gets us to positional)
-gbbd 8,6
-# now bits are turned around, popcount and sum them
+# load VL bytes via the address in r4 (r4 is updated), element width 8 (dw=8)
+addi 6, 0, 0 # clear all 64 bits of r6 (r6 = 0) before the load writes into it
+sv.lbzu/pi/dw=8 *6, 1(4) # NOTE(review): presumably packs the loaded bytes into r6 - confirm
+gbbd 8,6 # gbbd performs the bit transpose: r8 = transpose of r6
+# the bits are now transposed; popcnt and sum them