From bbe84fe0d6e30e490b4cfed0606c3ecc859f67d2 Mon Sep 17 00:00:00 2001 From: lkcl Date: Sat, 15 Apr 2023 14:36:58 +0100 Subject: [PATCH] --- openpower/sv/rfc/ls009.mdwn | 330 ++++++++++++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) diff --git a/openpower/sv/rfc/ls009.mdwn b/openpower/sv/rfc/ls009.mdwn index b25d94ce7..5370bf1e5 100644 --- a/openpower/sv/rfc/ls009.mdwn +++ b/openpower/sv/rfc/ls009.mdwn @@ -1760,6 +1760,8 @@ if __name__ == '__main__': ## REMAP FFT pseudocode +The FFT REMAP is RADIX2 only. + ``` # a "yield" version of the REMAP algorithm, for FFT Tukey-Cooley schedules # original code for the FFT Tukey-Cooley schedule: @@ -1928,6 +1930,334 @@ if __name__ == '__main__': demo() ``` +## DCT REMAP pseudocode + +The DCT REMAP is RADIX2 only. Convolutions may be applied as usual +to create non-RADIX2 DCT. Combined with appropriate Twin-butterfly +instructions the algorithm below, written in python3, becomes part +of an in-place in-registers Vectorised DCT. + +``` +# DCT "REMAP" scheduler to create an in-place iterative DCT. +# +# Original fastdctlee.py by Nayuki: +# Copyright (c) 2020 Project Nayuki. (MIT License) +# https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms + +from copy import deepcopy +import math + +# bits of the integer 'val'. +def reverse_bits(val, width): + result = 0 + for _ in range(width): + result = (result << 1) | (val & 1) + val >>= 1 + return result + + +# iterative version of [recursively-applied] half-rev. +# turns out this is Gray-Encoding. 
+def halfrev2(vec, pre_rev=True): + res = [] + for i in range(len(vec)): + if pre_rev: + res.append(vec[i ^ (i>>1)]) + else: + ri = i + bl = i.bit_length() + for ji in range(1, bl): + ri ^= (i >> ji) + res.append(vec[ri]) + return res + + +def iterate_dct_inner_halfswap_loadstore(SVSHAPE): + # get indices to iterate over, in the required order + n = SVSHAPE.lims[0] + mode = SVSHAPE.lims[1] + stride = SVSHAPE.lims[2] + + # reference list for not needing to do data-swaps, just swap what + # *indices* are referenced (two levels of indirection at the moment) + # pre-reverse the data-swap list so that it *ends up* in the order 0123.. + ji = list(range(n)) + + levels = n.bit_length() - 1 + ri = [reverse_bits(i, levels) for i in range(n)] + + if SVSHAPE.mode == 0b01: # FFT, bitrev only + ji = [ji[ri[i]] for i in range(n)] + elif SVSHAPE.submode2 == 0b001: + ji = [ji[ri[i]] for i in range(n)] + ji = halfrev2(ji, True) + else: + ji = halfrev2(ji, False) + ji = [ji[ri[i]] for i in range(n)] + + # invert order if requested + if SVSHAPE.invxyz[0]: + ji.reverse() + + for i, jl in enumerate(ji): + y_end = jl == ji[-1] + yield jl * stride, (0b111 if y_end else 0b000) + +def iterate_dct_inner_costable_indices(SVSHAPE): + # get indices to iterate over, in the required order + n = SVSHAPE.lims[0] + mode = SVSHAPE.lims[1] + stride = SVSHAPE.lims[2] + # creating lists of indices to iterate over in each dimension + # has to be done dynamically, because it depends on the size + # first, the size-based loop (which can be done statically) + x_r = [] + size = 2 + while size <= n: + x_r.append(size) + size *= 2 + # invert order if requested + if SVSHAPE.invxyz[0]: + x_r.reverse() + + if len(x_r) == 0: + return + + # start an infinite (wrapping) loop + skip = 0 + z_end = 1 # doesn't exist in this, only 2 loops + k = 0 + while True: + for size in x_r: # loop over 3rd order dimension (size) + x_end = size == x_r[-1] + # y_r schedule depends on size + halfsize = size // 2 + y_r = [] + for i in 
range(0, n, size):
+ y_r.append(i)
+ # invert if requested
+ if SVSHAPE.invxyz[1]: y_r.reverse()
+ # two lists of half-range indices, e.g. j 0123, jr 7654
+ j = list(range(0, halfsize))
+ # invert if requested
+ if SVSHAPE.invxyz[2]: j.reverse()
+ # loop over 1st order dimension
+ for ci, jl in enumerate(j):
+ y_end = jl == j[-1]
+ # now depending on MODE return the index. inner butterfly
+ if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
+ result = k # offset into COS table
+ elif SVSHAPE.skip == 0b10: #
+ result = ci # coefficient helper
+ elif SVSHAPE.skip == 0b11: #
+ result = size # coefficient helper
+ loopends = (z_end |
+ ((y_end and z_end)<<1) |
+ ((y_end and x_end and z_end)<<2))
+
+ yield (result * stride) + SVSHAPE.offset, loopends
+ k += 1
+
+def iterate_dct_inner_butterfly_indices(SVSHAPE):
+ # get indices to iterate over, in the required order
+ n = SVSHAPE.lims[0]
+ mode = SVSHAPE.lims[1]
+ stride = SVSHAPE.lims[2]
+ # creating lists of indices to iterate over in each dimension
+ # has to be done dynamically, because it depends on the size
+ # first, the size-based loop (which can be done statically)
+ x_r = []
+ size = 2
+ while size <= n:
+ x_r.append(size)
+ size *= 2
+ # invert order if requested
+ if SVSHAPE.invxyz[0]:
+ x_r.reverse()
+
+ if len(x_r) == 0:
+ return
+
+ # reference (read/write) the in-place data in *reverse-bit-order*
+ ri = list(range(n))
+ if SVSHAPE.submode2 == 0b01:
+ levels = n.bit_length() - 1
+ ri = [ri[reverse_bits(i, levels)] for i in range(n)]
+
+ # reference list for not needing to do data-swaps, just swap what
+ # *indices* are referenced (two levels of indirection at the moment)
+ # pre-reverse the data-swap list so that it *ends up* in the order 0123.. 
+ ji = list(range(n)) + inplace_mode = True + if inplace_mode and SVSHAPE.submode2 == 0b01: + ji = halfrev2(ji, True) + if inplace_mode and SVSHAPE.submode2 == 0b11: + ji = halfrev2(ji, False) + + # start an infinite (wrapping) loop + while True: + k = 0 + k_start = 0 + for size in x_r: # loop over 3rd order dimension (size) + x_end = size == x_r[-1] + # y_r schedule depends on size + halfsize = size // 2 + y_r = [] + for i in range(0, n, size): + y_r.append(i) + # invert if requested + if SVSHAPE.invxyz[1]: y_r.reverse() + for i in y_r: # loop over 2nd order dimension + y_end = i == y_r[-1] + # two lists of half-range indices, e.g. j 0123, jr 7654 + j = list(range(i, i + halfsize)) + jr = list(range(i+halfsize, i + size)) + jr.reverse() + # invert if requested + if SVSHAPE.invxyz[2]: + j.reverse() + jr.reverse() + hz2 = halfsize // 2 # zero stops reversing 1-item lists + # loop over 1st order dimension + k = k_start + for ci, (jl, jh) in enumerate(zip(j, jr)): + z_end = jl == j[-1] + # now depending on MODE return the index. 
inner butterfly + if SVSHAPE.skip == 0b00: # in [0b00, 0b10]: + if SVSHAPE.submode2 == 0b11: # iDCT + result = ji[ri[jl]] # lower half + else: + result = ri[ji[jl]] # lower half + elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]: + if SVSHAPE.submode2 == 0b11: # iDCT + result = ji[ri[jl+halfsize]] # upper half + else: + result = ri[ji[jh]] # upper half + elif mode == 4: + # COS table pre-generated mode + if SVSHAPE.skip == 0b10: # + result = k # cos table offset + else: # mode 2 + # COS table generated on-demand ("Vertical-First") mode + if SVSHAPE.skip == 0b10: # + result = ci # coefficient helper + elif SVSHAPE.skip == 0b11: # + result = size # coefficient helper + loopends = (z_end | + ((y_end and z_end)<<1) | + ((y_end and x_end and z_end)<<2)) + + yield (result * stride) + SVSHAPE.offset, loopends + k += 1 + + # now in-place swap + if inplace_mode: + for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])): + jlh = jl+halfsize + tmp1, tmp2 = ji[jlh], ji[jh] + ji[jlh], ji[jh] = tmp2, tmp1 + + # new k_start point for cos tables( runs inside x_r loop NOT i loop) + k_start += halfsize + + +# python "yield" can be iterated. 
use this to make it clear how +# the indices are generated by using natural-looking nested loops +def iterate_dct_outer_butterfly_indices(SVSHAPE): + # get indices to iterate over, in the required order + n = SVSHAPE.lims[0] + mode = SVSHAPE.lims[1] + stride = SVSHAPE.lims[2] + # creating lists of indices to iterate over in each dimension + # has to be done dynamically, because it depends on the size + # first, the size-based loop (which can be done statically) + x_r = [] + size = n // 2 + while size >= 2: + x_r.append(size) + size //= 2 + # invert order if requested + if SVSHAPE.invxyz[0]: + x_r.reverse() + + if len(x_r) == 0: + return + + # I-DCT, reference (read/write) the in-place data in *reverse-bit-order* + ri = list(range(n)) + if SVSHAPE.submode2 in [0b11, 0b01]: + levels = n.bit_length() - 1 + ri = [ri[reverse_bits(i, levels)] for i in range(n)] + + # reference list for not needing to do data-swaps, just swap what + # *indices* are referenced (two levels of indirection at the moment) + # pre-reverse the data-swap list so that it *ends up* in the order 0123.. + ji = list(range(n)) + inplace_mode = False # need the space... 
SVSHAPE.skip in [0b10, 0b11] + if SVSHAPE.submode2 == 0b11: + ji = halfrev2(ji, False) + + # start an infinite (wrapping) loop + while True: + k = 0 + k_start = 0 + for size in x_r: # loop over 3rd order dimension (size) + halfsize = size//2 + x_end = size == x_r[-1] + y_r = list(range(0, halfsize)) + # invert if requested + if SVSHAPE.invxyz[1]: y_r.reverse() + for i in y_r: # loop over 2nd order dimension + y_end = i == y_r[-1] + # one list to create iterative-sum schedule + jr = list(range(i+halfsize, i+n-halfsize, size)) + # invert if requested + if SVSHAPE.invxyz[2]: jr.reverse() + hz2 = halfsize // 2 # zero stops reversing 1-item lists + k = k_start + for ci, jh in enumerate(jr): # loop over 1st order dimension + z_end = jh == jr[-1] + if mode == 4: + # COS table pre-generated mode + if SVSHAPE.skip == 0b00: # in [0b00, 0b10]: + if SVSHAPE.submode2 == 0b11: # iDCT + result = ji[ri[jh]] # upper half + else: + result = ri[ji[jh]] # lower half + elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]: + if SVSHAPE.submode2 == 0b11: # iDCT + result = ji[ri[jh+size]] # upper half + else: + result = ri[ji[jh+size]] # upper half + elif SVSHAPE.skip == 0b10: # + result = k # cos table offset + else: + # COS table generated on-demand ("Vertical-First") mode + if SVSHAPE.skip == 0b00: # in [0b00, 0b10]: + if SVSHAPE.submode2 == 0b11: # iDCT + result = ji[ri[jh]] # lower half + else: + result = ri[ji[jh]] # lower half + elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]: + if SVSHAPE.submode2 == 0b11: # iDCT + result = ji[ri[jh+size]] # upper half + else: + result = ri[ji[jh+size]] # upper half + elif SVSHAPE.skip == 0b10: # + result = ci # coefficient helper + elif SVSHAPE.skip == 0b11: # + result = size # coefficient helper + loopends = (z_end | + ((y_end and z_end)<<1) | + ((y_end and x_end and z_end)<<2)) + + yield (result * stride) + SVSHAPE.offset, loopends + k += 1 + + # new k_start point for cos tables( runs inside x_r loop NOT i loop) + k_start += halfsize + +``` [[!tag 
opf_rfc]] -- 2.30.2