conferences/openpower2021/openpower_2021.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5
   6 \graphicspath{{./}}
   7
   8 \title{The Libre-SOC Hybrid 3D CPU}
   9 \author{Luke Kenneth Casson Leighton}
  10
  11
  12 \begin{document}
  13
  14 \frame{
  15    \begin{center}
  16     \huge{The Libre-SOC Hybrid 3D CPU}\\
  17     \vspace{32pt}
  18     \Large{Draft SVP64 in-place Matrix Multiply}\\
  19     \Large{and FFT / DCT for the Power ISA}\\
  20     \vspace{24pt}
  21     \Large{OpenPOWER Summit 2021}\\
  22     \vspace{16pt}
  23     \large{Sponsored by NLnet's PET Programme}\\
  24     \vspace{6pt}
  25     \large{28th Oct 2021}
  26   \end{center}
  27 }
  28
  29
  30 \frame{\frametitle{}
  31
  32 \vspace{15pt}
  33
  34  \begin{itemize}
  35    \item \vspace{15pt}
  36    \item \vspace{15pt}
  37    \item \vspace{15pt}
  38   \end{itemize}
  39 }
  40
  41 \frame{\frametitle{Overview of Libre-SOC goals}
  42
  43 \vspace{15pt}
  44
  45  \begin{itemize}
  46    \item To create power-efficient mass-volume products\vspace{15pt}
  47    \item To leverage the OpenPOWER ecosystem to do so\vspace{15pt}
  48    \item To be entirely transparent for Security reasons\vspace{15pt}
  49    \item To empower businesses to bring Secure transparent\\
  50          mass-volume products to market\vspace{15pt}
  51   \end{itemize}
  52 }
  53
  54 \frame{\frametitle{Overview of SVP64 goals}
  55
  56 \vspace{15pt}
  57
  58  \begin{itemize}
  59    \item High performance and high performance/watt\vspace{15pt}
  60    \item Reduced code density (reduced I-Cache usage)\\
  61          https://arxiv.org/abs/2002.10143 - 3.5x power reduction\vspace{8pt}
  62    \item Remain accessible for assembler writers and compilers alike\vspace{15pt}
  63    \item Introduce true Vectorisation to the Power ISA\\
  64          (VSX is Packed SIMD)\vspace{8pt}
  65    \item Be adopted via the external OPF ISA WG RFC process\\
  66          (not: be a non-official custom extension. proprietary\\
  67          custom extensions conflict with mass-volume adoption)\vspace{15pt}
  68   \end{itemize}
  69 }
  70
  71
  72
  73 \begin{frame}[fragile]
  74 \frametitle{Reminder of Simple-V}
  75
  76 \begin{semiverbatim}
  77 https://libre-soc.org/openpower/sv/overview/
  78 Greatly simplified (like x86 "REP" instruction):
  79
  80   for (i = 0; i < VL; i++)
  81        GPR[RT+i] <= GPR[RA+i] + GPR[RB+i];
  82
  83 function op\_add(RT, RA, RB, predr) # add not VADD!
  84   int i, id=0, irs1=0, irs2=0;
  85   for (i = 0; i < VL; i++)
  86     if (GPR[predr] & 1<<i) # predication
  87        GPR[RT+id] <= GPR[RA+irs1] + GPR[RB+irs2];
  88     if (reg\_is\_vectorised[RT])  \{ id += 1; \}
  89     if (reg\_is\_vectorised[RA])  \{ irs1 += 1; \}
  90     if (reg\_is\_vectorised[RB])  \{ irs2 += 1; \}
  91 \end{semiverbatim}
  92
  93 \end{frame}
  94
  95
  96 \begin{frame}[fragile]
  97 \frametitle{SVP64 REMAP system}
  98
  99 \begin{semiverbatim}
 100 Register offsets are "REMAP"ed through a Hardware FSM
 101 https://libre-soc.org/openpower/sv/remap/
 102 remarkably similar to ZOLC
 103 https://www.researchgate.net/publication/224647569
 104
 105 function op\_add(RT, RA, rs2, predr) # add not VADD!
 106   int i, id=0, irs1=0, irs2=0;
 107   for (i = 0; i < VL; i++)
 108     if (GPR[predr] & 1<<i) # predication
 109        GPR[RT+REMAP(id)] <= GPR[RA+REMAP(irs1)] +
 110                            GPR[rs2+REMAP(irs2)];
 111     if (reg\_is\_vectorised[RT])  \{ id += 1; \}
 112     if (reg\_is\_vectorised[RA])  \{ irs1 += 1; \}
 113     if (reg\_is\_vectorised[s2])  \{ irs2 += 1; \}
 114 \end{semiverbatim}
 115
 116 \end{frame}
 117
 118 \begin{frame}[fragile]
 119 \frametitle{Matrix Multiply: Basics}
 120
 121 \begin{semiverbatim}
 122 (a00 a01 a02  x (b00 b01   =
 123  a10 a11 a12)    b10 b11
 124                  b20 b21)
 125
 126 (a00*b00 + a01*b10 + a02*b20 a00*b01 + a01*b11 + a02*b21
 127  a10*b00 + a11*b10 + a12*b20 a10*b01 + a11*b11 + a12*b21)
 128
 129  (b00 b01    x (a00 a01 a02  =
 130   b10 b11       a10 a11 a12)
 131   b20 b21)
 132
 133 (b00*a00 + b01*a10  b00*a01 + b01*a11  b00*a02 + b01*a12
 134  b10*a00 + b11*a10  b10*a01 + b11*a11  b10*a02 + b11*a12
 135  b20*a00 + b21*a10  b20*a01 + b21*a11  b20*a02 + b21*a12)
 136
 137 \end{semiverbatim}
 138
 139 \end{frame}
 140
 141
 142 \begin{frame}[fragile]
 143 \frametitle{Matrix Multiply: naive, with python for-loops}
 144
 145 \begin{semiverbatim}
 146 result = [] # final result
 147 for i in range(len(A)):
 148
 149   row = [] # the new row in new matrix
 150   for j in range(len(B[0])):
 151
 152     product = 0 # the new element in the new row
 153     for v in range(len(A[i])):
 154         product += A[i][v] * B[v][j]
 155     row.append(product) # add sum of product to new row
 156
 157   result.append(row) # add new row into final result
 158 \end{semiverbatim}
 159
 160 \end{frame}
 161
 162 \begin{frame}[fragile]
 163 \frametitle{Matrix Multiply: suitable for Hardware scheduling}
 164
 165 \begin{semiverbatim}
 166 Unsuitable: creates massive Read-After-Write chains
 167
 168 for i in range(len(A)):
 169   for j in range(len(B[0])):
 170     for v in range(len(A[i])):
 171       product[i][j] += A[i][v] * B[v][j]
 172
 173 Suitable: can be parallelised / pipelined. RaW avoided
 174
 175 for i in range(len(A)):
 176   for v in range(len(A[i])):    # swapped
 177     for j in range(len(B[0])):  # with this
 178       product[i][j] += A[i][v] * B[v][j]
 179
 180 \end{semiverbatim}
 181
 182 \end{frame}
 183
 184
 185 \frame{\frametitle{Matrix Multiply: Generalise but Specialise}
 186
 187 \vspace{8pt}
 188
 189  \begin{itemize}
 190    \item Why not make a general-purpose nested "Loop" system?\\
 191         - Other uses (algorithms) beyond Matrix Multiplication\\
 192         - Allow any arbitrary-sized loops\\
 193         - Allow any permutation of nesting\\
 194         - Allow reversing per-dimension\vspace{5pt}
 195    \item Specialise by making Matrix Multiply "setup" quick/easy\\
 196         - two 32-bit instructions to set up A, B, C sizes\\
 197         - one 64-bit SVP64 FMAC instruction.\\
 198         - Nothing else needed.  Saves on I-Cache\vspace{5pt}
 199    \item Hardware turns out to be near-identical to ZOLC\\
 200         https://opencores.org/projects/hwlu\\
 201         https://libre-soc.org/openpower/sv/remap/\vspace{15pt}
 202   \end{itemize}
 203 }
 204
 205 \begin{frame}[fragile]
 206 \frametitle{Matrix Multiply: unit test / example}
 207
 208 \begin{semiverbatim}
 209   94 def test_sv_remap2(self):
 210   95     lst = ["svshape 5, 4, 3, 0, 0",
 211   96            "svremap 0b11111, 1, 2, 3, 0, 0, 0, 0",
 212   97            "sv.fmadds 0.v, 8.v, 16.v, 0.v"
 213   98           ]
 214   99             REMAP fmadds FRT, FRA, FRC, FRB
 215
 216 svshape 5, 4, 3, 0, 0 => A: 3x5 B: 3x4
 217                       => C: 3x3
 218 svremap (enable) (F)RS, (F)RT, (F)RA, (F)RB, (F)RC
 219 sv.fmadds: uses fp0 as accumulator
 220       product[i][j] += A[i][v] * B[v][j]
 221 \end{semiverbatim}
 222
 223 \end{frame}
 224
 225 \frame{\frametitle{Matrix Multiply: Ehm that's all Folks}
 226
 227 \vspace{6pt}
 228
 229  \begin{itemize}
 230    \item Really is that straightforward: no actual Vector ops\\
 231         - Does not dictate or limit micro-architectural detail\\
 232         - Issues Scalar FMACs into existing back-end hardware\\
 233         - Can use any 4-operand instruction (GF, INT, Bitmanip)\\
 234         - No Power-2 limits. Any operand width (8/16/32/64)\vspace{8pt}
 235    \item Limited to 127 scalar ops and in-place registers.  Future?\\
 236         - https://arxiv.org/abs/2002.10143 CISC-like load-and-inc\\
 237         - Auto-load/store (tagged) registers, keeps RISC ISA\\
 238         - Extend to memory-based arbitrary NxN matrix sizes\\
 239         - Still power-efficient: no I-cache usage during FMAC issue\vspace{8pt}
 240    \item Future can be investigated as part of EUR 22.6m EU Grant\\
 241         https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/\vspace{15pt}
 242   \end{itemize}
 243 }
 244
 245
 246 \frame{\frametitle{DCT / FFT / DFT / NTT: what if we could REMAP?}
 247
 248 \vspace{6pt}
 249
 250  \begin{itemize}
 251    \item Can we create a REMAP Schedule for FFT (etc)? YES\\
 252         - More complicated than Matrix Schedules but same principle\\
 253         - Again: issues Scalar instructions into back-end micro-arch\\
 254         - Requires 5-operand (3-in, 2-out) new Scalar Instructions\\
 255         - Any operand width (8/16/32/64)\vspace{8pt}
 256    \item Limited to in-place registers and Power-of-Two.  Future?\\
 257         - Again: CISC-like auto-load/store-and-increment\\
 258         - https://arxiv.org/abs/2002.10143\\
 259         - Again: still power-efficient (no I-Cache usage in loops)\vspace{8pt}
 260    \item Again: can be investigated as part of EUR 22.6m EU Grant\\
 261         https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/\vspace{15pt}
 262   \end{itemize}
 263 }
 264
 265
 266 \frame{\frametitle{Discrete Cosine Transform (DCT): Basics}
 267
 268  \begin{itemize}
 269    \item Standard DCT Schedule (messy, impossible for SIMD)
 270    \item Output is in bit-reversed order\\
 271          0b000 = 0b000 (in: 0   out: 0)\\
 272          0b001 = 0b100 (in: 1   out: 4) ...\\
 273          0b110 = 0b011 (in: 6   out: 3)\\
 274          0b111 = 0b111 (in: 7   out: 7)
 275   \end{itemize}
 276
 277 \begin{center}
 278 \includegraphics[width=0.75\textwidth]{plonka_dct1.png}
 279 \end{center}
 280
 281 }
 282
 283 \frame{\frametitle{Fast Fourier Transform (FFT/DFT): Butterfly Basics}
 284
 285  \begin{itemize}
 286    \item Standard Butterfly Schedule (again: messy, but less so)
 287    \item Output, again, is in bit-reversed order
 288   \end{itemize}
 289
 290 \begin{center}
 291 \includegraphics[width=0.70\textwidth]{fft_butterfly.png}
 292 \end{center}
 293
 294 }
 295
 296 \frame{\frametitle{FFT/DFT: 3-in, 2-out butterfly}
 297
 298  \begin{itemize}
 299    \item One multiply (by coefficient), one add, one subtract
 300    \item inputs: X[0] X[1] C(oeff) outputs: X[0] X[1]
 301   \end{itemize}
 302
 303 \begin{center}
 304 \includegraphics[width=0.90\textwidth]{2-point-dft.png}
 305 \end{center}
 306
 307 }
 308
 309
 310 \begin{frame}[fragile]
 311 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (1)}
 312
 313 \begin{semiverbatim}
 314 coef = (2 if inverse else -2) * cmath.pi / n
 315 exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
 316 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
 317 size = 2
 318 while size <= n:
 319     halfsize, tablestep = size // 2, n // size
 320     for i in range(0, n, size):
 321         k = 0
 322         for j in range(i, i + halfsize):
 323             temp = vec[j + halfsize] * exptable[k]
 324             vec[j + halfsize] = vec[j] - temp
 325             vec[j] += temp
 326             k += tablestep
 327     size *= 2
 328 \end{semiverbatim}
 329
 330 \end{frame}
 331
 332
 333 \begin{frame}[fragile]
 334 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (2)}
 335
 336 \begin{semiverbatim}
 337 coef = (2 if inverse else -2) * cmath.pi / n
 338 exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
 339 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
 340 size = 2
 341 while size <= n:
 342     hs, tablestep = size // 2, n // size
 343     for i in range(0, n, size):
 344         k = 0
 345         for j in range(i, i+hs):
 346             # Twin-Butterfly 3-in 2-out: one instruction
 347             C = exptable[k]
 348             vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
 349             k += tablestep
 350     size *= 2
 351 \end{semiverbatim}
 352
 353 \end{frame}
 354
 355 \begin{frame}[fragile]
 356 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (3)}
 357
 358  \begin{itemize}
 359     \item What if the Triple Loop could be done with REMAP?
 360     \item Register Offsets j, j+hs, k created automatically?
 361     \item Only one actual inner loop instruction (Twin-butterfly)
 362     \item 3-in (X0/X1/C) 2-out (X0/X1) allows for in-place FFT
 363     \item Hardware FSM (like ZOLC) creates offset triplet\\
 364         - Technically not that hard to implement (for Radix-2)\\
 365         - Exact same principle as Triple-loop for Matrices
 366   \end{itemize}
 367
 368 \begin{semiverbatim}
 369 for j,k,hs in REMAP_TRIPLE_LOOP_GENERATOR():
 370             # Twin-Butterfly 3-in 2-out: one instruction
 371             C = exptable[k]
 372             vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
 373 \end{semiverbatim}
 374
 375 \end{frame}
 376
 377 \frame{\frametitle{DCT: pre-arrange (pre-load) data}
 378
 379  \begin{itemize}
 380    \item Arrange input data such that output falls into place
 381    \item (another) Twin 3-in 2-out Mul-Add in-place instruction
 382   \end{itemize}
 383
 384 \begin{center}
 385 \includegraphics[width=0.70\textwidth]{dct_butterfly.png}
 386 \end{center}
 387
 388 }
 389
 390
 391 \frame{\frametitle{FFT (Complex numbers) and DCT coefficients?}
 392
 393  \begin{itemize}
 394    \item Problem (DCT): DCT Cosine Coefficients change (cos + 0.5)
 395          depending on the layer.  Cannot do as single instruction
 396    \item Problem (FFT): Complex number butterfly multiplication involves
 397          4 multiplies.  Cannot do in-place as single instruction\vspace{12pt}
 398    \item Solution: "Vertical-First" style Vectors\vspace{12pt}
 399    \item Understanding of SVP64 "Vertical-First"\\
 400          30min explanatory video https://youtube.com/watch?v=fn2KJvWyBKg
 401    \item Basically involves stepping "vertically" through instructions
 402          then moving ("stepping") to the next offset (next REMAP)
 403    \item Horizontal-first: run through the entire REMAP schedule on a single
 404          instruction before moving on to the next instruction
 405
 406   \end{itemize}
 407 }
 408
 409 \frame{\frametitle{TODO rewrite Summary}
 410
 411  \begin{itemize}
 412    \item Goal is to create a mass-volume low-power embedded SoC suitable
 413          for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
 414    \item No DRM. 'Trustable' (by the users, not by Media Moguls) design
 415          ethos as a \textit{business} objective: requires full transparency
 416          as well as Formal Correctness Proofs
 417    \item Collaboration with OpenPOWER Foundation and Members absolutely
 418          essential. No short-cuts.  Standards to be developed and ratified
 419          so that everyone benefits.
 420    \item Working on the back of huge stability of POWER ecosystem
 421    \item Combination of which is that Board Support Package is 100\%
 422          upstream, app and product development by customer is hugely
 423          simplified and much more attractive
 424
 425   \end{itemize}
 426 }
 427
 428
 429 \frame{
 430   \begin{center}
 431     {\Huge The end\vspace{15pt}\\
 432                    Thank you\vspace{15pt}\\
 433                    Questions?\vspace{15pt}
 434         }
 435   \end{center}
 436
 437   \begin{itemize}
 438         \item Discussion: Libre-SOC-dev mailing list
 439         \item Freenode IRC \#libre-soc
 440         \item http://libre-soc.org/
 441         \item http://nlnet.nl/PET
 442   \end{itemize}
 443 }
 444
 445
 446 \end{document}