/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief llvm pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

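// Scalar scatter routine implemented elsewhere in the SWR rasterizer; it is registered with the
// JIT in the LowerX86 constructor below so that calls emitted for non-AVX512 targets can resolve
// the symbol at runtime.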
extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);

namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
} // namespace llvm

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

    struct X86Intrinsic
    {
        IntrinsicID intrin[NUM_WIDTHS];
        EmuFunc     emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
    // previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, IntrinsicID> intrinsicMap = {
        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
    };

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin);

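    // DOUBLE is a sentinel intrinsic ID meaning "no native intrinsic at this width; double pump
    // the next smaller width instead" (e.g. a 512-wide VROUND on AVX/AVX2 is lowered to two calls
    // of x86_avx_round_ps_256 via DOUBLE_EMU). In the table below each meta intrinsic maps, per
    // target architecture, to a {256-wide, 512-wide} pair of intrinsic IDs plus an emulation
    // callback that is used whenever the selected slot is not_intrinsic.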
    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;

    // clang-format off
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        //                                256 wide                             512 wide
        {
            // AVX
            {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256,     DOUBLE},                    NO_EMU}},
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic},  NO_EMU}},
            {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256,   DOUBLE},                    NO_EMU}},
            {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256,    DOUBLE},                    NO_EMU}},
        },
        {
            // AVX2
            {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256,     DOUBLE},                    NO_EMU}},
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx2_permps,        Intrinsic::not_intrinsic},  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx2_permd,         Intrinsic::not_intrinsic},  VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE},                    NO_EMU}},
            {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256,   DOUBLE},                    NO_EMU}},
            {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256,    DOUBLE},                    NO_EMU}},
        },
        {
            // AVX512
            {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VPERM_EMU}},
#endif
            {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VCONVERT_EMU}},
#endif
            {"meta.intrinsic.VROUND",     {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VROUND_EMU}},
            {"meta.intrinsic.VHSUBPS",    {{Intrinsic::not_intrinsic,          Intrinsic::not_intrinsic},  VHSUB_EMU}},
        }};
    // clang-format on

    static uint32_t getBitWidth(VectorType* pVTy)
    {
#if LLVM_VERSION_MAJOR >= 11
        return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
#else
        return pVTy->getBitWidth();
#endif
    }

    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }

            // Setup scatter function for 256 wide
            uint32_t curWidth = B->mVWidth;
            B->SetTargetWidth(8);
            std::vector<Type*> args = {
                B->mInt8PtrTy,   // pBase
                B->mSimdInt32Ty, // vIndices
                B->mSimdFP32Ty,  // vSrc
                B->mInt8Ty,      // mask
                B->mInt32Ty      // scale
            };

            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
            mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
            }

            B->SetTargetWidth(curWidth);
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            assert(pCallInst);
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
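            // (e.g. converting <8 x double> to <8 x float> produces a 256-bit result from a
            // 512-bit source, so the width decision below must be driven by the source operand)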
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                Value* pOp = pCallInst->getOperand(0);
                assert(pOp);
                pVecTy = pOp->getType();
            }

            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(getVectorType(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }

        // Convert <N x i1> mask to <N x i32> x86 mask
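        // (sign-extension turns each true lane into 0xFFFFFFFF and each false lane into 0, which
        // is the per-lane "sign bit set" convention the AVX/AVX2 gather instructions expect)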
        Value* VectorMask(Value* vi1Mask)
        {
#if LLVM_VERSION_MAJOR >= 11
            uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
#else
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
#endif
            return B->S_EXT(vi1Mask, getVectorType(B->mInt32Ty, numElem));
        }

        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName().str()];
            TargetWidth vecWidth;
            Type*       pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            IntrinsicID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                           "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function*              pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // On AVX512, all of these intrinsics take an additional src operand and mask.
                // We pass in a zero src and a full mask for now, assuming the intrinsics are
                // consistent and place the src operand and mask last in the argument list.
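                // (e.g. a 512-wide VRCPPS becomes x86_avx512_rcp14_ps_512(src, zero passthrough,
                // 0xFFFF mask), so every lane is computed and the passthrough value is unused)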
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }

            SWR_ASSERT(false);
            return nullptr;
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName().str()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName().str()) != intrinsicMap.end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName().str().c_str());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName().str()];
            Function*     pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
            std::vector<BasicBlock*>  bbs;

            // Make temp copy of the basic blocks and instructions, as the intrinsic
            // replacement code might invalidate the iterators
            for (auto& b : F.getBasicBlockList())
            {
                bbs.push_back(&b);
            }

            for (auto* BB : bbs)
            {
                std::vector<Instruction*> insts;
                for (auto& i : BB->getInstList())
                {
                    insts.push_back(&i);
                }

                for (auto* I : insts)
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                toRemove.push_back(pCallInst);
                                if (pReplace)
                                {
                                    pCallInst->replaceAllUsesWith(pReplace);
                                }
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }
        Builder*    B;
        TargetArch  mTarget;
        Function*   mPfnScatter256;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
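        // (AVX has no cross-lane variable permute; vpermps/vpermd only arrived with AVX2, so on
        // AVX we either fold constant indices into a shufflevector or fall back to a scalar
        // extract/insert loop)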
        SWR_ASSERT(arch == AVX);

        Builder* B         = pThis->B;
        auto     v32A      = pCallInst->getArgOperand(0);
        auto     vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
            v32Result = UndefValue::get(v32A->getType());
#if LLVM_VERSION_MAJOR >= 11
            uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
#else
            uint32_t numElem = v32A->getType()->getVectorNumElements();
#endif
            for (uint32_t l = 0; l < numElem; ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val      = B->VEXTRACT(v32A, i32Index);
                v32Result     = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }

    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     vSrc        = pCallInst->getArgOperand(0);
        auto     pBase       = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     vi1Mask     = pCallInst->getArgOperand(3);
        auto     i8Scale     = pCallInst->getArgOperand(4);

        pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
#if LLVM_VERSION_MAJOR >= 11
        VectorType* pVectorType = cast<VectorType>(vSrc->getType());
        uint32_t    numElem     = pVectorType->getNumElements();
        auto        srcTy       = pVectorType->getElementType();
#else
        uint32_t numElem = vSrc->getType()->getVectorNumElements();
        auto     srcTy   = vSrc->getType()->getVectorElementType();
#endif
        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);

        Value* v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
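            // (inactive lanes select the stack copy, so the load returns the original vSrc
            // element and no potentially invalid address is ever dereferenced; this matches the
            // merge semantics of a masked x86 gather)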
            auto pStack = B->STACKSAVE();
            auto pTmp   = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather = UndefValue::get(vSrc->getType());
#if LLVM_VERSION_MAJOR > 10
            auto vi32Scale = ConstantVector::getSplat(ElementCount::get(numElem, false), cast<ConstantInt>(i32Scale));
#else
            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
#endif
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress       = B->GEP(pBase, i32Offset);
                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val                = B->LOAD(pValidAddress);
                v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather    = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
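                // (each avx2 gather_d_q_256 call handles 4 64-bit elements, so the 8-wide source,
                // index, and mask vectors are split into low/high halves of 4 and the two partial
                // gathers are shuffled back together afterwards)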
#if LLVM_VERSION_MAJOR >= 11
                if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
#else
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
#endif
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
#if LLVM_VERSION_MAJOR >= 11
                    uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements();
#else
                    uint32_t numElem = v64Mask->getType()->getVectorNumElements();
#endif
                    v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numElem));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

#if LLVM_VERSION_MAJOR >= 11
                    uint32_t numElemSrc0  = cast<VectorType>(src0->getType())->getNumElements();
                    uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
                    uint32_t numElemSrc1  = cast<VectorType>(src1->getType())->getNumElements();
                    uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
#else
                    uint32_t numElemSrc0  = src0->getType()->getVectorNumElements();
                    uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
                    uint32_t numElemSrc1  = src1->getType()->getVectorNumElements();
                    uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
#endif
                    src0  = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0));
                    mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0));
                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1  = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1));
                    mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1));
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0  = B->EXTRACT_16(vSrc, 0);
                    Value* src1  = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value*    iMask          = nullptr;
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }

    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     pBase       = pCallInst->getArgOperand(0);
        auto     vi1Mask     = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     v32Src      = pCallInst->getArgOperand(3);
        auto     i32Scale    = pCallInst->getArgOperand(4);

        if (arch != AVX512)
        {
            // Call into C function to do the scatter. This has significantly better compile perf
            // compared to jitting scatter loops for every scatter
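            // (the <N x i1> mask is bitcast to a packed integer, lane i in bit i, before being
            // handed to ScatterPS_256)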
            if (width == W256)
            {
                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
            }
            else
            {
                // Need to break up 512 wide scatter to two 256 wide
                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto indicesLo =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto indicesHi =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

                mask = B->BITCAST(maskHi, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
            }
            return nullptr;
        }

        Value*    iMask;
        Function* pX86IntrinFunc;
        if (width == W256)
        {
            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
            // can use the scatter of 8 elements with 64bit indices
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_qps_512);

            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
            iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
        }
        else if (width == W512)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_dps_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
        }
        return nullptr;
    }

    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        assert(vf32Src);
        auto i8Round = pCallInst->getOperand(1);
        assert(i8Round);
        auto pfnFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                          Intrinsic::x86_avx_round_ps_256);
            return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
        }
        else if (width == W512)
        {
            // 512 can use intrinsic
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    // No support for hsub in AVX512
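    // (for reference, 256-bit vhsubps computes pairwise differences within 128-bit halves:
    //  result = [a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7])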
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B    = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
            return nullptr;
        }
    }

    // Double pump the passed-in intrinsic. This blindly extracts the lower and upper 256 bits
    // from each vector argument, calls the 256-wide intrinsic on each half, then merges the
    // results back into a 512-wide value.
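    // (e.g. a <16 x float> argument is split into lanes [0..7] and [8..15] via the CInc shuffle
    // masks below; the two 8-wide results are then shuffled back together into a single 16-wide
    // result)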
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);
        Value*    result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
#if LLVM_VERSION_MAJOR >= 11
                    uint32_t vecWidth = cast<VectorType>(argType)->getNumElements();
                    auto     elemTy   = cast<VectorType>(argType)->getElementType();
#else
                    uint32_t vecWidth = argType->getVectorNumElements();
                    auto     elemTy   = argType->getVectorElementType();
#endif
                    Value* lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value* argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }
        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
#if LLVM_VERSION_MAJOR >= 11
            vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
                       cast<VectorType>(result[1]->getType())->getNumElements();
#else
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
#endif
        }
        else
        {
            vecWidth = 2;
        }
        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }

} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)