- https://reviews.llvm.org/rL347208
- LLVM version:
  - clang 9.0.0 (fdd6b2ea 2019 Feb 10)
  - llvm 9.0.0 (ea4b5cb48 2019 Feb 10)
- What is interleaved access?
- E.g.
```cpp
struct Colour { float r, g, b, a; };

Colour C[4] = ..;
float Rs[4] = {C[0].r, C[1].r, C[2].r, C[3].r};
float Gs[4] = {C[0].g, C[1].g, C[2].g, C[3].g};
float Bs[4] = {C[0].b, C[1].b, C[2].b, C[3].b};
float As[4] = {C[0].a, C[1].a, C[2].a, C[3].a};
```
- The r, g, b, a fields of the Colour elements are interleaved in memory, so gathering them into Rs, Gs, Bs, and As is an interleaved (de-interleaving) load, as the loop sketch below makes explicit.
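Written out as a scalar loop (a minimal sketch reusing C, Rs, Gs, Bs, As from the example above):

```cpp
// AoS -> SoA: gather one field of every Colour per output array.
// Without interleaved-access support this becomes 16 strided scalar
// loads; with it, a single 4-way structured load covers all 4 elements.
for (int i = 0; i < 4; ++i) {
  Rs[i] = C[i].r;
  Gs[i] = C[i].g;
  Bs[i] = C[i].b;
  As[i] = C[i].a;
}
```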
- ARMv8's interleaved access instructions: LD2/LD3/LD4 de-interleave on load and ST2/ST3/ST4 re-interleave on store (see the intrinsics sketch below).
(Figure from: https://www.slideshare.net/linaroorg/lce13-gwggfxonarmv8 p.9)
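For a feel of what LD4 does, a minimal sketch using the vld4q_f32 intrinsic from arm_neon.h (the deinterleave wrapper is just for illustration, not from the original example):

```cpp
#include <arm_neon.h>

struct Colour { float r, g, b, a; };

// LD4 loads 16 consecutive floats and de-interleaves them on the fly:
// v.val[0] = {C[0].r, C[1].r, C[2].r, C[3].r}, v.val[1] = the g's, etc.
void deinterleave(const Colour C[4],
                  float32x4_t &Rs, float32x4_t &Gs,
                  float32x4_t &Bs, float32x4_t &As) {
  float32x4x4_t v = vld4q_f32(reinterpret_cast<const float *>(C));
  Rs = v.val[0];
  Gs = v.val[1];
  Bs = v.val[2];
  As = v.val[3];
}
```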
- LLVM tries to find interleaved access patterns and transforms them to use the target's interleaved access intrinsics.
- Steps for combining interleaved loads:
  - InterleavedLoadCombine: "The pass searches for ShuffleVector instructions that represent interleaved loads. Matches are converted such that they will be captured by the InterleavedAccessPass."
  - InterleavedAccessPass: "Identify interleaved memory accesses and transform into target specific intrinsics."
- Example: aarch64-interleaved-ld-combine.ll
```llvm
%gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
%ld1 = load <4 x float>, <4 x float>* %gep1, align 16
%ld2 = load <4 x float>, <4 x float>* %gep2, align 16
%ld3 = load <4 x float>, <4 x float>* %gep3, align 16
%ld4 = load <4 x float>, <4 x float>* %gep4, align 16
%sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
store <4 x float> %m0_3, <4 x float>* %gep1, align 16
store <4 x float> %m4_7, <4 x float>* %gep2, align 16
store <4 x float> %m8_11, <4 x float>* %gep3, align 16
store <4 x float> %m12_15, <4 x float>* %gep4, align 16
```
opt -S -interleaved-load-combine ./aarch64-interleaved-ld-combine.ll > combined-ld.ll
```llvm
define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
  %gep1 = ..
  ..
  %interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*
  %interleaved.wide.load = load <16 x float>, <16 x float>* %interleaved.wide.ptrcast, align 16
  %ld1 = ..
  ..
  %sv1 = ..
  ..
  %interleaved.shuffle = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %m0_3 = ..
  %interleaved.shuffle1 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %m4_7 = ..
  %interleaved.shuffle2 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %m8_11 = ..
  %interleaved.shuffle3 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %m12_15 = ..
  store <4 x float> %interleaved.shuffle, <4 x float>* %gep1, align 16
  store <4 x float> %interleaved.shuffle1, <4 x float>* %gep2, align 16
  store <4 x float> %interleaved.shuffle2, <4 x float>* %gep3, align 16
  store <4 x float> %interleaved.shuffle3, <4 x float>* %gep4, align 16
  ret void
}
```
opt -S -interleaved-access ./combined-ld.ll > combined-access.ll
```llvm
define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
  %gep1 = ..
  ..
  %interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*
  %0 = bitcast <16 x float>* %interleaved.wide.ptrcast to <4 x float>*
  %ldN = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* %0)
  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 3
  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 2
  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 1
  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 0
  %ld1 = ..
  ..
  %sv1 = ..
  ..
  %m0_3 = ..
  ..
  store <4 x float> %4, <4 x float>* %gep1, align 16
  store <4 x float> %3, <4 x float>* %gep2, align 16
  store <4 x float> %2, <4 x float>* %gep3, align 16
  store <4 x float> %1, <4 x float>* %gep4, align 16
  ret void
}
```
llc ./combined-access.ll -o final.s
```asm
add  x8, x0, #32                           // x8 = %gep1 (= %ptr + 32 bytes)
ld4  { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]  // one 4-way de-interleaving load
stp  q0, q1, [x0, #32]                     // stores to %gep1 and %gep2
stp  q2, q3, [x0, #64]                     // stores to %gep3 and %gep4
ret
```
Code
```cpp
bool InterleavedLoadCombineImpl::run() {
  ..
  // Start with the highest factor to avoid combining and recombining.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          Candidates.emplace_back(SVI->getType());

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL))
            Candidates.pop_back();
        }
      }
    }
```
- The pass first scans all ShuffleVectorInst instructions and computes a VectorInfo for each. The most important piece of a VectorInfo is the memory address polynomial, which records where each lane of the shuffle result was loaded from.
E.g.
```llvm
%gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%ld1 = load <4 x float>, <4 x float>* %gep1, align 16
%ld2 = load <4 x float>, <4 x float>* %gep2, align 16
%sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
```
Memory Address Polynomial for sv1 and sv2
```
Cand.sv1 = <4 x float>* %ptr + [[{#ErrBits:0} + 32],
                                [{#ErrBits:0} + 36],
                                [{#ErrBits:0} + 48],
                                [{#ErrBits:0} + 52]]
Cand.sv2 = <4 x float>* %ptr + [[{#ErrBits:0} + 40],
                                [{#ErrBits:0} + 44],
                                [{#ErrBits:0} + 56],
                                [{#ErrBits:0} + 60]]
```
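Conceptually, each lane of a candidate's result vector gets an address of the form base + byte offset, together with a count of unreliable bits (#ErrBits; 0 means the offset is exact). A much-simplified model of that information (not the actual VectorInfo class in InterleavedLoadCombinePass.cpp):

```cpp
#include <cstdint>
#include <vector>

// Simplified model of a VectorInfo's address information: lane i of the
// shuffle result was loaded from BasePtr + Lanes[i].Offset.
struct LaneAddress {
  int64_t Offset;     // byte offset from the common base pointer
  unsigned ErrorBits; // #ErrBits in the dump above (0 = offset is exact)
};

struct AddressPolynomial {
  const void *BasePtr;            // e.g. %ptr in the example
  std::vector<LaneAddress> Lanes; // Cand.sv1: {32, 36, 48, 52}, all exact
};
```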
If the interleave Factor is 4, consecutive lanes of a channel must be 16 bytes apart (sizeof(float) * Factor = 16). sv1's lane offsets (32, 36, 48, 52) step by 4, 12, 4, so sv1 and sv2 are not candidates: isInterleaved returns false for them.
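A simplified model of this stride test (the real VectorInfo::isInterleaved works on the address polynomials above, so it also handles non-constant bases):

```cpp
#include <cstdint>
#include <vector>

// Simplified model: a candidate is interleaved with a given Factor if
// consecutive lane offsets step by exactly Factor * ElementSize bytes.
bool isInterleaved(const std::vector<int64_t> &LaneOffsets, unsigned Factor,
                   unsigned ElementSize) {
  for (size_t i = 1; i < LaneOffsets.size(); ++i)
    if (LaneOffsets[i] - LaneOffsets[i - 1] != int64_t(Factor) * ElementSize)
      return false;
  return true;
}

// sv1: {32, 36, 48, 52} -> strides 4, 12, 4 -> false for Factor 4.
// m0_3 (below): {32, 48, 64, 80} -> stride 16 -> true.
```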
```llvm
%gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
%ld3 = load <4 x float>, <4 x float>* %gep3, align 16
%ld4 = load <4 x float>, <4 x float>* %gep4, align 16
%sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
```
In this case, %m0_3 and %m4_7 are candidates for combining: their lane offsets step by exactly 16 bytes.
```
Cand.m0_3 = <4 x float>* %ptr + [[{#ErrBits:0} + 32],
                                 [{#ErrBits:0} + 48],
                                 [{#ErrBits:0} + 64],
                                 [{#ErrBits:0} + 80]]
Cand.m4_7 = <4 x float>* %ptr + [[{#ErrBits:0} + 36],
                                 [{#ErrBits:0} + 52],
                                 [{#ErrBits:0} + 68],
                                 [{#ErrBits:0} + 84]]
```
Candidates are then sent to findPattern, which searches them for possible interleaved loads.
```cpp
    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Remove the first element of the Interleaved Load but put the others
        // back on the list and continue searching
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }
```
In our example, %m0_3, %m4_7, %m8_11, and %m12_15 are identified by findPattern and then sent to combine.
```
#1 Load interleaved combined with factor 4
  %m0_3:   <4 x float>* %ptr + [[{#ErrBits:0} + 32], [{#ErrBits:0} + 48], [{#ErrBits:0} + 64], [{#ErrBits:0} + 80]]
  %m4_7:   <4 x float>* %ptr + [[{#ErrBits:0} + 36], [{#ErrBits:0} + 52], [{#ErrBits:0} + 68], [{#ErrBits:0} + 84]]
  %m8_11:  <4 x float>* %ptr + [[{#ErrBits:0} + 40], [{#ErrBits:0} + 56], [{#ErrBits:0} + 72], [{#ErrBits:0} + 88]]
  %m12_15: <4 x float>* %ptr + [[{#ErrBits:0} + 44], [{#ErrBits:0} + 60], [{#ErrBits:0} + 76], [{#ErrBits:0} + 92]]
```
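Note how the four candidates interlock: their first-lane offsets (32, 36, 40, 44) are exactly sizeof(float) apart, and each candidate steps by 16 bytes internally, so together they tile the 64 contiguous bytes starting at offset 32. A simplified model of this grouping test (again, not the actual findPattern implementation):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified model: Factor candidates form one interleaved load if their
// first-lane offsets are consecutive multiples of the element size.
// Example above: {32, 36, 40, 44} with ElementSize 4 -> true.
bool formsInterleavedGroup(std::vector<int64_t> FirstLaneOffsets,
                           unsigned ElementSize) {
  std::sort(FirstLaneOffsets.begin(), FirstLaneOffsets.end());
  for (size_t i = 1; i < FirstLaneOffsets.size(); ++i)
    if (FirstLaneOffsets[i] - FirstLaneOffsets[i - 1] != ElementSize)
      return false;
  return true;
}
```

combine then rewrites such a group into the single <16 x float> wide load plus the %interleaved.shuffle values shown in combined-ld.ll above.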
(To be continued ..)