%gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
%ld1 = load <4 x float>, <4 x float>* %gep1, align 16
%ld2 = load <4 x float>, <4 x float>* %gep2, align 16
%ld3 = load <4 x float>, <4 x float>* %gep3, align 16
%ld4 = load <4 x float>, <4 x float>* %gep4, align 16
%sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
store <4 x float> %m0_3, <4 x float>* %gep1, align 16
store <4 x float> %m4_7, <4 x float>* %gep2, align 16
store <4 x float> %m8_11, <4 x float>* %gep3, align 16
store <4 x float> %m12_15, <4 x float>* %gep4, align 16
opt -S -interleaved-load-combine ./aarch64-interleaved-ld-combine.ll > combined-ld.ll
define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
%gep1 = ..
..
%interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*
%interleaved.wide.load = load <16 x float>, <16 x float>* %interleaved.wide.ptrcast, align 16
%ld1 = ..
..
%sv1 = ..
..
%interleaved.shuffle = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%m0_3 = ..
%interleaved.shuffle1 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
%m4_7 = ..
%interleaved.shuffle2 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
%m8_11 = ..
%interleaved.shuffle3 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
%m12_15 = ..
store <4 x float> %interleaved.shuffle, <4 x float>* %gep1, align 16
store <4 x float> %interleaved.shuffle1, <4 x float>* %gep2, align 16
store <4 x float> %interleaved.shuffle2, <4 x float>* %gep3, align 16
store <4 x float> %interleaved.shuffle3, <4 x float>* %gep4, align 16
ret void
}
opt -S -interleaved-access ./combined-ld.ll > combined-access.ll
define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
%gep1 = ..
..
%interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*
%0 = bitcast <16 x float>* %interleaved.wide.ptrcast to <4 x float>*
%ldN = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* %0)
%1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 3
%2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 2
%3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 1
%4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 0
%ld1 = ..
..
%sv1 = ..
..
%m0_3 = ..
..
store <4 x float> %4, <4 x float>* %gep1, align 16
store <4 x float> %3, <4 x float>* %gep2, align 16
store <4 x float> %2, <4 x float>* %gep3, align 16
store <4 x float> %1, <4 x float>* %gep4, align 16
ret void
}
llc ./combined-access.ll -o final.s
add x8, x0, #32
ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]
stp q0, q1, [x0, #32]
stp q2, q3, [x0, #64]
ret
Code
bool InterleavedLoadCombineImpl::run() {
..
// Start with the highest factor to avoid combining and recombining.
for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
std::list<VectorInfo> Candidates;
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
Candidates.emplace_back(SVI->getType());
if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
Candidates.pop_back();
continue;
}
if (!Candidates.back().isInterleaved(Factor, DL))
Candidates.pop_back();
}
}
}
- The pass first scans all ShuffleVectorInst, and calculates its VectorInfo. In VectorInfo, the most important info is memory address polynomial.
E.g.
%gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%ld1 = load <4 x float>, <4 x float>* %gep1, align 16
%ld2 = load <4 x float>, <4 x float>* %gep2, align 16
%sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
Memory Address Polynomial for sv1 and sv2
Cand.sv1 = <4 x float>* %ptr + [
[{#ErrBits:0} + 32],
[{#ErrBits:0} + 36],
[{#ErrBits:0} + 48],
[{#ErrBits:0} + 52]]
Cand.sv2 = <4 x float>* %ptr + [
[{#ErrBits:0} + 40],
[{#ErrBits:0} + 44],
[{#ErrBits:0} + 56],
[{#ErrBits:0} + 60]]
If interleaved Factor is 4, then sv1 and sv2 are not candidates, because each channel has to be 16 bytes offset (sizeof(float) * Factor = 16), so they will get false in 'isInterleaved'.
%gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
%ld3 = load <4 x float>, <4 x float>* %gep3, align 16
%ld4 = load <4 x float>, <4 x float>* %gep4, align 16
%sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
In this case, m0_3 and m4_7 are candidates for combining.
Cand.m0_3 = <4 x float>* %ptr + [
[{#ErrBits:0} + 32],
[{#ErrBits:0} + 48],
[{#ErrBits:0} + 64],
[{#ErrBits:0} + 80]]
Cand.m4_7 = <4 x float>* %ptr + [
[{#ErrBits:0} + 36],
[{#ErrBits:0} + 52],
[{#ErrBits:0} + 68],
[{#ErrBits:0} + 84]]
Candidates are sent to findPattern for searching possible interleaved loads.
std::list<VectorInfo> InterleavedLoad;
while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
if (combine(InterleavedLoad, ORE)) {
changed = true;
} else {
// Remove the first element of the Interleaved Load but put the others
// back on the list and continue searching
Candidates.splice(Candidates.begin(), InterleavedLoad,
std::next(InterleavedLoad.begin()),
InterleavedLoad.end());
}
InterleavedLoad.clear();
}
}
In our example, %m0_3, %m4_7, %m8_11, and %m12_15 are identified in findPattern, and then sent to combine.
#1 Load interleaved combined with factor 4
%m0_3: <4 x float>* %ptr + [
[{#ErrBits:0} + 32],
[{#ErrBits:0} + 48],
[{#ErrBits:0} + 64],
[{#ErrBits:0} + 80]]
%m4_7: <4 x float>* %ptr + [
[{#ErrBits:0} + 36],
[{#ErrBits:0} + 52],
[{#ErrBits:0} + 68],
[{#ErrBits:0} + 84]]
%m8_11: <4 x float>* %ptr + [
[{#ErrBits:0} + 40],
[{#ErrBits:0} + 56],
[{#ErrBits:0} + 72],
[{#ErrBits:0} + 88]]
%m12_15: <4 x float>* %ptr + [
[{#ErrBits:0} + 44],
[{#ErrBits:0} + 60],
[{#ErrBits:0} + 76],
[{#ErrBits:0} + 92]]
(To be continued ..)