Optimize for fun: [llvm] Interleaved Load Combine Pass (5e067bb37)

https://reviews.llvm.org/rL347208
llvm version

clang 9.0.0 (fdd6b2ea 2019 Feb 10)
llvm 9.0.0 (ea4b5cb48 2019 Feb 10)

What is interleaved access?

E.g.

struct Colour { float r, g, b, a; };
Colour C[4] = ..;
float Rs[4] = {C[0].r, C[1].r, C[2].r, C[3].r};
float Gs[4] = {C[0].g, C[1].g, C[2].g, C[3].g};
float Bs[4] = {C[0].b, C[1].b, C[2].b, C[3].b};
float As[4] = {C[0].a, C[1].a, C[2].a, C[3].a};

Each Colour is interleaved loaded into Rs, Gs, Bs, As.
ARMv8's interleaved access instructions

(Figure from: https://www.slideshare.net/linaroorg/lce13-gwggfxonarmv8 p.9)

llvm tries to find interleaved access pattern, and transforms the pattern to use target interleaved access intrinsics
Steps for combining interleaved load:

InterleavedLoadCombine:
"The pass searches for ShuffleVector instructions that represent interleaved loads. Matches are converted such that they will be captured by the InterleavedAccessPass."
InterleavedAccessPass:
"Identify interleaved memory accesses and transform into target specific intrinsics."

Example aarch64-interleaved-ld-combine.ll

  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  2
  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  3
  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  4
  %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  5
  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
  %ld4 = load <4 x float>, <4 x float>* %gep4, align 16
  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
  store <4 x float> %m8_11, <4 x float>* %gep3, align 16
  store <4 x float> %m12_15, <4 x float>* %gep4, align 16

opt -S -interleaved-load-combine ./aarch64-interleaved-ld-combine.ll > combined-ld.ll

define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
 %gep1 = ..
 ..
 
 %interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*
 %interleaved.wide.load = load <16 x float>, <16 x float>* %interleaved.wide.ptrcast, align 16

 %ld1 = ..
 ..
 %sv1 = ..
 ..

 %interleaved.shuffle = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 %m0_3 = ..
 %interleaved.shuffle1 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 %m4_7 = ..
 %interleaved.shuffle2 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 %m8_11 = ..
 %interleaved.shuffle3 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 %m12_15 = ..
 store <4 x float> %interleaved.shuffle, <4 x float>* %gep1, align 16
 store <4 x float> %interleaved.shuffle1, <4 x float>* %gep2, align 16
 store <4 x float> %interleaved.shuffle2, <4 x float>* %gep3, align 16
 store <4 x float> %interleaved.shuffle3, <4 x float>* %gep4, align 16
 ret void
}

opt -S -interleaved-access ./combined-ld.ll > combined-access.ll

define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
  %gep1 = ..
  ..
  %interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*

  %0 = bitcast <16 x float>* %interleaved.wide.ptrcast to <4 x float>*
  %ldN = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* %0)
  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 3
  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 2
  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 1
  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 0

  %ld1 = ..
  ..
  %sv1 = ..
  ..
  %m0_3 = ..
  ..
  store <4 x float> %4, <4 x float>* %gep1, align 16
  store <4 x float> %3, <4 x float>* %gep2, align 16
  store <4 x float> %2, <4 x float>* %gep3, align 16
  store <4 x float> %1, <4 x float>* %gep4, align 16
  ret void
}

llc ./combined-access.ll -o final.s

add x8, x0, #32
ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]
stp q0, q1, [x0, #32]
stp q2, q3, [x0, #64]
ret

Code

bool InterleavedLoadCombineImpl::run() {
  ..
  // Start with the highest factor to avoid combining and recombining.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          Candidates.emplace_back(SVI->getType());

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL))
            Candidates.pop_back();
        }
      }
    }

The pass first scans all ShuffleVectorInst, and calculates its VectorInfo. In VectorInfo, the most important info is memory address polynomial.

E.g.

  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  2
  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  3

  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16

  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>

Memory Address Polynomial for sv1 and sv2

Cand.sv1 = <4 x float>* %ptr + [
    [{#ErrBits:0} + 32],
    [{#ErrBits:0} + 36],
    [{#ErrBits:0} + 48],
    [{#ErrBits:0} + 52]]

Cand.sv2 = <4 x float>* %ptr + [
    [{#ErrBits:0} + 40],
    [{#ErrBits:0} + 44],
    [{#ErrBits:0} + 56],
    [{#ErrBits:0} + 60]]

If interleaved Factor is 4, then sv1 and sv2 are not candidates, because each channel has to be 16 bytes offset (sizeof(float) * Factor = 16), so they will get false in 'isInterleaved'.

  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  4
  %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  5

  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
  %ld4 = load <4 x float>, <4 x float>* %gep4, align 16

  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>

  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

In this case, m0_3 and m4_7 are candidates for combining.

Cand.m0_3 = <4 x float>* %ptr + [
    [{#ErrBits:0} + 32],
    [{#ErrBits:0} + 48],
    [{#ErrBits:0} + 64],
    [{#ErrBits:0} + 80]]

Cand.m4_7 = <4 x float>* %ptr + [
    [{#ErrBits:0} + 36],
    [{#ErrBits:0} + 52],
    [{#ErrBits:0} + 68],
    [{#ErrBits:0} + 84]]

Candidates are sent to findPattern for searching possible interleaved loads.

    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Remove the first element of the Interleaved Load but put the others
        // back on the list and continue searching
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }

In our example, %m0_3, %m4_7, %m8_11, and %m12_15 are identified in findPattern, and then sent to combine.

#1 Load interleaved combined with factor 4
%m0_3: <4 x float>* %ptr + [
    [{#ErrBits:0} + 32],
    [{#ErrBits:0} + 48],
    [{#ErrBits:0} + 64],
    [{#ErrBits:0} + 80]]
%m4_7: <4 x float>* %ptr + [
    [{#ErrBits:0} + 36],
    [{#ErrBits:0} + 52],
    [{#ErrBits:0} + 68],
    [{#ErrBits:0} + 84]]
%m8_11: <4 x float>* %ptr + [
    [{#ErrBits:0} + 40],
    [{#ErrBits:0} + 56],
    [{#ErrBits:0} + 72],
    [{#ErrBits:0} + 88]]
%m12_15: <4 x float>* %ptr + [
    [{#ErrBits:0} + 44],
    [{#ErrBits:0} + 60],
    [{#ErrBits:0} + 76],
    [{#ErrBits:0} + 92]]

(To be continued ..)

Optimize for fun

Sunday, March 24, 2019

[llvm] Interleaved Load Combine Pass (5e067bb37) - Part I

No comments:

Post a Comment