Sunday, March 24, 2019

[llvm] Interleaved Load Combine Pass (5e067bb37) - Part I

  • What is interleaved access?
    • E.g.
      struct Colour { float r, g, b, a; };
      Colour C[4] = ..;
      float Rs[4] = {C[0].r, C[1].r, C[2].r, C[3].r};
      float Gs[4] = {C[0].g, C[1].g, C[2].g, C[3].g};
      float Bs[4] = {C[0].b, C[1].b, C[2].b, C[3].b};
      float As[4] = {C[0].a, C[1].a, C[2].a, C[3].a};
    • Each Colour is interleaved loaded into Rs, Gs, Bs, As.
    • ARMv8's interleaved access instructions
  • llvm tries to find interleaved access pattern, and transforms the pattern to use target interleaved access intrinsics
  • Steps for combining interleaved load:
    • InterleavedLoadCombine:
      "The pass searches for ShuffleVector instructions that represent interleaved loads. Matches are converted such that they will be captured by the InterleavedAccessPass."
    • InterleavedAccessPass:
      "Identify interleaved memory accesses and transform into target specific intrinsics."
  • Example aarch64-interleaved-ld-combine.ll
  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  2
  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  3
  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  4
  %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  5
  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
  %ld4 = load <4 x float>, <4 x float>* %gep4, align 16
  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
  store <4 x float> %m8_11, <4 x float>* %gep3, align 16
  store <4 x float> %m12_15, <4 x float>* %gep4, align 16

opt -S -interleaved-load-combine ./aarch64-interleaved-ld-combine.ll > combined-ld.ll
define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
 %gep1 = ..
 ..
 
 %interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*
 %interleaved.wide.load = load <16 x float>, <16 x float>* %interleaved.wide.ptrcast, align 16

 %ld1 = ..
 ..
 %sv1 = ..
 ..

 %interleaved.shuffle = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 %m0_3 = ..
 %interleaved.shuffle1 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 %m4_7 = ..
 %interleaved.shuffle2 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 %m8_11 = ..
 %interleaved.shuffle3 = shufflevector <16 x float> %interleaved.wide.load, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 %m12_15 = ..
 store <4 x float> %interleaved.shuffle, <4 x float>* %gep1, align 16
 store <4 x float> %interleaved.shuffle1, <4 x float>* %gep2, align 16
 store <4 x float> %interleaved.shuffle2, <4 x float>* %gep3, align 16
 store <4 x float> %interleaved.shuffle3, <4 x float>* %gep4, align 16
 ret void
}

opt -S -interleaved-access ./combined-ld.ll > combined-access.ll
define void @aarch64_ilc_const(<4 x float>* %ptr) {
entry:
  %gep1 = ..
  ..
  %interleaved.wide.ptrcast = bitcast <4 x float>* %gep1 to <16 x float>*

  %0 = bitcast <16 x float>* %interleaved.wide.ptrcast to <4 x float>*
  %ldN = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* %0)
  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 3
  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 2
  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 1
  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ldN, 0

  %ld1 = ..
  ..
  %sv1 = ..
  ..
  %m0_3 = ..
  ..
  store <4 x float> %4, <4 x float>* %gep1, align 16
  store <4 x float> %3, <4 x float>* %gep2, align 16
  store <4 x float> %2, <4 x float>* %gep3, align 16
  store <4 x float> %1, <4 x float>* %gep4, align 16
  ret void
}

llc ./combined-access.ll -o final.s
add x8, x0, #32
ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]
stp q0, q1, [x0, #32]
stp q2, q3, [x0, #64]
ret



Code
bool InterleavedLoadCombineImpl::run() {
  ..
  // Start with the highest factor to avoid combining and recombining.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          Candidates.emplace_back(SVI->getType());

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL))
            Candidates.pop_back();
        }
      }
    }
  • The pass first scans all ShuffleVectorInst, and calculates its VectorInfo. In VectorInfo, the most important info is memory address polynomial.
E.g.
  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  2
  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  3

  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16

  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
Memory Address Polynomial for sv1 and sv2
  • Cand.sv1 = <4 x float>* %ptr + [
        [{#ErrBits:0} + 32],
        [{#ErrBits:0} + 36],
        [{#ErrBits:0} + 48],
        [{#ErrBits:0} + 52]]
    
    Cand.sv2 = <4 x float>* %ptr + [
        [{#ErrBits:0} + 40],
        [{#ErrBits:0} + 44],
        [{#ErrBits:0} + 56],
        [{#ErrBits:0} + 60]]
If interleaved Factor is 4, then sv1 and sv2 are not candidates, because each channel has to be 16 bytes offset (sizeof(float) * Factor = 16), so they will get false in 'isInterleaved'.
  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  4
  %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  5

  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
  %ld4 = load <4 x float>, <4 x float>* %gep4, align 16

  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>

  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
In this case, m0_3 and m4_7 are candidates for combining.
Cand.m0_3 = <4 x float>* %ptr + [
    [{#ErrBits:0} + 32],
    [{#ErrBits:0} + 48],
    [{#ErrBits:0} + 64],
    [{#ErrBits:0} + 80]]

Cand.m4_7 = <4 x float>* %ptr + [
    [{#ErrBits:0} + 36],
    [{#ErrBits:0} + 52],
    [{#ErrBits:0} + 68],
    [{#ErrBits:0} + 84]]
Candidates are sent to findPattern for searching possible interleaved loads.
    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Remove the first element of the Interleaved Load but put the others
        // back on the list and continue searching
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }
In our example, %m0_3, %m4_7, %m8_11, and %m12_15 are identified in findPattern, and then sent to combine. 
#1 Load interleaved combined with factor 4
%m0_3: <4 x float>* %ptr + [
    [{#ErrBits:0} + 32],
    [{#ErrBits:0} + 48],
    [{#ErrBits:0} + 64],
    [{#ErrBits:0} + 80]]
%m4_7: <4 x float>* %ptr + [
    [{#ErrBits:0} + 36],
    [{#ErrBits:0} + 52],
    [{#ErrBits:0} + 68],
    [{#ErrBits:0} + 84]]
%m8_11: <4 x float>* %ptr + [
    [{#ErrBits:0} + 40],
    [{#ErrBits:0} + 56],
    [{#ErrBits:0} + 72],
    [{#ErrBits:0} + 88]]
%m12_15: <4 x float>* %ptr + [
    [{#ErrBits:0} + 44],
    [{#ErrBits:0} + 60],
    [{#ErrBits:0} + 76],
    [{#ErrBits:0} + 92]]

(To be continued ..)

Saturday, March 9, 2019

leetcode contest 124

#1369

995. Minimum Number of K Consecutive Bit Flips

In an array A containing only 0s and 1s, a K-bit flip consists of choosing a (contiguous) subarray of length K and simultaneously changing every 0 in the subarray to 1, and every 1 in the subarray to 0.
Return the minimum number of K-bit flips required so that there is no 0 in the array.  If it is not possible, return -1.

Example 1:
Input: A = [0,1,0], K = 1
Output: 2
Explanation: Flip A[0], then flip A[2].
Example 2:
Input: A = [1,1,0], K = 2
Output: -1
Explanation: No matter how we flip subarrays of size 2, we can't make the array become [1,1,1].
Example 3:
Input: A = [0,0,0,1,0,1,1,0], K = 3
Output: 3
Explanation:
Flip A[0],A[1],A[2]: A becomes [1,1,1,1,0,1,1,0]
Flip A[4],A[5],A[6]: A becomes [1,1,1,1,1,0,0,0]
Flip A[5],A[6],A[7]: A becomes [1,1,1,1,1,1,1,1]

Note:
  1. 1 <= A.length <= 30000
  2. 1 <= K <= A.length

By lee215.
  // ref. lee215's solution
  int minKBitFlips(vector<int>& A, int K) {
    const int n = A.size();
    vector<bool> IsFlipped(n, false);
    bool Flipped = false;
    int Ans = 0;
    //    00010110 K = 3
    // => 11110110 Flipped = true
    // => 11111000
    // => 11111111

    //    01010110 K = 3
    // => 10110110 Flipped = true
    // => 11000110 Flipped = false
    // => 11111110 Flipped = true
    for (int i = 0; i < n; ++i) {
      if (i >= K)
        Flipped ^= IsFlipped[i - K];

      if ((bool)A[i] == Flipped) {
        if (i + K > n)
          return -1;

        IsFlipped[i] = true;
        Flipped = !Flipped;
        ++Ans;
      }
    }
    return Ans;
  }