Wednesday, February 27, 2019

[llvm assembler] fix relocation symbol flow

$ cat ppc-br-test.s
  bge 0, .LBB0_4
.LBB0_4:
  bge 0, .LBB0_5
  blr
$ llvm-mc -triple powerpc64le-unknown-unknown --show-encoding ppc-br-test.s
  .text
  bge 0, .LBB0_4 # encoding: [0bAAAAAA00,A,0x80,0x40]
                 #   fixup A - offset: 0, value: .LBB0_4, kind: fixup_ppc_brcond14
.LBB0_4:
  bge 0, .LBB0_5 # encoding: [0bAAAAAA00,A,0x80,0x40]
                 #   fixup A - offset: 0, value: .LBB0_5, kind: fixup_ppc_brcond14

  blr            # encoding: [0x20,0x00,0x80,0x4e]
Code flow, using PowerPC as our target example:
// Top-level assembler driver: creates the initial text section, parses
// statements until EOF, then finalizes the output stream (Out.Finish()),
// which is where object-file layout and fixup resolution happen.
// Returns true if any error occurred.
bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
  // Create the initial section, if requested.
  if (!NoInitialTextSection)
    Out.InitSections(false);

  ...
  // While we have input, parse each statement.
  while (Lexer.isNot(AsmToken::Eof)) {
    ParseStatementInfo Info(&AsmStrRewrites);
    if (!parseStatement(Info, nullptr))
      continue;
    ...
  // Finalize the output stream if there are no errors and if the client wants
  // us to.
  if (!HadError && !NoFinalize)
    Out.Finish();

  return HadError || getContext().hadError();
}
// Finalization entry point shared by all streamers: gives the target
// streamer (if any) a chance to flush its state, then dispatches to the
// streamer-specific FinishImpl() — the point where the Asm and ELF
// streamer paths diverge (see below).
void MCStreamer::Finish() {
  ..
  MCTargetStreamer *TS = getTargetStreamer();
  if (TS)
    TS->finish();

  FinishImpl();
}
// Parses one statement; when parsing succeeds, the parsed operands are
// handed to the target-specific parser, which matches them against the
// instruction tables and emits the instruction through the streamer Out.
bool AsmParser::parseStatement(ParseStatementInfo &Info,
                               MCAsmParserSemaCallback *SI) {
  ...
  // If parsing succeeded, match the instruction.
  if (!ParseHadError) {
    uint64_t ErrorInfo;
    if (getTargetParser().MatchAndEmitInstruction(
            IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
            getTargetParser().isParsingInlineAsm()))
// PowerPC target hook: matches the parsed operands against the generated
// instruction tables, post-processes extended mnemonics, records the source
// location on the MCInst, and emits it via the streamer.
// Returns false on success (MatchAndEmitInstruction convention).
bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                           OperandVector &Operands,
                                           MCStreamer &Out, uint64_t &ErrorInfo,
                                           bool MatchingInlineAsm) {
  MCInst Inst;

  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
  case Match_Success:
    // Post-process instructions (typically extended mnemonics)
    ProcessInstruction(Inst, Operands);
    Inst.setLoc(IDLoc);
    Out.EmitInstruction(Inst, getSTI());
    return false;
  • MatchInstructionImpl is automatically generated by LLVM TableGen
  • If we are the Asm-Streamer (textual output):
// Textual-output path: prints the instruction as assembly text, preceded by
// an encoding comment (bytes + fixups) when a code emitter is available.
void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
                                    const MCSubtargetInfo &STI) {
  // Show the encoding in a comment if we have a code emitter.
  AddEncodingComment(Inst, STI);

  ...
  // Let the target streamer pretty-print if present, otherwise fall back to
  // the generic instruction printer.
  if(getTargetStreamer())
    getTargetStreamer()->prettyPrintAsm(*InstPrinter, OS, Inst, STI);
  else
    InstPrinter->printInst(&Inst, OS, "", STI);
// Encodes the instruction (just to recover its bytes and its fixup records)
// and prints the result as a comment next to the assembly text.
void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
                                       const MCSubtargetInfo &STI) {
  ...
  getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);

  // This shows the Fixups information,
  // e.g. #   fixup A - offset: 0, value: .LBB0_4, kind: fixup_ppc_brcond14
// Computes the instruction's binary encoding; getBinaryCodeForInstr (a
// TableGen'erated function) also appends any fixups it discovers to Fixups.
// The encoded word is then written in the target's byte order.
void PPCMCCodeEmitter::encodeInstruction(
    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
    const MCSubtargetInfo &STI) const {
  uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);

  // Output the constant in big/little endian byte order.
  unsigned Size = getInstSizeInBytes(MI);
  switch (Size) {
  case 0:
    break;
  case 4:
    support::endian::write<uint32_t>(OS, Bits, E);
    break;
  • getBinaryCodeForInstr: TableGen'erated function
// Asm-streamer finalization: only debug-info generation remains here —
// fixups were already printed as comments, so no relocation resolution
// happens on the textual path.
void MCAsmStreamer::FinishImpl() {
  ... generating dwarf info
}
  • If we are the ELF-Streamer (object output):
// Object-streamer entry point: wraps the actual emission with the backend's
// code-padding begin/end hooks.
void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
                                       const MCSubtargetInfo &STI) {
  getAssembler().getBackend().handleCodePaddingInstructionBegin(Inst);
  EmitInstructionImpl(Inst, STI);
  getAssembler().getBackend().handleCodePaddingInstructionEnd(Inst);
}
// Decides how the instruction reaches the object file:
//   - no relaxation possible        -> emit straight into a data fragment
//   - RelaxAll / bundle-locked      -> relax fully now, then emit as data
//   - otherwise                     -> emit into its own relaxable fragment
// It first calls the MCStreamer base implementation, which registers the
// symbols referenced by the instruction's expression operands.
void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
                                           const MCSubtargetInfo &STI) {
  MCStreamer::EmitInstruction(Inst, STI);

  MCSection *Sec = getCurrentSectionOnly();
  Sec->setHasInstructions(true);

  // Now that a machine instruction has been assembled into this section, make
  // a line entry for any .loc directive that has been seen.
  MCDwarfLineEntry::Make(this, getCurrentSectionOnly());

  // If this instruction doesn't need relaxation, just emit it as data.
  MCAssembler &Assembler = getAssembler();
  if (!Assembler.getBackend().mayNeedRelaxation(Inst, STI)) {
    EmitInstToData(Inst, STI);
    return;
  }

  // Otherwise, relax and emit it as data if either:
  // - The RelaxAll flag was passed
  // - Bundling is enabled and this instruction is inside a bundle-locked
  //   group. We want to emit all such instructions into the same data
  //   fragment.
  if (Assembler.getRelaxAll() ||
      (Assembler.isBundlingEnabled() && Sec->isBundleLocked())) {
    MCInst Relaxed;
    getAssembler().getBackend().relaxInstruction(Inst, STI, Relaxed);
    while (getAssembler().getBackend().mayNeedRelaxation(Relaxed, STI))
      getAssembler().getBackend().relaxInstruction(Relaxed, STI, Relaxed);
    EmitInstToData(Relaxed, STI);
    return;
  }

  // Otherwise emit to a separate fragment.
  EmitInstToFragment(Inst, STI);
}
// Base-class hook: walks the instruction's expression operands so that any
// symbols they reference (e.g. the .LBB0_5 branch target) get visited and
// registered as used.
void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &) {
  // Scan for values.
  for (unsigned i = Inst.getNumOperands(); i--;)
    if (Inst.getOperand(i).isExpr())
      visitUsedExpr(*Inst.getOperand(i).getExpr());
}
// ELF path: encodes the instruction bytes, then records the resulting
// fixups alongside the data in the current fragment. The fixups are what
// later become relocation entries (or get resolved) during layout.
void MCELFStreamer::EmitInstToData(const MCInst &Inst,
                                   const MCSubtargetInfo &STI) {
  ...
  Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);

  // Rewrite symbol modifiers for TLS-related fixups (see helper's name —
  // exact behavior lives in fixSymbolsInTLSFixups, not shown here).
  for (unsigned i = 0, e = Fixups.size(); i != e; ++i)
    fixSymbolsInTLSFixups(Fixups[i].getValue());

  if (Assembler.isBundlingEnabled()) {
    ...
  }

  // Add the fixups and data.
  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
    ...
  }
// ELF-streamer finalization: aligns the last section for bundling if
// necessary, emits CG-profile and frame info, then chains to the
// MCObjectStreamer finalization (which drives MCAssembler::Finish).
void MCELFStreamer::FinishImpl() {
  // Ensure the last section gets aligned if necessary.
  MCSection *CurSection = getCurrentSectionOnly();
  setSectionAlignmentForBundling(getAssembler(), CurSection);

  finalizeCGProfile();
  EmitFrames(nullptr);

  this->MCObjectStreamer::FinishImpl();
}
// Object-streamer finalization: assigns any still-pending labels, resolves
// fixups whose offsets were forward references, then hands control to the
// assembler for layout and object-file writing.
void MCObjectStreamer::FinishImpl() {
  ...
  flushPendingLabels();
  resolvePendingFixups();
  getAssembler().Finish();
}
// Declaration excerpt: the two flushPendingLabels overloads used above.
class MCObjectStreamer : public MCStreamer {
  ...
  /// If any labels have been emitted but not assigned fragments, ensure that
  /// they get assigned, either to F if possible or to a new data fragment.
  /// Optionally, it is also possible to provide an offset \p FOffset, which
  /// will be used as a symbol offset within the fragment.
  void flushPendingLabels(MCFragment *F, uint64_t FOffset = 0);

  /// Create a dummy fragment to assign any pending labels.
  void flushPendingLabels() { flushPendingLabels(nullptr); }
// Attaches every pending label to fragment F at offset FOffset — this is
// what gives each label symbol a concrete fragment/offset location.
void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) {
  ...
  for (MCSymbol *Sym : PendingLabels) {
    Sym->setFragment(F);
    Sym->setOffset(FOffset);
  }
  PendingLabels.clear();
}
// When a fixup's offset is a forward-declared label, e.g.:
//
//   .reloc 1f, R_MIPS_JALR, foo
// 1: nop
//
// adding it to the Fixups vector is postponed until the label is defined and
// its offset is known.
void MCObjectStreamer::resolvePendingFixups() {
  for (PendingMCFixup &PendingFixup : PendingFixups) {
    // A fixup whose anchor symbol never got defined cannot be placed.
    if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) {
      getContext().reportError(PendingFixup.Fixup.getLoc(),
                               "unresolved relocation offset");
      continue;
    }
    // Make sure the symbol has a fragment/offset, then anchor the fixup at
    // the symbol's offset inside its data fragment.
    flushPendingLabels(PendingFixup.DF, PendingFixup.DF->getContents().size());
    PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset());
    PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup);
  }
  PendingFixups.clear();
}
// Final assembly step: computes the layout (which evaluates and applies
// fixups — see layout() below), then writes the finished object file.
void MCAssembler::Finish() {
  // Create the layout object.
  MCAsmLayout Layout(*this);
  layout(Layout);

  // Write the object file.
  stats::ObjectBytes += getWriter().writeObject(*this, Layout);
}
// Finalizes fragment layout, then walks every fixup of every fragment:
// handleFixup evaluates the target value and resolvedness (emitting a
// relocation entry when unresolved), and the backend patches the encoded
// bytes via applyFixup.
void MCAssembler::layout(MCAsmLayout &Layout) {
  ...
  // Finalize the layout, including fragment lowering.
  finishLayout(Layout);

  ...
  // Evaluate and apply the fixups, generating relocation entries as necessary.
  for (MCSection &Sec : *this) {
    for (MCFragment &Frag : Sec) {
      ...
      for (const MCFixup &Fixup : Fixups) {
        uint64_t FixedValue;
        bool IsResolved;
        MCValue Target;
        std::tie(Target, FixedValue, IsResolved) =
            handleFixup(Layout, Frag, Fixup);
        getBackend().applyFixup(*this, Fixup, Target, Contents, FixedValue,
                                IsResolved, STI);
      }
class PPCAsmBackend : public MCAsmBackend {
  ...
  // Patches the already-encoded instruction bytes in Data with the resolved
  // fixup Value, masked into the bitfield positions for this fixup kind.
  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                  const MCValue &Target, MutableArrayRef<char> Data,
                  uint64_t Value, bool IsResolved,
                  const MCSubtargetInfo *STI) const override {
    // Shift/mask the raw value into the instruction's bitfield layout.
    Value = adjustFixupValue(Fixup.getKind(), Value);
    if (!Value) return;           // Doesn't change encoding.

    unsigned Offset = Fixup.getOffset();
    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());

    // For each byte of the fragment that the fixup touches, mask in the bits
    // from the fixup value. The Value has been "split up" into the appropriate
    // bitfields above.
    for (unsigned i = 0; i != NumBytes; ++i) {
      unsigned Idx = Endian == support::little ? i : (NumBytes - 1 - i);
      Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
    }
  }

markdown test

  /// @}
  /// \name Associated Sections
  /// @{

  /// isDefined - Check if this symbol is defined (i.e., it has an address).
  ///
  /// Defined symbols are either absolute or in some section.
  /// Defined is simply the logical complement of undefined.
  bool isDefined() const { return !isUndefined(); }

  /// isUndefined - Check if this symbol is undefined (i.e., it has no
  /// address yet).
  ///
  /// Must be the logical complement of isDefined(): returning IsDefined
  /// directly would invert the meaning of both queries (setUndefined()
  /// clears IsDefined, and isDefined() is defined as !isUndefined()).
  bool isUndefined(bool SetUsed = true) const { return !IsDefined; }

  /// Mark the symbol as undefined by clearing the IsDefined flag.
  void setUndefined() { IsDefined = false; }

Monday, February 11, 2019

EIE: Efficient Inference Engine on Compressed Deep Neural Network


EIE.pdf: https://www.cs.virginia.edu/~smk9u/CS6501F16/p243-han.pdf

Note: This post covers part of the 'Evaluation Section', but I may split the 'Evaluation Section' into a separate post.

Let's start from AlexNet

(figure from: Analysis of Sparse Convolutional Neural Networks
  http://ece757.ece.wisc.edu/project_talks/sparse.pptx)

// AlexNet Layers in text.
  %conv1_1 = Conv(%data_0, %conv1_w_0, %conv1_b_0)
  %conv1_2 = Relu(%conv1_1)
  %norm1_1 = LRN(%conv1_2)
  %pool1_1 = MaxPool(%norm1_1)
  %conv2_1 = Conv(%pool1_1, %conv2_w_0, %conv2_b_0)
  %conv2_2 = Relu(%conv2_1)
  %norm2_1 = LRN(%conv2_2)
  %pool2_1 = MaxPool(%norm2_1)
  %conv3_1 = Conv(%pool2_1, %conv3_w_0, %conv3_b_0)
  %conv3_2 = Relu(%conv3_1)
  %conv4_1 = Conv(%conv3_2, %conv4_w_0, %conv4_b_0)
  %conv4_2 = Relu(%conv4_1)
  %conv5_1 = Conv(%conv4_2, %conv5_w_0, %conv5_b_0)
  %conv5_2 = Relu(%conv5_1)
  %pool5_1 = MaxPool(%conv5_2)
  %fc6_1 = Gemm(%pool5_1, %fc6_w_0, %fc6_b_0)
  %fc6_2 = Relu(%fc6_1)
  %fc6_3, %_fc6_mask_1 = Dropout(%fc6_2)
  %fc7_1 = Gemm(%fc6_3, %fc7_w_0, %fc7_b_0)
  %fc7_2 = Relu(%fc7_1)
  %fc7_3, %_fc7_mask_1 = Dropout(%fc7_2)
  %fc8_1 = Gemm(%fc7_3, %fc8_w_0, %fc8_b_0)
  %prob_1 = Softmax(%fc8_1)
  return %prob_1

The total parameter size of AlexNet is 240 MB; the FC6 weights (fc6_w_0) contribute 144 MB and the FC7 weights (fc7_w_0) contribute 64 MB.

The weights have to be stored in DRAM, and accessing DRAM is costly in terms of both cycle time and energy.

Table from EIE paper.

It's a heavy burden for power and performance constrained embedded devices, so, Professor Han et al. presented methodology DEEP COMPRESSION to compress deep neural network without loss of accuracy. The result is very impressive:



For AlexNet:
  • Original: 240 MB
  • Compressed: 6.9 MB (included the meta-data for sparse representation)
    => 35x compression rate

Performance: (DEEP COMPRESSION Figure 9)


Energy: (DEEP COMPRESSION Figure 10)

It gets very good performance in execution time and energy efficiency. In order to more efficiently operate on compressed model, Professor Han et al. proposed EIE (Efficient Inference Engine).


What is EIE?
  • A specialized accelerator that performs customized sparse matrix vector multiplication
  • A scalable array of processing elements (PEs)

Each PE
  • Holds 131K weights of the compressed model
  • Perform 800 million weight calculations per second
  • In 45nm CMOS:
    • area = 0.638mm2
    • dissipates = 9.16mW at 800MHz

Focus on Matrix-vector multiplication (M×V), Why?
  • Convolutional neural network (CNN)
    • Fully Connected layers are implemented with M×V
    • In the object detection algorithm Fast R-CNN (Girshick, 2015), 38% of the computation time is consumed by FC layers on the uncompressed model
  • Recurrent neural network (RNN)
    • M×V operations are performed on the new input and the hidden state at each time step, producing a new hidden state and the output.
    • Long-Short-Term-Memory (LSTM)
      • A widely used structure of RNN cell
      • Each LSTM cell can be decomposed into eight M×V operations
      • Widely used in image captioning, speech recognition and natural language processing

Fully Connected layer
  • Fully Connected Layers form the final layers of a typical CNN and are implemented as a Matrix-Vector Multiply operation (GEMV).
(Text and figure from: Analysis of Sparse Convolutional Neural Networks


M×V Computation, e.g. AlexNet and VGG-16
  • (Original) Uncompressed (Dense) Model
    • a is the input activation vector, b is the output activation vector, W is the weight matrix
  • Deep Compression (Sparse) Model
    • Weight sharing replaces each weight Wij with a four-bit index Iij into a shared table S of 16 possible weight values.
    • Xi represents the static sparsity of W
    • Y represents the dynamic sparsity of a
    • EIE perform the indexing S [Iij ] and the multiply-add only for non-zero of Wij and aj

EIE on Compressed sparse column (CSC) format, use figure 2 (4 PEs configuration) as an example
  • PE0 holds:
    • rows W0W4W8, W12 of Matrix W 
    • a0, a4 of input activations
    • b0 , b4 , b8 of output activations
      => Given PEk, PEk holds Wi, ai, bi for which i (mod N) = k.
  • Figure 3. Memory layout, use PE0 as an example
    • Virtual Weight: contains the non-zero weights
    • Relative Row Index:
      • Same length as Virtual Weight
      • Encodes the number of zeros before the corresponding entry in Virtual Weight
    • Each entry is represented by a four-bit value
      • If more than 15 zeros appear before a non-zero entry we add a zero in Virtual Weight
      • E.g. [0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3]
        • Virtual Weight = [1,2,0,3]
        • Relative Row Index = [2,0,15,2] 
    • Column Pointer
      • Pointing to the beginning of the vector for each column
      • Column Pointer[0] = 0, column 0's beginning = Virtual Weight [0]
      • Column Pointer[1] = 3, column 1's beginning = Virtual Weight [3]
      • Column Pointer[2] = 4, column 2's beginning = Virtual Weight [4]
      • ...
      • Column Pointer[7] = 11, column 7's beginning = Virtual Weight [11]
      • Column Pointer[8] = 13
        • final entry in Column Pointer points one beyond the last vector element
        • the number of non-zeros in column j (including padded zeros) is given by pj+1 − pj.
  • Multiplication, use figure 2 as an example.
    • Scan input activations a
    • a2 is not zero, broadcast value a2 and column index 2 to all PEs
    • PE0 multiplies a2 by W0,2 and W12,2
    • PE1 has all zeros in column 2
    • PE2 multiplies a2 by W2,2 and W14,2
    • PE3 multiplies a2 by W11,2 and W15,2
    • And so on ..
  • This process may suffer load imbalance because each PE may have a different number of non-zeros in a particular column.
    • Can be reduced by queuing.

EIE Hardware Implementation
  • Central Control Unit (CCU)
    • Controls an array of PEs
    • Receives non-zero input activations from a distributed Leading Non-zero Detection network and broadcasts these to the PEs
    • Communicates with the host
  • Activation Queue (Act Queue)
    • Get Non-zero element aj and index j from queue (pushed by CCU)
  • Pointer Read Unit
    • Index j is used to look up the start and end pointers pj and pj+1
    • Store pointers in two single-ported SRAM banks for one cycle access
    • Pointers are 16-bits in length.
  • Sparse Matrix Read Unit
    • Each entry in the SRAM is 8-bits
      • 4-bit for Virtual Weight, 4-bit for Relative Row Index
    • For efficiency (see paper Section VI) the PE’s slice of encoded sparse matrix I is stored in a 64-bit-wide SRAM
      => fetch 8 entries at a time
  • Arithmetic Unit
    • Decode 4-bit Virtual Weight to a 16-bit fixed-point number via a table look up
    • Use Relative Row Index to index an accumulator array (the destination activation registers) 
    • A bypass path
      • if the same accumulator is selected on two adjacent cycles.
  • Activation Read/Write
    • Two activation register files for source and destination
    • The roles of source and destination are exchanged in the next layer
      => no additional data transfer is needed to support multi-layer feed-forward computation
  •  Distributed Leading Non-Zero Detection
    • Each group of 4 PEs does a local leading non-zero detection on their input activation. The result is sent to a Leading Non-zero Detection Node (LNZD Node)
    • Each LNZD node finds the next non-zero activation across its four children and sends this result up the quadtree
    • At the root LNZD Node, the selected non-zero activation is broadcast back to all the PEs via a separate wire placed in an H-tree
Evaluation


 



Sunday, February 10, 2019

How to SSH gate forwarding git user to gitserver

Public Domain (git, e.g. clone in ssh way) -> Public Server (Linux Based) -> Private Git Server

How to configure the 'Public Server' so users in the Public Domain can access the 'Private Git Server'?

Use iptables forwarding on the 'Public Server'. See:
https://serverfault.com/questions/437952/how-to-ssh-gate-forwarding-git-user-to-gitserver

Setting up iptables (Chinese):
http://s2.naes.tn.edu.tw/~kv/iptables.htm