Wednesday, February 27, 2019

[llvm assembler] fix relocation symbol flow

$ cat ppc-br-test.s
  bge 0, .LBB0_4
.LBB0_4:
  bge 0, .LBB0_5
  blr
$ llvm-mc -triple powerpc64le-unknown-unknown --show-encoding ppc-br-test.s
  .text
  bge 0, .LBB0_4 # encoding: [0bAAAAAA00,A,0x80,0x40]
                 #   fixup A - offset: 0, value: .LBB0_4, kind: fixup_ppc_brcond14
.LBB0_4:
  bge 0, .LBB0_5 # encoding: [0bAAAAAA00,A,0x80,0x40]
                 #   fixup A - offset: 0, value: .LBB0_5, kind: fixup_ppc_brcond14

  blr            # encoding: [0x20,0x00,0x80,0x4e]
Code flow, using PowerPC as our target example:
// Top-level assembler driver: creates the initial text section, parses
// statements until EOF, then finalizes the output stream (Out.Finish()),
// which is where object-file layout and fixup resolution happen.
// Returns true if any error occurred.
bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
  // Create the initial section, if requested.
  if (!NoInitialTextSection)
    Out.InitSections(false);

  ...
  // While we have input, parse each statement.
  while (Lexer.isNot(AsmToken::Eof)) {
    ParseStatementInfo Info(&AsmStrRewrites);
    if (!parseStatement(Info, nullptr))
      continue;
    ...
  // Finalize the output stream if there are no errors and if the client wants
  // us to.
  if (!HadError && !NoFinalize)
    Out.Finish();

  return HadError || getContext().hadError();
}
// Finalization entry point shared by all streamers: gives the target
// streamer (if any) a chance to flush its state, then dispatches to the
// streamer-specific FinishImpl() — the point where the Asm and ELF
// streamer paths diverge (see below).
void MCStreamer::Finish() {
  ..
  MCTargetStreamer *TS = getTargetStreamer();
  if (TS)
    TS->finish();

  FinishImpl();
}
// Parses one statement; when parsing succeeds, the parsed operands are
// handed to the target-specific parser, which matches them against the
// instruction tables and emits the instruction through the streamer Out.
bool AsmParser::parseStatement(ParseStatementInfo &Info,
                               MCAsmParserSemaCallback *SI) {
  ...
  // If parsing succeeded, match the instruction.
  if (!ParseHadError) {
    uint64_t ErrorInfo;
    if (getTargetParser().MatchAndEmitInstruction(
            IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
            getTargetParser().isParsingInlineAsm()))
// PowerPC target hook: matches the parsed operands against the generated
// instruction tables, post-processes extended mnemonics, records the source
// location on the MCInst, and emits it via the streamer.
// Returns false on success (MatchAndEmitInstruction convention).
bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                           OperandVector &Operands,
                                           MCStreamer &Out, uint64_t &ErrorInfo,
                                           bool MatchingInlineAsm) {
  MCInst Inst;

  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
  case Match_Success:
    // Post-process instructions (typically extended mnemonics)
    ProcessInstruction(Inst, Operands);
    Inst.setLoc(IDLoc);
    Out.EmitInstruction(Inst, getSTI());
    return false;
  • MatchInstructionImpl is automatically generated by LLVM TableGen
  • If we are the Asm-Streamer (textual output):
// Textual-output path: prints the instruction as assembly text, preceded by
// an encoding comment (bytes + fixups) when a code emitter is available.
void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
                                    const MCSubtargetInfo &STI) {
  // Show the encoding in a comment if we have a code emitter.
  AddEncodingComment(Inst, STI);

  ...
  // Let the target streamer pretty-print if present, otherwise fall back to
  // the generic instruction printer.
  if(getTargetStreamer())
    getTargetStreamer()->prettyPrintAsm(*InstPrinter, OS, Inst, STI);
  else
    InstPrinter->printInst(&Inst, OS, "", STI);
// Encodes the instruction (just to recover its bytes and its fixup records)
// and prints the result as a comment next to the assembly text.
void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
                                       const MCSubtargetInfo &STI) {
  ...
  getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);

  // This shows the Fixups information,
  // e.g. #   fixup A - offset: 0, value: .LBB0_4, kind: fixup_ppc_brcond14
// Computes the instruction's binary encoding; getBinaryCodeForInstr (a
// TableGen'erated function) also appends any fixups it discovers to Fixups.
// The encoded word is then written in the target's byte order.
void PPCMCCodeEmitter::encodeInstruction(
    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
    const MCSubtargetInfo &STI) const {
  uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);

  // Output the constant in big/little endian byte order.
  unsigned Size = getInstSizeInBytes(MI);
  switch (Size) {
  case 0:
    break;
  case 4:
    support::endian::write<uint32_t>(OS, Bits, E);
    break;
  • getBinaryCodeForInstr: TableGen'erated function
// Asm-streamer finalization: only debug-info generation remains here —
// fixups were already printed as comments, so no relocation resolution
// happens on the textual path.
void MCAsmStreamer::FinishImpl() {
  ... generating dwarf info
}
  • If we are the ELF-Streamer (object output):
// Object-streamer entry point: wraps the actual emission with the backend's
// code-padding begin/end hooks.
void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
                                       const MCSubtargetInfo &STI) {
  getAssembler().getBackend().handleCodePaddingInstructionBegin(Inst);
  EmitInstructionImpl(Inst, STI);
  getAssembler().getBackend().handleCodePaddingInstructionEnd(Inst);
}
// Decides how the instruction reaches the object file:
//   - no relaxation possible        -> emit straight into a data fragment
//   - RelaxAll / bundle-locked      -> relax fully now, then emit as data
//   - otherwise                     -> emit into its own relaxable fragment
// It first calls the MCStreamer base implementation, which registers the
// symbols referenced by the instruction's expression operands.
void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
                                           const MCSubtargetInfo &STI) {
  MCStreamer::EmitInstruction(Inst, STI);

  MCSection *Sec = getCurrentSectionOnly();
  Sec->setHasInstructions(true);

  // Now that a machine instruction has been assembled into this section, make
  // a line entry for any .loc directive that has been seen.
  MCDwarfLineEntry::Make(this, getCurrentSectionOnly());

  // If this instruction doesn't need relaxation, just emit it as data.
  MCAssembler &Assembler = getAssembler();
  if (!Assembler.getBackend().mayNeedRelaxation(Inst, STI)) {
    EmitInstToData(Inst, STI);
    return;
  }

  // Otherwise, relax and emit it as data if either:
  // - The RelaxAll flag was passed
  // - Bundling is enabled and this instruction is inside a bundle-locked
  //   group. We want to emit all such instructions into the same data
  //   fragment.
  if (Assembler.getRelaxAll() ||
      (Assembler.isBundlingEnabled() && Sec->isBundleLocked())) {
    MCInst Relaxed;
    getAssembler().getBackend().relaxInstruction(Inst, STI, Relaxed);
    while (getAssembler().getBackend().mayNeedRelaxation(Relaxed, STI))
      getAssembler().getBackend().relaxInstruction(Relaxed, STI, Relaxed);
    EmitInstToData(Relaxed, STI);
    return;
  }

  // Otherwise emit to a separate fragment.
  EmitInstToFragment(Inst, STI);
}
// Base-class hook: walks the instruction's expression operands so that any
// symbols they reference (e.g. the .LBB0_5 branch target) get visited and
// registered as used.
void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &) {
  // Scan for values.
  for (unsigned i = Inst.getNumOperands(); i--;)
    if (Inst.getOperand(i).isExpr())
      visitUsedExpr(*Inst.getOperand(i).getExpr());
}
// ELF path: encodes the instruction bytes, then records the resulting
// fixups alongside the data in the current fragment. The fixups are what
// later become relocation entries (or get resolved) during layout.
void MCELFStreamer::EmitInstToData(const MCInst &Inst,
                                   const MCSubtargetInfo &STI) {
  ...
  Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);

  // Rewrite symbol modifiers for TLS-related fixups (see helper's name —
  // exact behavior lives in fixSymbolsInTLSFixups, not shown here).
  for (unsigned i = 0, e = Fixups.size(); i != e; ++i)
    fixSymbolsInTLSFixups(Fixups[i].getValue());

  if (Assembler.isBundlingEnabled()) {
    ...
  }

  // Add the fixups and data.
  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
    ...
  }
// ELF-streamer finalization: aligns the last section for bundling if
// necessary, emits CG-profile and frame info, then chains to the
// MCObjectStreamer finalization (which drives MCAssembler::Finish).
void MCELFStreamer::FinishImpl() {
  // Ensure the last section gets aligned if necessary.
  MCSection *CurSection = getCurrentSectionOnly();
  setSectionAlignmentForBundling(getAssembler(), CurSection);

  finalizeCGProfile();
  EmitFrames(nullptr);

  this->MCObjectStreamer::FinishImpl();
}
// Object-streamer finalization: assigns any still-pending labels, resolves
// fixups whose offsets were forward references, then hands control to the
// assembler for layout and object-file writing.
void MCObjectStreamer::FinishImpl() {
  ...
  flushPendingLabels();
  resolvePendingFixups();
  getAssembler().Finish();
}
// Declaration excerpt: the two flushPendingLabels overloads used above.
class MCObjectStreamer : public MCStreamer {
  ...
  /// If any labels have been emitted but not assigned fragments, ensure that
  /// they get assigned, either to F if possible or to a new data fragment.
  /// Optionally, it is also possible to provide an offset \p FOffset, which
  /// will be used as a symbol offset within the fragment.
  void flushPendingLabels(MCFragment *F, uint64_t FOffset = 0);

  /// Create a dummy fragment to assign any pending labels.
  void flushPendingLabels() { flushPendingLabels(nullptr); }
// Attaches every pending label to fragment F at offset FOffset — this is
// what gives each label symbol a concrete fragment/offset location.
void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) {
  ...
  for (MCSymbol *Sym : PendingLabels) {
    Sym->setFragment(F);
    Sym->setOffset(FOffset);
  }
  PendingLabels.clear();
}
// When a fixup's offset is a forward-declared label, e.g.:
//
//   .reloc 1f, R_MIPS_JALR, foo
// 1: nop
//
// adding it to the Fixups vector is postponed until the label is defined and
// its offset is known.
void MCObjectStreamer::resolvePendingFixups() {
  for (PendingMCFixup &PendingFixup : PendingFixups) {
    // A fixup whose anchor symbol never got defined cannot be placed.
    if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) {
      getContext().reportError(PendingFixup.Fixup.getLoc(),
                               "unresolved relocation offset");
      continue;
    }
    // Make sure the symbol has a fragment/offset, then anchor the fixup at
    // the symbol's offset inside its data fragment.
    flushPendingLabels(PendingFixup.DF, PendingFixup.DF->getContents().size());
    PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset());
    PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup);
  }
  PendingFixups.clear();
}
// Final assembly step: computes the layout (which evaluates and applies
// fixups — see layout() below), then writes the finished object file.
void MCAssembler::Finish() {
  // Create the layout object.
  MCAsmLayout Layout(*this);
  layout(Layout);

  // Write the object file.
  stats::ObjectBytes += getWriter().writeObject(*this, Layout);
}
// Finalizes fragment layout, then walks every fixup of every fragment:
// handleFixup evaluates the target value and resolvedness (emitting a
// relocation entry when unresolved), and the backend patches the encoded
// bytes via applyFixup.
void MCAssembler::layout(MCAsmLayout &Layout) {
  ...
  // Finalize the layout, including fragment lowering.
  finishLayout(Layout);

  ...
  // Evaluate and apply the fixups, generating relocation entries as necessary.
  for (MCSection &Sec : *this) {
    for (MCFragment &Frag : Sec) {
      ...
      for (const MCFixup &Fixup : Fixups) {
        uint64_t FixedValue;
        bool IsResolved;
        MCValue Target;
        std::tie(Target, FixedValue, IsResolved) =
            handleFixup(Layout, Frag, Fixup);
        getBackend().applyFixup(*this, Fixup, Target, Contents, FixedValue,
                                IsResolved, STI);
      }
class PPCAsmBackend : public MCAsmBackend {
  ...
  // Patches the already-encoded instruction bytes in Data with the resolved
  // fixup Value, masked into the bitfield positions for this fixup kind.
  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                  const MCValue &Target, MutableArrayRef<char> Data,
                  uint64_t Value, bool IsResolved,
                  const MCSubtargetInfo *STI) const override {
    // Shift/mask the raw value into the instruction's bitfield layout.
    Value = adjustFixupValue(Fixup.getKind(), Value);
    if (!Value) return;           // Doesn't change encoding.

    unsigned Offset = Fixup.getOffset();
    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());

    // For each byte of the fragment that the fixup touches, mask in the bits
    // from the fixup value. The Value has been "split up" into the appropriate
    // bitfields above.
    for (unsigned i = 0; i != NumBytes; ++i) {
      unsigned Idx = Endian == support::little ? i : (NumBytes - 1 - i);
      Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
    }
  }

markdown test

  /// @}
  /// \name Associated Sections
  /// @{

  /// isDefined - Check if this symbol is defined (i.e., it has an address).
  ///
  /// Defined symbols are either absolute or in some section.
  /// Defined is simply the logical complement of undefined.
  bool isDefined() const { return !isUndefined(); }

  /// isUndefined - Check if this symbol is undefined (i.e., it has no
  /// address yet).
  ///
  /// Must be the logical complement of isDefined(): returning IsDefined
  /// directly would invert the meaning of both queries (setUndefined()
  /// clears IsDefined, and isDefined() is defined as !isUndefined()).
  bool isUndefined(bool SetUsed = true) const { return !IsDefined; }

  /// Mark the symbol as undefined by clearing the IsDefined flag.
  void setUndefined() { IsDefined = false; }

Monday, February 11, 2019

EIE: Efficient Inference Engine on Compressed Deep Neural Network


EIE.pdf: https://www.cs.virginia.edu/~smk9u/CS6501F16/p243-han.pdf

Note: This post covers part of the 'Evaluation Section', but I may split the 'Evaluation Section' into a separate post.

Let's start from AlexNet

(figure from: Analysis of Sparse Convolutional Neural Networks
  http://ece757.ece.wisc.edu/project_talks/sparse.pptx)

// AlexNet Layers in text.
  %conv1_1 = Conv(%data_0, %conv1_w_0, %conv1_b_0)
  %conv1_2 = Relu(%conv1_1)
  %norm1_1 = LRN(%conv1_2)
  %pool1_1 = MaxPool(%norm1_1)
  %conv2_1 = Conv(%pool1_1, %conv2_w_0, %conv2_b_0)
  %conv2_2 = Relu(%conv2_1)
  %norm2_1 = LRN(%conv2_2)
  %pool2_1 = MaxPool(%norm2_1)
  %conv3_1 = Conv(%pool2_1, %conv3_w_0, %conv3_b_0)
  %conv3_2 = Relu(%conv3_1)
  %conv4_1 = Conv(%conv3_2, %conv4_w_0, %conv4_b_0)
  %conv4_2 = Relu(%conv4_1)
  %conv5_1 = Conv(%conv4_2, %conv5_w_0, %conv5_b_0)
  %conv5_2 = Relu(%conv5_1)
  %pool5_1 = MaxPool(%conv5_2)
  %fc6_1 = Gemm(%pool5_1, %fc6_w_0, %fc6_b_0)
  %fc6_2 = Relu(%fc6_1)
  %fc6_3, %_fc6_mask_1 = Dropout(%fc6_2)
  %fc7_1 = Gemm(%fc6_3, %fc7_w_0, %fc7_b_0)
  %fc7_2 = Relu(%fc7_1)
  %fc7_3, %_fc7_mask_1 = Dropout(%fc7_2)
  %fc8_1 = Gemm(%fc7_3, %fc8_w_0, %fc8_b_0)
  %prob_1 = Softmax(%fc8_1)
  return %prob_1

The total parameter size of AlexNet is 240 MB; the FC6 weights (fc6_w_0) contribute 144 MB and the FC7 weights (fc7_w_0) contribute 64 MB.

The weights have to be stored in DRAM, and accessing DRAM is costly in terms of both cycle time and energy.

Table from EIE paper.

It's a heavy burden for power and performance constrained embedded devices, so, Professor Han et al. presented methodology DEEP COMPRESSION to compress deep neural network without loss of accuracy. The result is very impressive:



For AlexNet:
  • Original: 240 MB
  • Compressed: 6.9 MB (included the meta-data for sparse representation)
    => 35x compression rate

Performance: (DEEP COMPRESSION Figure 9)


Energy: (DEEP COMPRESSION Figure 10)

It gets very good performance in execution time and energy efficiency. In order to more efficiently operate on compressed model, Professor Han et al. proposed EIE (Efficient Inference Engine).


What is EIE?
  • A specialized accelerator that performs customized sparse matrix vector multiplication
  • A scalable array of processing elements (PEs)

Each PE
  • Holds 131K weights of the compressed model
  • Perform 800 million weight calculations per second
  • In 45nm CMOS:
    • area = 0.638mm2
    • dissipates = 9.16mW at 800MHz

Focus on Matrix-vector multiplication (M×V), Why?
  • Convolutional neural network (CNN)
    • Fully Connected layers are implemented with M×V
    • In the object detection algorithm Fast R-CNN (Girshick, 2015), 38% of the computation time is consumed by FC layers on the uncompressed model
  • Recurrent neural network (RNN)
    • M×V operations are performed on the new input and the hidden state at each time step, producing a new hidden state and the output.
    • Long-Short-Term-Memory (LSTM)
      • A widely used structure of RNN cell
      • Each LSTM cell can be decomposed into eight M×V operations
      • Widely used in image captioning, speech recognition and natural language processing

Fully Connected layer
  • Fully Connected Layers form the final layers of a typical CNN and are implemented as a Matrix-Vector Multiply operation (GEMV).
(Text and figure from: Analysis of Sparse Convolutional Neural Networks


M×V Computation, e.g. AlexNet and VGG-16
  • (Original) Uncompressed (Dense) Model
    • a is the input activation vector, b is the output activation vector, W is the weight matrix
  • Deep Compression (Sparse) Model
    • Weight sharing replaces each weight Wij with a four-bit index Iij into a shared table S of 16 possible weight values.
    • Xi represents the static sparsity of W
    • Y represents the dynamic sparsity of a
    • EIE perform the indexing S [Iij ] and the multiply-add only for non-zero of Wij and aj

EIE on Compressed sparse column (CSC) format, use figure 2 (4 PEs configuration) as an example
  • PE0 holds:
    • rows W0W4W8, W12 of Matrix W 
    • a0, a4 of input activations
    • b0 , b4 , b8 of output activations
      => Given PEk, PEk holds Wi, ai, bi for which i (mod N) = k.
  • Figure 3. Memory layout, use PE0 as an example
    • Virtual Weight: contains the non-zero weights
    • Relative Row Index:
      • Same length as Virtual Weight
      • Encodes the number of zeros before the corresponding entry in Virtual Weight
    • Each entry is represented by a four-bit value
      • If more than 15 zeros appear before a non-zero entry we add a zero in Virtual Weight
      • E.g. [0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3]
        • Virtual Weight = [1,2,0,3]
        • Relative Row Index = [2,0,15,2] 
    • Column Pointer
      • Pointing to the beginning of the vector for each column
      • Column Pointer[0] = 0, column 0's beginning = Virtual Weight [0]
      • Column Pointer[1] = 3, column 1's beginning = Virtual Weight [3]
      • Column Pointer[2] = 4, column 2's beginning = Virtual Weight [4]
      • ...
      • Column Pointer[7] = 11, column 7's beginning = Virtual Weight [11]
      • Column Pointer[8] = 13
        • final entry in Column Pointer points one beyond the last vector element
        • the number of non-zeros in column j (including padded zeros) is given by pj+1 − pj.
  • Multiplication, use figure 2 as an example.
    • Scan input activations a
    • a2 is not zero, broadcast value a2 and column index 2 to all PEs
    • PE0 multiplies a2 by W0,2 and W12,2
    • PE1 has all zeros in column 2
    • PE2 multiplies a2 by W2,2 and W14,2
    • PE3 multiplies a2 by W11,2 and W15,2
    • And so on ..
  • This process may suffer load imbalance because each PE may have a different number of non-zeros in a particular column.
    • Can be reduced by queuing.

EIE Hardware Implementation
  • Central Control Unit (CCU)
    • Controls an array of PEs
    • Receives non-zero input activations from a distributed Leading Non-zero Detection network and broadcasts these to the PEs
    • Communicates with the host
  • Activation Queue (Act Queue)
    • Get Non-zero element aj and index j from queue (pushed by CCU)
  • Pointer Read Unit
    • Index j is used to look up the start and end pointers pj and pj+1
    • Store pointers in two single-ported SRAM banks for one cycle access
    • Pointers are 16-bits in length.
  • Sparse Matrix Read Unit
    • Each entry in the SRAM is 8-bits
      • 4-bit for Virtual Weight, 4-bit for Relative Row Index
    • For efficiency (see paper Section VI) the PE’s slice of encoded sparse matrix I is stored in a 64-bit-wide SRAM
      => fetch 8 entries at a time
  • Arithmetic Unit
    • Decode 4-bit Virtual Weight to a 16-bit fixed-point number via a table look up
    • Use Relative Row Index to index an accumulator array (the destination activation registers) 
    • A bypass path
      • if the same accumulator is selected on two adjacent cycles.
  • Activation Read/Write
    • Two activation register files for source and destination
    • The roles of source and destination are exchanged in the next layer
      => no additional data transfer is needed to support multi-layer feed-forward computation
  •  Distributed Leading Non-Zero Detection
    • Each group of 4 PEs does a local leading non-zero detection on their input activation. The result is sent to a Leading Non-zero Detection Node (LNZD Node)
    • Each LNZD node finds the next non-zero activation across its four children and sends this result up the quadtree
    • At the root LNZD Node, the selected non-zero activation is broadcast back to all the PEs via a separate wire placed in an H-tree
Evaluation


 



Sunday, February 10, 2019

How to SSH gate forwarding git user to gitserver

Public Domain (git, e.g. clone in ssh way) -> Public Server (Linux Based) -> Private Git Server

How to configure the 'Public Server' so users in the Public Domain can access the 'Private Git Server'?

Use iptables forwarding on the 'Public Server'. See:
https://serverfault.com/questions/437952/how-to-ssh-gate-forwarding-git-user-to-gitserver

Setting up iptables (Chinese):
http://s2.naes.tn.edu.tw/~kv/iptables.htm