Eyeriss

Eyeriss-like design

The following are the configuration files for an Eyeriss-like design: the architecture description, the architecture constraints, and the mapping constraints.

Architecture

architecture:
  # ============================================================
  # Architecture Description
  # ============================================================
  # Hierarchy declared below:
  #   system  -> DRAM
  #   eyeriss -> shared_glb and DummyBuffer[0..13]
  #   PE[0..167] -> ifmap_spad, weights_spad, psum_spad, mac
  # NOTE(review): version is kept as an unquoted number; presumably the
  # consuming tool compares it numerically - confirm before quoting it.
  version: 0.3
  subtree:
    - name: system
      local:
        # Off-chip memory. width equals block-size x word-bits (4 x 16 = 64).
        - name: DRAM
          class: DRAM
          attributes:
            type: LPDDR4
            width: 64
            block-size: 4
            word-bits: 16
      subtree:
        - name: eyeriss
          attributes:
            technology: 45nm
          local:
            # Global buffer shared by the whole PE array.
            # memory_width equals block-size x word-bits (4 x 16 = 64).
            - name: shared_glb
              class: smartbuffer_SRAM
              attributes:
                memory_depth: 16384
                memory_width: 64
                n_banks: 32
                block-size: 4
                word-bits: 16
                read_bandwidth: 16
                write_bandwidth: 16
            # 14 instances (indices 0..13), matching meshX: 14.
            - name: DummyBuffer[0..13]  # for better mapping
              class: regfile
              attributes:
                depth: 16
                width: 16
                word-bits: 16
                block-size: 1
                meshX: 14
          subtree:
            # 168 instances (indices 0..167). With meshX: 14 this presumably
            # describes 12 rows of 14 PEs - confirm against the tool docs.
            - name: PE[0..167]
              local:
                # Per-PE scratchpads: one 16-bit word per entry
                # (block-size 1 x word-bits 16 = memory_width 16).
                - name: ifmap_spad
                  class: smartbuffer_RF
                  attributes:
                    memory_depth: 12
                    memory_width: 16
                    block-size: 1
                    word-bits: 16
                    meshX: 14
                    read_bandwidth: 2
                    write_bandwidth: 2
                - name: weights_spad
                  class: smartbuffer_RF
                  attributes:
                    memory_depth: 192
                    memory_width: 16
                    block-size: 1
                    word-bits: 16
                    meshX: 14
                    read_bandwidth: 2
                    write_bandwidth: 2
                # The only scratchpad that sets update_fifo_depth.
                - name: psum_spad
                  class: smartbuffer_RF
                  attributes:
                    memory_depth: 16
                    memory_width: 16
                    update_fifo_depth: 2
                    block-size: 1
                    word-bits: 16
                    meshX: 14
                    read_bandwidth: 2
                    write_bandwidth: 2
                # Compute unit; datawidth matches the 16-bit word size used
                # by every storage level above.
                - name: mac
                  class: intmac
                  attributes:
                    datawidth: 16
                    meshX: 14

Architecture Constraints

#
# The following constraints are limitations of the hardware architecture and dataflow
#

architecture_constraints:
  targets:
    # ----------------------------------------------------------
    # Bypass constraints: each buffer only stores certain datatypes.
    # ----------------------------------------------------------
    # psum_spad holds Outputs only.
    - target: psum_spad
      type: bypass
      bypass: [Inputs, Weights]
      keep: [Outputs]
    # weights_spad holds Weights only.
    - target: weights_spad
      type: bypass
      bypass: [Inputs, Outputs]
      keep: [Weights]
    # ifmap_spad holds Inputs only.
    - target: ifmap_spad
      type: bypass
      bypass: [Weights, Outputs]
      keep: [Inputs]
    # DummyBuffer keeps nothing: all three datatypes bypass it.
    - target: DummyBuffer
      type: bypass
      bypass: [Inputs, Outputs, Weights]
    # shared_glb holds Inputs and Outputs; Weights bypass it.
    - target: shared_glb
      type: bypass
      bypass: [Weights]
      keep: [Inputs, Outputs]

    # ----------------------------------------------------------
    # Spatial constraints.
    # ----------------------------------------------------------
    # N, P, Q and R are pinned to 1 below DummyBuffer.
    # NOTE(review): split: 4 presumably assigns the first four permutation
    # entries (NPQR) to one mesh axis and the rest (SCM) to the other, and a
    # factor of 0 (S=0) presumably means "use whatever remains of that
    # dimension" - confirm both against the mapper documentation.
    - target: DummyBuffer
      type: spatial
      split: 4
      permutation: NPQR SCM
      factors: N=1 P=1 Q=1 R=1 S=0
    # only allow fanout of M, Q out from glb
    # (every other dimension has its spatial factor pinned to 1)
    - target: shared_glb
      type: spatial
      split: 7
      permutation: NCPRSQM
      factors: N=1 C=1 P=1 R=1 S=1

    # ----------------------------------------------------------
    # Temporal constraints.
    # ----------------------------------------------------------
    # one ofmap position but of different output channels:
    # only M is left unconstrained at psum_spad
    - target: psum_spad
      type: temporal
      permutation: NCPQRS M
      factors: N=1 C=1 R=1 S=1 P=1 Q=1
    # row stationary -> 1 row at a time:
    # C is left unconstrained and R is given as 0 at weights_spad
    - target: weights_spad
      type: temporal
      permutation: NMPQS CR
      factors: N=1 M=1 P=1 Q=1 S=1 R=0
    # every temporal factor at ifmap_spad is pinned to 1
    - target: ifmap_spad
      type: temporal
      permutation: NMCPQRS
      factors: N=1 M=1 C=1 P=1 Q=1 R=1 S=1
    # DummyBuffer bypasses every datatype, so enforce that hardware limit by
    # pinning all of its temporal factors to 1
    - target: DummyBuffer
      type: temporal
      factors: N=1 M=1 C=1 P=1 Q=1 R=1 S=1

Mapping Constraints

#
# The following constraints are not limitations of the hardware architecture and dataflow,
# but help limit the search space to speed up search
#

mapspace_constraints:
  targets:
    # Shrinks the map space: every DummyBuffer temporal factor is already
    # fixed to 1 by the architecture constraints, so the loop order here has
    # no effect and a single permutation is enough.
    - target: DummyBuffer
      type: temporal
      permutation: NMCPQRS

    # Row-stationary heuristic: finish one row/column of the output before
    # moving on to the next one.
    - target: shared_glb
      type: temporal
      permutation: QRSC PNM
      factors: Q=1 R=1 S=1 P=0

    # Shrinks the map space: R, S and P are not tiled at the DRAM level.
    - target: DRAM
      type: temporal
      permutation: RSP CMNQ
      factors: R=1 S=1 P=1