Added comments about the modified Duff's Device in execute.cc (#77)

author Andy Wright <acwright@mit.edu>

Thu, 1 Dec 2016 20:04:34 +0000 (15:04 -0500)

committer Andrew Waterman <aswaterman@gmail.com>

Thu, 1 Dec 2016 20:04:34 +0000 (12:04 -0800)
author Andy Wright <acwright@mit.edu>
Thu, 1 Dec 2016 20:04:34 +0000 (15:04 -0500)
committer Andrew Waterman <aswaterman@gmail.com>
Thu, 1 Dec 2016 20:04:34 +0000 (12:04 -0800)
diff --git a/riscv/execute.cc b/riscv/execute.cc

index 7b42262c589a401a848945e08e77085741b4922b..36e789629ecc4d075e5b0681e3519892e5f1328b 100644 (file)
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -40,6 +40,9 @@ inline void processor_t::update_histogram(reg_t pc)
  #endif
  }
  
+// This is expected to be inlined by the compiler so each use of execute_insn
+// includes a duplicated body of the function to get separate fetch.func
+// function calls.
  static reg_t execute_insn(processor_t* p, reg_t pc, insn_fetch_t fetch)
  {
    commit_log_stash_privilege(p->get_state());
@@ -121,9 +124,40 @@ void processor_t::step(size_t n)
        }
        else while (instret < n)
        {
+        // This code uses a modified Duff's Device to improve the performance
+        // of executing instructions. While typical Duff's Devices are used
+        // for loop unrolling, the switch statement below primarily
+        // benefits from separate call points for the fetch.func function call
+        // found in each execute_insn. This function call is an indirect jump
+        // that depends on the current instruction. By having an indirect jump
+        // dedicated for each icache entry, you improve the performance of the
+        // host's next address predictor. Each case in the switch statement
+        // allows for the program flow to continue to the next case if it
+        // corresponds to the next instruction in the program and instret is
+        // still less than n.
+        //
+        // According to Andrew Waterman's recollection, this optimization
+        // resulted in approximately a 2x performance increase.
+        //
+        // If there is support for compressed instructions, the mmu and the
+        // switch statement get more complicated. Each branch target is stored
+        // in the index corresponding to mmu->icache_index(), but consecutive
+        // non-branching instructions are stored in consecutive indices even if
+        // mmu->icache_index() specifies a different index (which is the case
+        // for 32-bit instructions in the presence of compressed instructions).
+
+        // This figures out where to jump to in the switch statement
          size_t idx = _mmu->icache_index(pc);
+
+        // This gets the cached decoded instruction from the MMU. If the MMU
+        // does not have the current pc cached, it will refill the MMU and
+        // return the correct entry. ic_entry->data.func is the C++ function
+        // corresponding to the instruction.
          auto ic_entry = _mmu->access_icache(pc);
  
+        // This macro is included in "icache.h" included within the switch
+        // statement below. The indirect jump corresponding to the instruction
+        // is located within the execute_insn() function call.
          #define ICACHE_ACCESS(i) { \
            insn_fetch_t fetch = ic_entry->data; \
            ic_entry++; \
@@ -135,7 +169,10 @@ void processor_t::step(size_t n)
            state.pc = pc; \
          }
  
+        // This switch statement implements the modified Duff's device as
+        // explained above.
          switch (idx) {
+          // "icache.h" is generated by the gen_icache script
            #include "icache.h"
          }
author	Andy Wright <acwright@mit.edu>
	Thu, 1 Dec 2016 20:04:34 +0000 (15:04 -0500)
committer	Andrew Waterman <aswaterman@gmail.com>
	Thu, 1 Dec 2016 20:04:34 +0000 (12:04 -0800)