From 77f281580773fe5217ae40bcf8a0f8bc05c28ded Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Mon, 13 Jan 2014 16:42:02 -0800 Subject: [PATCH] Improve performance for branchy code We now use a heavily unrolled loop as the software I$, which allows the host machine's branch target prediction to associate target PCs with unique-ish host PCs. --- Makefile.in | 2 +- riscv/decode.h | 9 +++-- riscv/encoding.h | 4 +- riscv/gen_icache | 9 +++++ riscv/htif.cc | 8 ++-- riscv/htif.h | 2 +- riscv/interactive.cc | 4 +- riscv/mmu.cc | 4 +- riscv/mmu.h | 74 +++++++++++++++++++------------------ riscv/processor.cc | 87 ++++++++++++++++++++++++++++---------------- riscv/processor.h | 4 +- riscv/riscv.mk.in | 4 ++ riscv/sim.cc | 2 +- riscv/sim.h | 3 +- 14 files changed, 131 insertions(+), 85 deletions(-) create mode 100755 riscv/gen_icache diff --git a/Makefile.in b/Makefile.in index 19e6805..45e3a11 100644 --- a/Makefile.in +++ b/Makefile.in @@ -192,7 +192,7 @@ $$($(2)_objs) : %.o : %.cc $$($(2)_gen_hdrs) $$($(2)_c_objs) : %.o : %.c $$($(2)_gen_hdrs) $(COMPILE_C) -c $$< -$(2)_junk += $$($(2)_objs) $$($(2)_c_objs) $$($(2)_deps) $$($(2)_c_deps) +$(2)_junk += $$($(2)_objs) $$($(2)_c_objs) $$($(2)_deps) $$($(2)_c_deps) $$($(2)_gen_hdrs) # Build a library for this subproject diff --git a/riscv/decode.h b/riscv/decode.h index 6c26a68..4abd9f9 100644 --- a/riscv/decode.h +++ b/riscv/decode.h @@ -80,8 +80,9 @@ public: } void write(size_t i, T value) { - if (!(zero_reg && i == 0)) - data[i] = value; + data[i] = value; + if (zero_reg) + data[0] = 0; } const T& operator [] (size_t i) const { @@ -164,12 +165,14 @@ private: #define set_pc(x) \ do { if ((x) & 3 /* For now... */) \ throw trap_instruction_address_misaligned(); \ - npc = (x); \ + npc = sext_xprlen(x); \ } while(0) #define validate_csr(which, write) ({ \ int write_priv = ((which) >> 10) & 3; \ int read_priv = ((which) >> 8) & 3; \ + if ((which) == CSR_FCSR || (which) == CSR_FFLAGS || (which) == CSR_FRM) \ + require_fp; \ if (read_priv > 0 || (write_priv > 0 && (write))) require_supervisor; \ (which); }) diff --git a/riscv/encoding.h b/riscv/encoding.h index 48a0108..711ef7b 100644 --- a/riscv/encoding.h +++ b/riscv/encoding.h @@ -288,7 +288,7 @@ #define MASK_SRET 0xffffffff #define MATCH_FNMADD_S 0x4f #define MASK_FNMADD_S 0x600007f -#define MATCH_JAL 0x67 +#define MATCH_JAL 0x6f #define MASK_JAL 0x7f #define MATCH_LWU 0x6003 #define MASK_LWU 0x707f @@ -392,7 +392,7 @@ #define MASK_DIVU 0xfe00707f #define MATCH_AMOSWAP_W 0x800202f #define MASK_AMOSWAP_W 0xf800707f -#define MATCH_JALR 0x6f +#define MATCH_JALR 0x67 #define MASK_JALR 0x707f #define MATCH_FSD 0x3027 #define MASK_FSD 0x707f diff --git a/riscv/gen_icache b/riscv/gen_icache new file mode 100755 index 0000000..c581b55 --- /dev/null +++ b/riscv/gen_icache @@ -0,0 +1,9 @@ +#!/bin/sh +echo \#define ICACHE_SIZE $1 +n=$(($1-1)) +echo \#define ICACHE_SWITCH \\ +for i in `seq 0 $n` +do + echo case $i: ICACHE_ACCESS\($i\)\; \\ +done +echo diff --git a/riscv/htif.cc b/riscv/htif.cc index af26faa..741a00f 100644 --- a/riscv/htif.cc +++ b/riscv/htif.cc @@ -91,14 +91,14 @@ void htif_isasim_t::tick_once() old_val = coreid; break; case CSR_TOHOST & 0x1f: - old_val = proc->state.tohost; + old_val = proc->get_state()->tohost; if (write) - proc->state.tohost = new_val; + proc->get_state()->tohost = new_val; break; case CSR_FROMHOST & 0x1f: - old_val = proc->state.fromhost; + old_val = proc->get_state()->fromhost; if (write && old_val == 0) - proc->state.fromhost = new_val; + proc->set_fromhost(new_val); break; case CSR_RESET & 0x1f: old_val = !proc->running(); diff --git a/riscv/htif.h b/riscv/htif.h index 2a940ad..4e1025e 100644 --- a/riscv/htif.h +++ b/riscv/htif.h @@ -18,6 +18,7 @@ class htif_isasim_t : public htif_pthread_t public: htif_isasim_t(sim_t* _sim, const std::vector& args); bool tick(); + bool done(); private: sim_t* sim; @@ -25,7 +26,6 @@ private: uint8_t seqno; void tick_once(); - bool done(); }; #endif diff --git a/riscv/interactive.cc b/riscv/interactive.cc index ad38ace..9014aa0 100644 --- a/riscv/interactive.cc +++ b/riscv/interactive.cc @@ -45,7 +45,7 @@ static std::string readline(int fd) void sim_t::interactive() { - while (true) + while (!htif->done()) { std::cerr << ": " << std::flush; std::string s = readline(2); @@ -103,7 +103,7 @@ void sim_t::interactive_run(const std::string& cmd, const std::vectordone(); i++) step(1); } diff --git a/riscv/mmu.cc b/riscv/mmu.cc index f8efd5a..4675f75 100644 --- a/riscv/mmu.cc +++ b/riscv/mmu.cc @@ -16,7 +16,7 @@ mmu_t::~mmu_t() void mmu_t::flush_icache() { - for (size_t i = 0; i < ICACHE_ENTRIES; i++) + for (size_t i = 0; i < ICACHE_SIZE; i++) icache[i].tag = -1; } @@ -32,7 +32,7 @@ void mmu_t::flush_tlb() void* mmu_t::refill_tlb(reg_t addr, reg_t bytes, bool store, bool fetch) { reg_t idx = (addr >> PGSHIFT) % TLB_ENTRIES; - reg_t expected_tag = addr & ~(PGSIZE-1); + reg_t expected_tag = addr >> PGSHIFT; reg_t pte = walk(addr); diff --git a/riscv/mmu.h b/riscv/mmu.h index 551fa46..c09cfc4 100644 --- a/riscv/mmu.h +++ b/riscv/mmu.h @@ -4,6 +4,7 @@ #define _RISCV_MMU_H #include "decode.h" +#include "icache.h" #include "trap.h" #include "common.h" #include "config.h" @@ -21,6 +22,21 @@ const reg_t VPN_BITS = PTIDXBITS * LEVELS; const reg_t PPN_BITS = 8*sizeof(reg_t) - PGSHIFT; const reg_t VA_BITS = VPN_BITS + PGSHIFT; +struct insn_fetch_t +{ + insn_func_t func; + union { + insn_t insn; + uint_fast32_t pad; + } insn; +}; + +struct icache_entry_t { + reg_t tag; + reg_t pad; + insn_fetch_t data; +}; + // this class implements a processor's port into the virtual memory system. // an MMU and instruction cache are maintained for simulator performance. class mmu_t @@ -32,8 +48,6 @@ public: // template for functions that load an aligned value from memory #define load_func(type) \ type##_t load_##type(reg_t addr) __attribute__((always_inline)) { \ - if(unlikely(addr % sizeof(type##_t))) \ - throw trap_load_address_misaligned(addr); \ void* paddr = translate(addr, sizeof(type##_t), false, false); \ return *(type##_t*)paddr; \ } @@ -53,8 +67,6 @@ public: // template for functions that store an aligned value to memory #define store_func(type) \ void store_##type(reg_t addr, type##_t val) { \ - if(unlikely(addr % sizeof(type##_t))) \ - throw trap_store_address_misaligned(addr); \ void* paddr = translate(addr, sizeof(type##_t), true, false); \ *(type##_t*)paddr = val; \ } @@ -65,40 +77,34 @@ public: store_func(uint32) store_func(uint64) - struct insn_fetch_t - { - insn_func_t func; - union { - insn_t insn; - uint_fast32_t pad; - } insn; - }; - // load instruction from memory at aligned address. - inline insn_fetch_t load_insn(reg_t addr) + inline icache_entry_t access_icache(reg_t addr) { - reg_t offset = addr & (sizeof(insn_t) * (ICACHE_ENTRIES-1)); - offset *= sizeof(icache_entry_t) / sizeof(insn_t); - icache_entry_t* entry = (icache_entry_t*)((char*)icache + offset); - insn_fetch_t data = entry->data; - if (likely(entry->tag == addr)) - return data; + reg_t idx = (addr / sizeof(insn_t)) % ICACHE_SIZE; + icache_entry_t entry = icache[idx]; + if (likely(entry.tag == addr)) + return entry; void* iaddr = translate(addr, sizeof(insn_t), false, true); insn_fetch_t fetch; fetch.insn.pad = *(decltype(fetch.insn.insn.bits())*)iaddr; fetch.func = proc->decode_insn(fetch.insn.insn); - entry->tag = addr; - entry->data = fetch; + icache[idx].tag = addr; + icache[idx].data = fetch; reg_t paddr = (char*)iaddr - mem; if (!tracer.empty() && tracer.interested_in_range(paddr, paddr + sizeof(insn_t), false, true)) { - entry->tag = -1; + icache[idx].tag = -1; tracer.trace(paddr, sizeof(insn_t), false, true); } - return entry->data; + return icache[idx]; + } + + inline insn_fetch_t load_insn(reg_t addr) + { + return access_icache(addr).data; } void set_processor(processor_t* p) { proc = p; flush_tlb(); } @@ -115,13 +121,7 @@ private: memtracer_list_t tracer; // implement an instruction cache for simulator performance - static const reg_t ICACHE_ENTRIES = 2048; - struct icache_entry_t { - reg_t tag; - reg_t pad; - insn_fetch_t data; - }; - icache_entry_t icache[ICACHE_ENTRIES]; + icache_entry_t icache[ICACHE_SIZE]; // implement a TLB for simulator performance static const reg_t TLB_ENTRIES = 256; @@ -141,11 +141,15 @@ private: __attribute__((always_inline)) { reg_t idx = (addr >> PGSHIFT) % TLB_ENTRIES; - reg_t expected_tag = addr & ~(PGSIZE-1); - - reg_t* tlb_tag = fetch ? tlb_insn_tag : store ? tlb_store_tag :tlb_load_tag; + reg_t expected_tag = addr >> PGSHIFT; + reg_t* tags = fetch ? tlb_insn_tag : store ? tlb_store_tag :tlb_load_tag; + reg_t tag = tags[idx]; void* data = tlb_data[idx] + addr; - if (likely(tlb_tag[idx] == expected_tag)) + + if (unlikely(addr & (bytes-1))) + store ? throw trap_store_address_misaligned(addr) : throw trap_load_address_misaligned(addr); + + if (likely(tag == expected_tag)) return data; return refill_tlb(addr, bytes, store, fetch); diff --git a/riscv/processor.cc b/riscv/processor.cc index b12a8e0..17b4181 100644 --- a/riscv/processor.cc +++ b/riscv/processor.cc @@ -5,7 +5,9 @@ #include "common.h" #include "config.h" #include "sim.h" +#include "htif.h" #include "disasm.h" +#include "icache.h" #include #include #include @@ -95,9 +97,10 @@ void processor_t::step(size_t n) if(!run) return; - size_t i = 0; - reg_t npc = state.pc; mmu_t* _mmu = mmu; + auto count32 = decltype(state.compare)(state.count); + bool count_le_compare = count32 <= state.compare; + n = std::min(n, size_t(state.compare - count32) | 1); try { @@ -106,9 +109,9 @@ void processor_t::step(size_t n) // execute_insn fetches and executes one instruction #define execute_insn(noisy) \ do { \ - mmu_t::insn_fetch_t fetch = _mmu->load_insn(npc); \ + insn_fetch_t fetch = mmu->load_insn(state.pc); \ if(noisy) disasm(fetch.insn.insn); \ - npc = fetch.func(this, fetch.insn.insn, npc); \ + state.pc = fetch.func(this, fetch.insn.insn, state.pc); \ } while(0) @@ -118,50 +121,65 @@ void processor_t::step(size_t n) #undef execute_insn #define execute_insn(noisy) \ do { \ - mmu_t::insn_fetch_t fetch = _mmu->load_insn(npc); \ + insn_fetch_t fetch = _mmu->load_insn(state.pc); \ if(noisy) disasm(fetch.insn.insn); \ bool in_spvr = state.sr & SR_S; \ - if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") ", npc, fetch.insn.insn.bits()); \ - /*if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") %s ", npc, fetch.insn.insn.bits(), disasmblr.disassemble(fetch.insn.insn).c_str());*/ \ - npc = fetch.func(this, fetch.insn.insn, npc); \ + if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") ", state.pc, fetch.insn.insn.bits()); \ + /*if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") %s ", state.pc, fetch.insn.insn.bits(), disasmblr.disassemble(fetch.insn.insn).c_str());*/ \ + state.pc = fetch.func(this, fetch.insn.insn, state.pc); \ } while(0) #endif - if(debug) for( ; i < n; i++) // print out instructions as we go - execute_insn(true); - else + if (debug) // print out instructions as we go { - // unrolled for speed - for( ; n > 3 && i < n-3; i+=4) + for (size_t i = 0; i < n; state.count++, i++) + execute_insn(true); + } + else while (n > 0) + { + size_t idx = (state.pc / sizeof(insn_t)) % ICACHE_SIZE; + auto ic_entry_init = &_mmu->icache[idx], ic_entry = ic_entry_init; + + #define update_count() { \ + size_t i = ic_entry - ic_entry_init; \ + state.count += i; \ + if (i >= n) break; \ + n -= i; } + + #define ICACHE_ACCESS(idx) { \ + insn_t insn = ic_entry->data.insn.insn; \ + insn_func_t func = ic_entry->data.func; \ + if (unlikely(ic_entry->tag != state.pc)) break; \ + ic_entry++; \ + state.pc = func(this, insn, state.pc); } + + switch (idx) while (true) { - execute_insn(false); - execute_insn(false); - execute_insn(false); - execute_insn(false); + ICACHE_SWITCH; + update_count(); + ic_entry_init = ic_entry = &_mmu->icache[0]; } - for( ; i < n; i++) - execute_insn(false); - } - state.pc = npc; + _mmu->access_icache(state.pc); + update_count(); + } } catch(trap_t& t) { - take_trap(npc, t); + take_trap(t); } - // update timer and possibly register a timer interrupt - uint32_t old_count = state.count; - state.count += i; - if(old_count < state.compare && uint64_t(old_count) + i >= state.compare) + bool count_ge_compare = + uint64_t(n) + decltype(state.compare)(state.count) >= state.compare; + if (count_le_compare && count_ge_compare) set_interrupt(IRQ_TIMER, true); } -void processor_t::take_trap(reg_t pc, trap_t& t) +void processor_t::take_trap(trap_t& t) { if (debug) fprintf(stderr, "core %3d: exception %s, epc 0x%016" PRIx64 "\n", - id, t.name(), pc); + id, t.name(), state.pc); // switch to supervisor, set previous supervisor bit, disable interrupts set_pcr(CSR_STATUS, (((state.sr & ~SR_EI) | SR_S) & ~SR_PS & ~SR_PEI) | @@ -170,7 +188,7 @@ void processor_t::take_trap(reg_t pc, trap_t& t) yield_load_reservation(); state.cause = t.cause(); - state.epc = pc; + state.epc = state.pc; state.pc = state.evec; t.side_effects(&state); // might set badvaddr etc. @@ -255,14 +273,19 @@ reg_t processor_t::set_pcr(int which, reg_t val) state.tohost = val; break; case CSR_FROMHOST: - set_interrupt(IRQ_HOST, val != 0); - state.fromhost = val; + set_fromhost(val); break; } return old_pcr; } +void processor_t::set_fromhost(reg_t val) +{ + set_interrupt(IRQ_HOST, val != 0); + state.fromhost = val; +} + reg_t processor_t::get_pcr(int which) { switch (which) @@ -306,8 +329,10 @@ reg_t processor_t::get_pcr(int which) case CSR_SUP1: return state.pcr_k1; case CSR_TOHOST: + sim->get_htif()->tick(); // not necessary, but faster return state.tohost; case CSR_FROMHOST: + sim->get_htif()->tick(); // not necessary, but faster return state.fromhost; default: return -1; diff --git a/riscv/processor.h b/riscv/processor.h index e27aa82..9e52d3d 100644 --- a/riscv/processor.h +++ b/riscv/processor.h @@ -65,6 +65,7 @@ public: void deliver_ipi(); // register an interprocessor interrupt bool running() { return run; } reg_t set_pcr(int which, reg_t val); + void set_fromhost(reg_t val); void set_interrupt(int which, bool on); reg_t get_pcr(int which); mmu_t* get_mmu() { return mmu; } @@ -91,13 +92,12 @@ private: std::vector opcode_store; void take_interrupt(); // take a trap if any interrupts are pending - void take_trap(reg_t pc, trap_t& t); // take an exception + void take_trap(trap_t& t); // take an exception void disasm(insn_t insn); // disassemble and print an instruction friend class sim_t; friend class mmu_t; friend class extension_t; - friend class htif_isasim_t; void build_opcode_map(); insn_func_t decode_insn(insn_t insn); diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index fd506c8..45c5ee7 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -43,10 +43,14 @@ riscv_srcs = \ riscv_test_srcs = riscv_gen_hdrs = \ + icache.h \ riscv_gen_srcs = \ $(addsuffix .cc, $(call get_insn_list,$(src_dir)/riscv/encoding.h)) +icache.h: + $(src_dir)/riscv/gen_icache 1024 > $@ + $(riscv_gen_srcs): %.cc: insns/%.h insn_template.cc sed 's/NAME/$(subst .cc,,$@)/' $(src_dir)/riscv/insn_template.cc | sed 's/OPCODE/$(call get_opcode,$(src_dir)/riscv/encoding.h,$(subst .cc,,$@))/' > $@ diff --git a/riscv/sim.cc b/riscv/sim.cc index c800e87..59fe593 100644 --- a/riscv/sim.cc +++ b/riscv/sim.cc @@ -34,7 +34,7 @@ sim_t::sim_t(size_t nprocs, size_t mem_mb, const std::vector& args) while ((mem = (char*)calloc(1, memsz)) == NULL) memsz = memsz*10/11/quantum*quantum; - if (memsz != memsz) + if (memsz != memsz0) fprintf(stderr, "warning: only got %lu bytes of target mem (wanted %lu)\n", (unsigned long)memsz, (unsigned long)memsz0); diff --git a/riscv/sim.h b/riscv/sim.h index d643e6d..d437c1a 100644 --- a/riscv/sim.h +++ b/riscv/sim.h @@ -24,6 +24,7 @@ public: void stop(); void set_debug(bool value); void set_procs_debug(bool value); + htif_isasim_t* get_htif() { return htif.get(); } // deliver an IPI to a specific processor void send_ipi(reg_t who); @@ -36,7 +37,7 @@ public: reg_t get_scr(int which); private: - std::auto_ptr htif; + std::unique_ptr htif; char* mem; // main memory size_t memsz; // memory size in bytes mmu_t* debug_mmu; // debug port into main memory -- 2.30.2