3 based on Anton Blanchard microwatt dcache.vhdl
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
15 from nmutil
.gtkw
import write_gtkw
17 sys
.setrecursionlimit(1000000)
19 from enum
import Enum
, unique
21 from nmigen
import Module
, Signal
, Elaboratable
, Cat
, Repl
, Array
, Const
22 from nmutil
.util
import Display
24 from copy
import deepcopy
25 from random
import randint
, seed
27 from nmigen
.cli
import main
28 from nmutil
.iocontrol
import RecordObject
29 from nmigen
.utils
import log2_int
30 from soc
.experiment
.mem_types
import (LoadStore1ToDCacheType
,
31 DCacheToLoadStore1Type
,
35 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
, WB_SEL_BITS
,
36 WBAddrType
, WBDataType
, WBSelType
,
37 WBMasterOut
, WBSlaveOut
,
38 WBMasterOutVector
, WBSlaveOutVector
,
39 WBIOMasterOut
, WBIOSlaveOut
)
41 from soc
.experiment
.cache_ram
import CacheRam
42 #from soc.experiment.plru import PLRU
43 from nmutil
.plru
import PLRU
46 from soc
.bus
.sram
import SRAM
47 from nmigen
import Memory
48 from nmigen
.cli
import rtlil
50 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
51 # Also, check out the cxxsim nmigen branch, and latest yosys from git
52 from nmutil
.sim_tmp_alternative
import Simulator
54 from nmutil
.util
import wrap
# TODO: make these parameters of DCache at some point
LINE_SIZE    = 64  # cache line size in bytes
NUM_LINES    = 16  # number of lines in a set
NUM_WAYS     = 4   # number of ways
TLB_SET_SIZE = 64  # L1 DTLB entries per set
TLB_NUM_WAYS = 2   # L1 DTLB number of sets
TLB_LG_PGSZ  = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH   = 0   # non-zero to enable log data collection
# BRAM organisation: We never access more than
# -- WB_DATA_BITS at a time so to save
# -- resources we make the array only that wide, and
# -- use consecutive indices for to make a cache "line"

# -- ROW_SIZE is the width in bytes of the BRAM
# -- (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8  # fixed: dropped stray trailing semicolon

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
83 print ("ROW_SIZE", ROW_SIZE
)
84 print ("ROW_PER_LINE", ROW_PER_LINE
)
85 print ("BRAM_ROWS", BRAM_ROWS
)
86 print ("NUM_WAYS", NUM_WAYS
)
88 # Bit fields counts in the address
90 # REAL_ADDR_BITS is the number of real address
94 # ROW_BITS is the number of bits to select a row
95 ROW_BITS
= log2_int(BRAM_ROWS
)
97 # ROW_LINE_BITS is the number of bits to select
99 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
101 # LINE_OFF_BITS is the number of bits for
102 # the offset in a cache line
103 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
105 # ROW_OFF_BITS is the number of bits for
106 # the offset in a row
107 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
109 # INDEX_BITS is the number if bits to
110 # select a cache line
111 INDEX_BITS
= log2_int(NUM_LINES
)
113 # SET_SIZE_BITS is the log base 2 of the set size
114 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
116 # TAG_BITS is the number of bits of
117 # the tag part of the address
118 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
120 # TAG_WIDTH is the width in bits of each way of the tag RAM
121 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
123 # WAY_BITS is the number of bits to select a way
124 WAY_BITS
= log2_int(NUM_WAYS
)
126 # Example of layout for 32 lines of 64 bytes:
128 .. tag |index| line |
130 .. | |---| | ROW_LINE_BITS (3)
131 .. | |--- - --| LINE_OFF_BITS (6)
132 .. | |- --| ROW_OFF_BITS (3)
133 .. |----- ---| | ROW_BITS (8)
134 .. |-----| | INDEX_BITS (5)
135 .. --------| | TAG_BITS (45)
138 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
139 (TAG_BITS
, INDEX_BITS
, ROW_BITS
,
140 ROW_OFF_BITS
, LINE_OFF_BITS
, ROW_LINE_BITS
))
141 print ("index @: %d-%d" % (LINE_OFF_BITS
, SET_SIZE_BITS
))
142 print ("row @: %d-%d" % (LINE_OFF_BITS
, ROW_OFF_BITS
))
143 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS
, REAL_ADDR_BITS
, TAG_WIDTH
))
145 TAG_RAM_WIDTH
= TAG_WIDTH
* NUM_WAYS
147 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH
)
150 return Array(Signal(TAG_RAM_WIDTH
, name
="cachetag_%d" % x
) \
151 for x
in range(NUM_LINES
))
def CacheValidBitsArray():
    """Per-line array of way-valid bits (one NUM_WAYS-wide Signal per line)."""
    sigs = []
    for idx in range(NUM_LINES):
        sigs.append(Signal(NUM_WAYS, name="cachevalid_%d" % idx))
    return Array(sigs)
def RowPerLineValidArray():
    """One single-bit valid Signal for every row within a cache line."""
    return Array([Signal(name="rows_valid%d" % n)
                  for n in range(ROW_PER_LINE)])
162 TLB_SET_BITS
= log2_int(TLB_SET_SIZE
)
163 TLB_WAY_BITS
= log2_int(TLB_NUM_WAYS
)
164 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_SET_BITS
)
165 TLB_TAG_WAY_BITS
= TLB_NUM_WAYS
* TLB_EA_TAG_BITS
167 TLB_PTE_WAY_BITS
= TLB_NUM_WAYS
* TLB_PTE_BITS
;
170 return (1<<log2_int(x
, False)) == x
# sanity-check the cache geometry: every derived constant must be
# self-consistent, otherwise the address-slicing helpers below would
# silently pick the wrong bit ranges
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
def TLBValidBitsArray():
    """Per-TLB-set array of way-valid bits (TLB_NUM_WAYS bits per entry)."""
    return Array([Signal(TLB_NUM_WAYS, name="tlbvalid%d" % n)
                  for n in range(TLB_SET_SIZE)])
192 return Array(Signal(TLB_EA_TAG_BITS
, name
="tlbtagea%d" % x
) \
193 for x
in range (TLB_NUM_WAYS
))
196 return Array(Signal(TLB_TAG_WAY_BITS
, name
="tlbtags%d" % x
) \
197 for x
in range (TLB_SET_SIZE
))
200 return Array(Signal(TLB_PTE_WAY_BITS
, name
="tlbptes%d" % x
) \
201 for x
in range(TLB_SET_SIZE
))
204 return Array(Signal(WAY_BITS
, name
="hitway_%d" % x
) \
205 for x
in range(TLB_NUM_WAYS
))
207 # Cache RAM interface
209 return Array(Signal(WB_DATA_BITS
, name
="cache_out%d" % x
) \
210 for x
in range(NUM_WAYS
))
212 # PLRU output interface
214 return Array(Signal(WAY_BITS
, name
="plru_out%d" % x
) \
215 for x
in range(NUM_LINES
))
217 # TLB PLRU output interface
219 return Array(Signal(TLB_WAY_BITS
, name
="tlbplru_out%d" % x
) \
220 for x
in range(TLB_SET_SIZE
))
222 # Helper functions to decode incoming requests
224 # Return the cache line index (tag index) for an address
226 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
228 # Return the cache row index (data memory) for an address
230 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
# Return the index of a row within a line
def get_row_of_line(row):
    """Return which row of its cache line *row* refers to."""
    row_bits = row[:ROW_BITS]
    return row_bits[:ROW_LINE_BITS]
# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr* selects the final row of its cache line."""
    row_sel = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_sel == last
# Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when *row* is the final row of its cache line."""
    return last == get_row_of_line(row)
244 # Return the next row in the current cache line. We use a
245 # dedicated function in order to limit the size of the
246 # generated adder to be only the bits within a cache line
247 # (3 bits with default settings)
249 row_v
= row
[0:ROW_LINE_BITS
] + 1
250 return Cat(row_v
[:ROW_LINE_BITS
], row
[ROW_LINE_BITS
:])
252 # Get the tag value from the address
254 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
# Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract the TAG_BITS-wide tag for *way* from a packed tag-RAM row."""
    word = tagset.word_select(way, TAG_WIDTH)
    return word[:TAG_BITS]
# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    """Select the EA-tag field for *way* out of a packed TLB tag row."""
    return tags.word_select(way, TLB_EA_TAG_BITS)
# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    """Return an assignment storing *tag* into *way*'s slot of a TLB tag row."""
    dest = read_tlb_tag(way, tags)
    return dest.eq(tag)
# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    """Select the PTE field for *way* out of a packed TLB PTE row."""
    return ptes.word_select(way, TLB_PTE_BITS)
def write_tlb_pte(way, ptes, newpte):
    """Return an assignment storing *newpte* into *way*'s slot of a PTE row."""
    dest = read_tlb_pte(way, ptes)
    return dest.eq(newpte)
276 # Record for storing permission, attribute, etc. bits from a PTE
277 class PermAttr(RecordObject
):
278 def __init__(self
, name
=None):
279 super().__init
__(name
=name
)
280 self
.reference
= Signal()
281 self
.changed
= Signal()
282 self
.nocache
= Signal()
284 self
.rd_perm
= Signal()
285 self
.wr_perm
= Signal()
288 def extract_perm_attr(pte
):
293 # Type of operation on a "valid" input
297 OP_BAD
= 1 # NC cache hit, TLB miss, prot/RC failure
298 OP_STCX_FAIL
= 2 # conditional store w/o reservation
299 OP_LOAD_HIT
= 3 # Cache hit on load
300 OP_LOAD_MISS
= 4 # Load missing cache
301 OP_LOAD_NC
= 5 # Non-cachable load
302 OP_STORE_HIT
= 6 # Store hitting cache
303 OP_STORE_MISS
= 7 # Store missing cache
306 # Cache state machine
309 IDLE
= 0 # Normal load hit processing
310 RELOAD_WAIT_ACK
= 1 # Cache reload wait ack
311 STORE_WAIT_ACK
= 2 # Store wait ack
312 NC_LOAD_WAIT_ACK
= 3 # Non-cachable load wait ack
317 # In order to make timing, we use the BRAMs with
318 # an output buffer, which means that the BRAM
319 # output is delayed by an extra cycle.
321 # Thus, the dcache has a 2-stage internal pipeline
322 # for cache hits with no stalls.
324 # All other operations are handled via stalling
325 # in the first stage.
327 # The second stage can thus complete a hit at the same
328 # time as the first stage emits a stall for a complex op.
# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    """Latched incoming dcache request plus MMU-related side-band flags."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
343 class MemAccessRequest(RecordObject
):
344 def __init__(self
, name
=None):
345 super().__init
__(name
=name
)
347 self
.valid
= Signal()
349 self
.real_addr
= Signal(REAL_ADDR_BITS
)
350 self
.data
= Signal(64)
351 self
.byte_sel
= Signal(8)
352 self
.hit_way
= Signal(WAY_BITS
)
353 self
.same_tag
= Signal()
354 self
.mmu_req
= Signal()
357 # First stage register, contains state for stage 1 of load hits
358 # and for the state machine used by all other operations
359 class RegStage1(RecordObject
):
360 def __init__(self
, name
=None):
361 super().__init
__(name
=name
)
362 # Info about the request
363 self
.full
= Signal() # have uncompleted request
364 self
.mmu_req
= Signal() # request is from MMU
365 self
.req
= MemAccessRequest(name
="reqmem")
368 self
.hit_way
= Signal(WAY_BITS
)
369 self
.hit_load_valid
= Signal()
370 self
.hit_index
= Signal(INDEX_BITS
)
371 self
.cache_hit
= Signal()
374 self
.tlb_hit
= Signal()
375 self
.tlb_hit_way
= Signal(TLB_NUM_WAYS
)
376 self
.tlb_hit_index
= Signal(TLB_WAY_BITS
)
378 # 2-stage data buffer for data forwarded from writes to reads
379 self
.forward_data1
= Signal(64)
380 self
.forward_data2
= Signal(64)
381 self
.forward_sel1
= Signal(8)
382 self
.forward_valid1
= Signal()
383 self
.forward_way1
= Signal(WAY_BITS
)
384 self
.forward_row1
= Signal(ROW_BITS
)
385 self
.use_forward1
= Signal()
386 self
.forward_sel
= Signal(8)
388 # Cache miss state (reload state machine)
389 self
.state
= Signal(State
)
391 self
.write_bram
= Signal()
392 self
.write_tag
= Signal()
393 self
.slow_valid
= Signal()
394 self
.wb
= WBMasterOut("wb")
395 self
.reload_tag
= Signal(TAG_BITS
)
396 self
.store_way
= Signal(WAY_BITS
)
397 self
.store_row
= Signal(ROW_BITS
)
398 self
.store_index
= Signal(INDEX_BITS
)
399 self
.end_row_ix
= Signal(ROW_LINE_BITS
)
400 self
.rows_valid
= RowPerLineValidArray()
401 self
.acks_pending
= Signal(3)
402 self
.inc_acks
= Signal()
403 self
.dec_acks
= Signal()
405 # Signals to complete (possibly with error)
406 self
.ls_valid
= Signal()
407 self
.ls_error
= Signal()
408 self
.mmu_done
= Signal()
409 self
.mmu_error
= Signal()
410 self
.cache_paradox
= Signal()
412 # Signal to complete a failed stcx.
413 self
.stcx_fail
= Signal()
416 # Reservation information
417 class Reservation(RecordObject
):
420 self
.valid
= Signal()
421 self
.addr
= Signal(64-LINE_OFF_BITS
)
424 class DTLBUpdate(Elaboratable
):
426 self
.tlbie
= Signal()
427 self
.tlbwe
= Signal()
428 self
.doall
= Signal()
429 self
.updated
= Signal()
430 self
.v_updated
= Signal()
431 self
.tlb_hit
= Signal()
432 self
.tlb_req_index
= Signal(TLB_SET_BITS
)
434 self
.tlb_hit_way
= Signal(TLB_WAY_BITS
)
435 self
.tlb_tag_way
= Signal(TLB_TAG_WAY_BITS
)
436 self
.tlb_pte_way
= Signal(TLB_PTE_WAY_BITS
)
437 self
.repl_way
= Signal(TLB_WAY_BITS
)
438 self
.eatag
= Signal(TLB_EA_TAG_BITS
)
439 self
.pte_data
= Signal(TLB_PTE_BITS
)
441 self
.dv
= Signal(TLB_NUM_WAYS
) # tlb_way_valids_t
443 self
.tb_out
= Signal(TLB_TAG_WAY_BITS
) # tlb_way_tags_t
444 self
.pb_out
= Signal(TLB_NUM_WAYS
) # tlb_way_valids_t
445 self
.db_out
= Signal(TLB_PTE_WAY_BITS
) # tlb_way_ptes_t
447 def elaborate(self
, platform
):
452 tagset
= Signal(TLB_TAG_WAY_BITS
)
453 pteset
= Signal(TLB_PTE_WAY_BITS
)
455 tb_out
, pb_out
, db_out
= self
.tb_out
, self
.pb_out
, self
.db_out
456 comb
+= db_out
.eq(self
.dv
)
458 with m
.If(self
.tlbie
& self
.doall
):
459 pass # clear all back in parent
460 with m
.Elif(self
.tlbie
):
461 with m
.If(self
.tlb_hit
):
462 comb
+= db_out
.bit_select(self
.tlb_hit_way
, 1).eq(1)
463 comb
+= self
.v_updated
.eq(1)
465 with m
.Elif(self
.tlbwe
):
467 comb
+= tagset
.eq(self
.tlb_tag_way
)
468 comb
+= write_tlb_tag(self
.repl_way
, tagset
, self
.eatag
)
469 comb
+= tb_out
.eq(tagset
)
471 comb
+= pteset
.eq(self
.tlb_pte_way
)
472 comb
+= write_tlb_pte(self
.repl_way
, pteset
, self
.pte_data
)
473 comb
+= pb_out
.eq(pteset
)
475 comb
+= db_out
.bit_select(self
.repl_way
, 1).eq(1)
477 comb
+= self
.updated
.eq(1)
478 comb
+= self
.v_updated
.eq(1)
483 class DCachePendingHit(Elaboratable
):
485 def __init__(self
, tlb_pte_way
, tlb_valid_way
, tlb_hit_way
,
486 cache_valid_idx
, cache_tag_set
,
491 self
.virt_mode
= Signal()
492 self
.is_hit
= Signal()
493 self
.tlb_hit
= Signal()
494 self
.hit_way
= Signal(WAY_BITS
)
495 self
.rel_match
= Signal()
496 self
.req_index
= Signal(INDEX_BITS
)
497 self
.reload_tag
= Signal(TAG_BITS
)
499 self
.tlb_hit_way
= tlb_hit_way
500 self
.tlb_pte_way
= tlb_pte_way
501 self
.tlb_valid_way
= tlb_valid_way
502 self
.cache_valid_idx
= cache_valid_idx
503 self
.cache_tag_set
= cache_tag_set
504 self
.req_addr
= req_addr
505 self
.hit_set
= hit_set
507 def elaborate(self
, platform
):
513 virt_mode
= self
.virt_mode
515 tlb_pte_way
= self
.tlb_pte_way
516 tlb_valid_way
= self
.tlb_valid_way
517 cache_valid_idx
= self
.cache_valid_idx
518 cache_tag_set
= self
.cache_tag_set
519 req_addr
= self
.req_addr
520 tlb_hit_way
= self
.tlb_hit_way
521 tlb_hit
= self
.tlb_hit
522 hit_set
= self
.hit_set
523 hit_way
= self
.hit_way
524 rel_match
= self
.rel_match
525 req_index
= self
.req_index
526 reload_tag
= self
.reload_tag
528 rel_matches
= Array(Signal(name
="rel_matches_%d" % i
) \
529 for i
in range(TLB_NUM_WAYS
))
530 hit_way_set
= HitWaySet()
532 # Test if pending request is a hit on any way
533 # In order to make timing in virtual mode,
534 # when we are using the TLB, we compare each
535 # way with each of the real addresses from each way of
536 # the TLB, and then decide later which match to use.
538 with m
.If(virt_mode
):
539 for j
in range(TLB_NUM_WAYS
): # tlb_num_way_t
540 s_tag
= Signal(TAG_BITS
, name
="s_tag%d" % j
)
542 s_pte
= Signal(TLB_PTE_BITS
)
543 s_ra
= Signal(REAL_ADDR_BITS
)
544 comb
+= s_pte
.eq(read_tlb_pte(j
, tlb_pte_way
))
545 comb
+= s_ra
.eq(Cat(req_addr
[0:TLB_LG_PGSZ
],
546 s_pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
547 comb
+= s_tag
.eq(get_tag(s_ra
))
549 for i
in range(NUM_WAYS
): # way_t
550 is_tag_hit
= Signal(name
="is_tag_hit_%d_%d" % (j
, i
))
551 comb
+= is_tag_hit
.eq(go
& cache_valid_idx
[i
] &
552 (read_tag(i
, cache_tag_set
) == s_tag
)
554 with m
.If(is_tag_hit
):
555 comb
+= hit_way_set
[j
].eq(i
)
557 comb
+= hit_set
[j
].eq(s_hit
)
558 with m
.If(s_tag
== reload_tag
):
559 comb
+= rel_matches
[j
].eq(1)
561 comb
+= is_hit
.eq(hit_set
[tlb_hit_way
])
562 comb
+= hit_way
.eq(hit_way_set
[tlb_hit_way
])
563 comb
+= rel_match
.eq(rel_matches
[tlb_hit_way
])
565 s_tag
= Signal(TAG_BITS
)
566 comb
+= s_tag
.eq(get_tag(req_addr
))
567 for i
in range(NUM_WAYS
): # way_t
568 is_tag_hit
= Signal(name
="is_tag_hit_%d" % i
)
569 comb
+= is_tag_hit
.eq(go
& cache_valid_idx
[i
] &
570 (read_tag(i
, cache_tag_set
) == s_tag
))
571 with m
.If(is_tag_hit
):
572 comb
+= hit_way
.eq(i
)
574 with m
.If(s_tag
== reload_tag
):
575 comb
+= rel_match
.eq(1)
580 class DCache(Elaboratable
):
581 """Set associative dcache write-through
582 TODO (in no specific order):
583 * See list in icache.vhdl
584 * Complete load misses on the cycle when WB data comes instead of
585 at the end of line (this requires dealing with requests coming in
589 self
.d_in
= LoadStore1ToDCacheType("d_in")
590 self
.d_out
= DCacheToLoadStore1Type("d_out")
592 self
.m_in
= MMUToDCacheType("m_in")
593 self
.m_out
= DCacheToMMUType("m_out")
595 self
.stall_out
= Signal()
597 self
.wb_out
= WBMasterOut("wb_out")
598 self
.wb_in
= WBSlaveOut("wb_in")
600 self
.log_out
= Signal(20)
602 def stage_0(self
, m
, r0
, r1
, r0_full
):
603 """Latch the request in r0.req as long as we're not stalling
607 d_in
, d_out
, m_in
= self
.d_in
, self
.d_out
, self
.m_in
609 r
= RegStage0("stage0")
611 # TODO, this goes in unit tests and formal proofs
612 with m
.If(d_in
.valid
& m_in
.valid
):
613 sync
+= Display("request collision loadstore vs MMU")
615 with m
.If(m_in
.valid
):
616 comb
+= r
.req
.valid
.eq(1)
617 comb
+= r
.req
.load
.eq(~
(m_in
.tlbie | m_in
.tlbld
))
618 comb
+= r
.req
.dcbz
.eq(0)
619 comb
+= r
.req
.nc
.eq(0)
620 comb
+= r
.req
.reserve
.eq(0)
621 comb
+= r
.req
.virt_mode
.eq(0)
622 comb
+= r
.req
.priv_mode
.eq(1)
623 comb
+= r
.req
.addr
.eq(m_in
.addr
)
624 comb
+= r
.req
.data
.eq(m_in
.pte
)
625 comb
+= r
.req
.byte_sel
.eq(~
0) # Const -1 sets all to 0b111....
626 comb
+= r
.tlbie
.eq(m_in
.tlbie
)
627 comb
+= r
.doall
.eq(m_in
.doall
)
628 comb
+= r
.tlbld
.eq(m_in
.tlbld
)
629 comb
+= r
.mmu_req
.eq(1)
631 comb
+= r
.req
.eq(d_in
)
632 comb
+= r
.req
.data
.eq(0)
633 comb
+= r
.tlbie
.eq(0)
634 comb
+= r
.doall
.eq(0)
635 comb
+= r
.tlbld
.eq(0)
636 comb
+= r
.mmu_req
.eq(0)
637 with m
.If((~r1
.full
& ~d_in
.hold
) | ~r0_full
):
639 sync
+= r0_full
.eq(r
.req
.valid
)
640 # Sample data the cycle after a request comes in from loadstore1.
641 # If another request has come in already then the data will get
642 # put directly into req.data below.
643 with m
.If(r0
.req
.valid
& ~r
.req
.valid
& ~r0
.d_valid
&
645 sync
+= r0
.req
.data
.eq(d_in
.data
)
646 sync
+= r0
.d_valid
.eq(1)
648 def tlb_read(self
, m
, r0_stall
, tlb_valid_way
,
649 tlb_tag_way
, tlb_pte_way
, dtlb_valid_bits
,
650 dtlb_tags
, dtlb_ptes
):
652 Operates in the second cycle on the request latched in r0.req.
653 TLB updates write the entry at the end of the second cycle.
657 m_in
, d_in
= self
.m_in
, self
.d_in
659 index
= Signal(TLB_SET_BITS
)
660 addrbits
= Signal(TLB_SET_BITS
)
663 amax
= TLB_LG_PGSZ
+ TLB_SET_BITS
665 with m
.If(m_in
.valid
):
666 comb
+= addrbits
.eq(m_in
.addr
[amin
: amax
])
668 comb
+= addrbits
.eq(d_in
.addr
[amin
: amax
])
669 comb
+= index
.eq(addrbits
)
671 # If we have any op and the previous op isn't finished,
672 # then keep the same output for next cycle.
673 with m
.If(~r0_stall
):
674 sync
+= tlb_valid_way
.eq(dtlb_valid_bits
[index
])
675 sync
+= tlb_tag_way
.eq(dtlb_tags
[index
])
676 sync
+= tlb_pte_way
.eq(dtlb_ptes
[index
])
678 def maybe_tlb_plrus(self
, m
, r1
, tlb_plru_victim
):
679 """Generate TLB PLRUs
684 if TLB_NUM_WAYS
== 0:
686 for i
in range(TLB_SET_SIZE
):
688 tlb_plru
= PLRU(TLB_WAY_BITS
)
689 setattr(m
.submodules
, "maybe_plru_%d" % i
, tlb_plru
)
690 tlb_plru_acc_en
= Signal()
692 comb
+= tlb_plru_acc_en
.eq(r1
.tlb_hit
& (r1
.tlb_hit_index
== i
))
693 comb
+= tlb_plru
.acc_en
.eq(tlb_plru_acc_en
)
694 comb
+= tlb_plru
.acc_i
.eq(r1
.tlb_hit_way
)
695 comb
+= tlb_plru_victim
[i
].eq(tlb_plru
.lru_o
)
697 def tlb_search(self
, m
, tlb_req_index
, r0
, r0_valid
,
698 tlb_valid_way
, tlb_tag_way
, tlb_hit_way
,
699 tlb_pte_way
, pte
, tlb_hit
, valid_ra
, perm_attr
, ra
):
703 hitway
= Signal(TLB_WAY_BITS
)
705 eatag
= Signal(TLB_EA_TAG_BITS
)
707 TLB_LG_END
= TLB_LG_PGSZ
+ TLB_SET_BITS
708 comb
+= tlb_req_index
.eq(r0
.req
.addr
[TLB_LG_PGSZ
: TLB_LG_END
])
709 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_END
: 64 ])
711 for i
in range(TLB_NUM_WAYS
):
712 is_tag_hit
= Signal()
713 comb
+= is_tag_hit
.eq(tlb_valid_way
[i
]
714 & (read_tlb_tag(i
, tlb_tag_way
) == eatag
))
715 with m
.If(is_tag_hit
):
719 comb
+= tlb_hit
.eq(hit
& r0_valid
)
720 comb
+= tlb_hit_way
.eq(hitway
)
723 comb
+= pte
.eq(read_tlb_pte(hitway
, tlb_pte_way
))
724 comb
+= valid_ra
.eq(tlb_hit | ~r0
.req
.virt_mode
)
726 with m
.If(r0
.req
.virt_mode
):
727 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
728 r0
.req
.addr
[ROW_OFF_BITS
:TLB_LG_PGSZ
],
729 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
730 comb
+= perm_attr
.reference
.eq(pte
[8])
731 comb
+= perm_attr
.changed
.eq(pte
[7])
732 comb
+= perm_attr
.nocache
.eq(pte
[5])
733 comb
+= perm_attr
.priv
.eq(pte
[3])
734 comb
+= perm_attr
.rd_perm
.eq(pte
[2])
735 comb
+= perm_attr
.wr_perm
.eq(pte
[1])
737 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
738 r0
.req
.addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
]))
739 comb
+= perm_attr
.reference
.eq(1)
740 comb
+= perm_attr
.changed
.eq(1)
741 comb
+= perm_attr
.nocache
.eq(0)
742 comb
+= perm_attr
.priv
.eq(1)
743 comb
+= perm_attr
.rd_perm
.eq(1)
744 comb
+= perm_attr
.wr_perm
.eq(1)
746 def tlb_update(self
, m
, r0_valid
, r0
, dtlb_valid_bits
, tlb_req_index
,
747 tlb_hit_way
, tlb_hit
, tlb_plru_victim
, tlb_tag_way
,
748 dtlb_tags
, tlb_pte_way
, dtlb_ptes
):
750 dtlb_valids
= TLBValidBitsArray()
758 comb
+= tlbie
.eq(r0_valid
& r0
.tlbie
)
759 comb
+= tlbwe
.eq(r0_valid
& r0
.tlbld
)
761 m
.submodules
.tlb_update
= d
= DTLBUpdate()
762 with m
.If(tlbie
& r0
.doall
):
763 # clear all valid bits at once
764 for i
in range(TLB_SET_SIZE
):
765 sync
+= dtlb_valid_bits
[i
].eq(0)
766 with m
.If(d
.updated
):
767 sync
+= dtlb_tags
[tlb_req_index
].eq(d
.tb_out
)
768 sync
+= dtlb_ptes
[tlb_req_index
].eq(d
.pb_out
)
769 with m
.If(d
.v_updated
):
770 sync
+= dtlb_valid_bits
[tlb_req_index
].eq(d
.db_out
)
772 comb
+= d
.dv
.eq(dtlb_valid_bits
[tlb_req_index
])
774 comb
+= d
.tlbie
.eq(tlbie
)
775 comb
+= d
.tlbwe
.eq(tlbwe
)
776 comb
+= d
.doall
.eq(r0
.doall
)
777 comb
+= d
.tlb_hit
.eq(tlb_hit
)
778 comb
+= d
.tlb_hit_way
.eq(tlb_hit_way
)
779 comb
+= d
.tlb_tag_way
.eq(tlb_tag_way
)
780 comb
+= d
.tlb_pte_way
.eq(tlb_pte_way
)
781 comb
+= d
.tlb_req_index
.eq(tlb_req_index
)
784 comb
+= d
.repl_way
.eq(tlb_hit_way
)
786 comb
+= d
.repl_way
.eq(tlb_plru_victim
[tlb_req_index
])
787 comb
+= d
.eatag
.eq(r0
.req
.addr
[TLB_LG_PGSZ
+ TLB_SET_BITS
:64])
788 comb
+= d
.pte_data
.eq(r0
.req
.data
)
790 def maybe_plrus(self
, m
, r1
, plru_victim
):
796 if TLB_NUM_WAYS
== 0:
799 for i
in range(NUM_LINES
):
801 plru
= PLRU(WAY_BITS
)
802 setattr(m
.submodules
, "plru%d" % i
, plru
)
803 plru_acc_en
= Signal()
805 comb
+= plru_acc_en
.eq(r1
.cache_hit
& (r1
.hit_index
== i
))
806 comb
+= plru
.acc_en
.eq(plru_acc_en
)
807 comb
+= plru
.acc_i
.eq(r1
.hit_way
)
808 comb
+= plru_victim
[i
].eq(plru
.lru_o
)
810 def cache_tag_read(self
, m
, r0_stall
, req_index
, cache_tag_set
, cache_tags
):
811 """Cache tag RAM read port
815 m_in
, d_in
= self
.m_in
, self
.d_in
817 index
= Signal(INDEX_BITS
)
820 comb
+= index
.eq(req_index
)
821 with m
.Elif(m_in
.valid
):
822 comb
+= index
.eq(get_index(m_in
.addr
))
824 comb
+= index
.eq(get_index(d_in
.addr
))
825 sync
+= cache_tag_set
.eq(cache_tags
[index
])
827 def dcache_request(self
, m
, r0
, ra
, req_index
, req_row
, req_tag
,
828 r0_valid
, r1
, cache_valids
, replace_way
,
829 use_forward1_next
, use_forward2_next
,
830 req_hit_way
, plru_victim
, rc_ok
, perm_attr
,
831 valid_ra
, perm_ok
, access_ok
, req_op
, req_go
,
833 tlb_hit
, tlb_hit_way
, tlb_valid_way
, cache_tag_set
,
834 cancel_store
, req_same_tag
, r0_stall
, early_req_row
):
835 """Cache request parsing and hit detection
839 m_in
, d_in
= self
.m_in
, self
.d_in
842 hit_way
= Signal(WAY_BITS
)
847 hit_set
= Array(Signal(name
="hit_set_%d" % i
) \
848 for i
in range(TLB_NUM_WAYS
))
849 cache_valid_idx
= Signal(NUM_WAYS
)
851 # Extract line, row and tag from request
852 comb
+= req_index
.eq(get_index(r0
.req
.addr
))
853 comb
+= req_row
.eq(get_row(r0
.req
.addr
))
854 comb
+= req_tag
.eq(get_tag(ra
))
856 if False: # display on comb is a bit... busy.
857 comb
+= Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
858 r0
.req
.addr
, ra
, req_index
, req_tag
, req_row
)
860 comb
+= go
.eq(r0_valid
& ~
(r0
.tlbie | r0
.tlbld
) & ~r1
.ls_error
)
861 comb
+= cache_valid_idx
.eq(cache_valids
[req_index
])
863 m
.submodules
.dcache_pend
= dc
= DCachePendingHit(tlb_pte_way
,
864 tlb_valid_way
, tlb_hit_way
,
865 cache_valid_idx
, cache_tag_set
,
869 comb
+= dc
.tlb_hit
.eq(tlb_hit
)
870 comb
+= dc
.reload_tag
.eq(r1
.reload_tag
)
871 comb
+= dc
.virt_mode
.eq(r0
.req
.virt_mode
)
873 comb
+= dc
.req_index
.eq(req_index
)
874 comb
+= is_hit
.eq(dc
.is_hit
)
875 comb
+= hit_way
.eq(dc
.hit_way
)
876 comb
+= req_same_tag
.eq(dc
.rel_match
)
878 # See if the request matches the line currently being reloaded
879 with m
.If((r1
.state
== State
.RELOAD_WAIT_ACK
) &
880 (req_index
== r1
.store_index
) & req_same_tag
):
881 # For a store, consider this a hit even if the row isn't
882 # valid since it will be by the time we perform the store.
883 # For a load, check the appropriate row valid bit.
884 rrow
= Signal(ROW_LINE_BITS
)
885 comb
+= rrow
.eq(req_row
)
886 valid
= r1
.rows_valid
[rrow
]
887 comb
+= is_hit
.eq((~r0
.req
.load
) | valid
)
888 comb
+= hit_way
.eq(replace_way
)
890 # Whether to use forwarded data for a load or not
891 with m
.If((get_row(r1
.req
.real_addr
) == req_row
) &
892 (r1
.req
.hit_way
== hit_way
)):
893 # Only need to consider r1.write_bram here, since if we
894 # are writing refill data here, then we don't have a
895 # cache hit this cycle on the line being refilled.
896 # (There is the possibility that the load following the
897 # load miss that started the refill could be to the old
898 # contents of the victim line, since it is a couple of
899 # cycles after the refill starts before we see the updated
900 # cache tag. In that case we don't use the bypass.)
901 comb
+= use_forward1_next
.eq(r1
.write_bram
)
902 with m
.If((r1
.forward_row1
== req_row
) & (r1
.forward_way1
== hit_way
)):
903 comb
+= use_forward2_next
.eq(r1
.forward_valid1
)
905 # The way that matched on a hit
906 comb
+= req_hit_way
.eq(hit_way
)
908 # The way to replace on a miss
909 with m
.If(r1
.write_tag
):
910 comb
+= replace_way
.eq(plru_victim
[r1
.store_index
])
912 comb
+= replace_way
.eq(r1
.store_way
)
914 # work out whether we have permission for this access
915 # NB we don't yet implement AMR, thus no KUAP
916 comb
+= rc_ok
.eq(perm_attr
.reference
917 & (r0
.req
.load | perm_attr
.changed
))
918 comb
+= perm_ok
.eq((r0
.req
.priv_mode |
(~perm_attr
.priv
)) &
920 (r0
.req
.load
& perm_attr
.rd_perm
)))
921 comb
+= access_ok
.eq(valid_ra
& perm_ok
& rc_ok
)
922 # Combine the request and cache hit status to decide what
923 # operation needs to be done
924 comb
+= nc
.eq(r0
.req
.nc | perm_attr
.nocache
)
925 comb
+= op
.eq(Op
.OP_NONE
)
927 with m
.If(~access_ok
):
928 comb
+= op
.eq(Op
.OP_BAD
)
929 with m
.Elif(cancel_store
):
930 comb
+= op
.eq(Op
.OP_STCX_FAIL
)
932 comb
+= opsel
.eq(Cat(is_hit
, nc
, r0
.req
.load
))
933 with m
.Switch(opsel
):
934 with m
.Case(0b101): comb
+= op
.eq(Op
.OP_LOAD_HIT
)
935 with m
.Case(0b100): comb
+= op
.eq(Op
.OP_LOAD_MISS
)
936 with m
.Case(0b110): comb
+= op
.eq(Op
.OP_LOAD_NC
)
937 with m
.Case(0b001): comb
+= op
.eq(Op
.OP_STORE_HIT
)
938 with m
.Case(0b000): comb
+= op
.eq(Op
.OP_STORE_MISS
)
939 with m
.Case(0b010): comb
+= op
.eq(Op
.OP_STORE_MISS
)
940 with m
.Case(0b011): comb
+= op
.eq(Op
.OP_BAD
)
941 with m
.Case(0b111): comb
+= op
.eq(Op
.OP_BAD
)
942 comb
+= req_op
.eq(op
)
943 comb
+= req_go
.eq(go
)
945 # Version of the row number that is valid one cycle earlier
946 # in the cases where we need to read the cache data BRAM.
947 # If we're stalling then we need to keep reading the last
949 with m
.If(~r0_stall
):
950 with m
.If(m_in
.valid
):
951 comb
+= early_req_row
.eq(get_row(m_in
.addr
))
953 comb
+= early_req_row
.eq(get_row(d_in
.addr
))
955 comb
+= early_req_row
.eq(req_row
)
957 def reservation_comb(self
, m
, cancel_store
, set_rsrv
, clear_rsrv
,
958 r0_valid
, r0
, reservation
):
959 """Handle load-with-reservation and store-conditional instructions
963 with m
.If(r0_valid
& r0
.req
.reserve
):
964 # XXX generate alignment interrupt if address
965 # is not aligned XXX or if r0.req.nc = '1'
966 with m
.If(r0
.req
.load
):
967 comb
+= set_rsrv
.eq(r0
.req
.atomic_last
) # load with reservation
969 comb
+= clear_rsrv
.eq(r0
.req
.atomic_last
) # store conditional
970 with m
.If((~reservation
.valid
) |
971 (r0
.req
.addr
[LINE_OFF_BITS
:64] != reservation
.addr
)):
972 comb
+= cancel_store
.eq(1)
974 def reservation_reg(self
, m
, r0_valid
, access_ok
, set_rsrv
, clear_rsrv
,
980 with m
.If(r0_valid
& access_ok
):
981 with m
.If(clear_rsrv
):
982 sync
+= reservation
.valid
.eq(0)
983 with m
.Elif(set_rsrv
):
984 sync
+= reservation
.valid
.eq(1)
985 sync
+= reservation
.addr
.eq(r0
.req
.addr
[LINE_OFF_BITS
:64])
def writeback_control(self, m, r1, cache_out_row):
    """Return data for loads & completion control logic.

    Drives d_out (to loadstore1) and m_out (to the MMU) from the
    stage-1 register set r1, overlaying 1/2-cycle-old forwarded store
    data on the cache RAM output byte lane by byte lane.

    NOTE(review): lines marked (restored) were dropped by the text
    extraction this chunk came from — verify against upstream
    libre-soc soc/experiment/dcache.py.
    """
    comb = m.d.comb                      # (restored)
    sync = m.d.sync                      # (restored)
    d_out, m_out = self.d_out, self.m_out

    data_out = Signal(64)
    data_fwd = Signal(64)

    # Use the bypass if are reading the row that was
    # written 1 or 2 cycles ago, including for the
    # slow_valid = 1 case (i.e. completing a load
    # miss or a non-cacheable load).
    with m.If(r1.use_forward1):
        comb += data_fwd.eq(r1.forward_data1)
    with m.Else():                       # (restored)
        comb += data_fwd.eq(r1.forward_data2)

    comb += data_out.eq(cache_out_row)

    # overlay forwarded bytes on the RAM output, one byte lane at a time
    for i in range(8):                   # (restored loop header)
        with m.If(r1.forward_sel[i]):
            dsel = data_fwd.word_select(i, 8)
            comb += data_out.word_select(i, 8).eq(dsel)

    # outputs to loadstore1
    comb += d_out.valid.eq(r1.ls_valid)
    comb += d_out.data.eq(data_out)
    comb += d_out.store_done.eq(~r1.stcx_fail)
    comb += d_out.error.eq(r1.ls_error)
    comb += d_out.cache_paradox.eq(r1.cache_paradox)

    # outputs to MMU
    comb += m_out.done.eq(r1.mmu_done)
    comb += m_out.err.eq(r1.mmu_error)
    comb += m_out.data.eq(data_out)

    # We have a valid load or store hit or we just completed
    # a slow op such as a load miss, a NC load or a store
    #
    # Note: the load hit is delayed by one cycle. However it
    # can still not collide with r.slow_valid (well unless I
    # miscalculated) because slow_valid can only be set on a
    # subsequent request and not on its first cycle (the state
    # machine must have advanced), which makes slow_valid
    # at least 2 cycles from the previous hit_load_valid.

    # Sanity: Only one of these must be set in any given cycle

    if False: # TODO: need Display to get this to work
        assert (r1.slow_valid & r1.stcx_fail) != 1, \
            "unexpected slow_valid collision with stcx_fail"
        assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
            "unexpected hit_load_delayed collision with slow_valid"

    with m.If(~r1.mmu_req):
        # Request came from loadstore1...
        # Load hit case is the standard path
        with m.If(r1.hit_load_valid):
            sync += Display("completing load hit data=%x", data_out)

        # error cases complete without stalling
        with m.If(r1.ls_error):
            sync += Display("completing ld/st with error")

        # Slow ops (load miss, NC, stores)
        with m.If(r1.slow_valid):
            sync += Display("completing store or load miss adr=%x data=%x",
                            r1.req.real_addr, data_out)
    with m.Else():                       # (restored)
        # Request came from MMU
        with m.If(r1.hit_load_valid):
            sync += Display("completing load hit to MMU, data=%x",
                            m_out.data)  # (restored continuation — confirm)

        # error cases complete without stalling
        with m.If(r1.mmu_error):
            # typo "combpleting" in the original log message fixed here
            sync += Display("completing MMU ld with error")

        # Slow ops (i.e. load miss)
        with m.If(r1.slow_valid):
            sync += Display("completing MMU load miss, data=%x",
                            m_out.data)  # (restored continuation — confirm)
def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
    """
    Generate a cache RAM for each way. This handles the normal
    reads, writes from reloads and the special store-hit update
    path as well.

    Note: the BRAMs have an extra read buffer, meaning the output
    is pipelined an extra cycle. This differs from the
    icache. The writeback logic needs to take that into
    account by using 1-cycle delayed signals for load hits.

    NOTE(review): lines marked (restored) were dropped by extraction —
    verify against upstream libre-soc soc/experiment/dcache.py.
    """
    comb = m.d.comb                      # (restored)
    wb_in = self.wb_in                   # (restored)

    for i in range(NUM_WAYS):
        do_read = Signal(name="do_rd%d" % i)
        rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
        do_write = Signal(name="do_wr%d" % i)
        wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
        wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
        wr_sel = Signal(ROW_SIZE)
        wr_sel_m = Signal(ROW_SIZE)
        _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

        way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
        setattr(m.submodules, "cacheram_%d" % i, way)

        comb += way.rd_en.eq(do_read)
        comb += way.rd_addr.eq(rd_addr)
        comb += _d_out.eq(way.rd_data_o)
        comb += way.wr_sel.eq(wr_sel_m)
        comb += way.wr_addr.eq(wr_addr)
        comb += way.wr_data.eq(wr_data)

        # Cache hit reads: read every way every cycle; the hit way
        # is selected onto cache_out_row below.
        comb += do_read.eq(1)
        comb += rd_addr.eq(early_req_row)
        with m.If(r1.hit_way == i):
            comb += cache_out_row.eq(_d_out)

        # Write mux:
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.
        with m.If(r1.write_bram):
            # Write store data to BRAM. This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

            with m.If(i == r1.req.hit_way):
                comb += do_write.eq(1)
        with m.Else():                   # (restored)
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):          # (restored — confirm condition)
                comb += wr_data.eq(0)
            with m.Else():               # (restored)
                comb += wr_data.eq(wb_in.dat)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

            with m.If((r1.state == State.RELOAD_WAIT_ACK)
                      & wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

        # Mask write selects with do_write since BRAM
        # doesn't have a global write-enable
        with m.If(do_write):
            comb += wr_sel_m.eq(wr_sel)
1146 # Cache hit synchronous machine for the easy case.
1147 # This handles load hits.
1148 # It also handles error cases (TLB miss, cache paradox)
def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                    req_hit_way, req_index, req_tag, access_ok,
                    tlb_hit, tlb_hit_way, tlb_req_index):
    """Cache hit synchronous machine for the easy case.
    Handles load hits, and the error cases (TLB miss, cache paradox),
    latching results into r1 for the writeback stage.

    NOTE(review): lines marked (restored) were dropped by extraction —
    verify against upstream libre-soc soc/experiment/dcache.py.
    """
    comb = m.d.comb                      # (restored)
    sync = m.d.sync                      # (restored)

    with m.If(req_op != Op.OP_NONE):
        sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                        req_op, r0.req.addr, r0.req.nc,
                        req_index, req_tag, req_hit_way)

    with m.If(r0_valid):
        sync += r1.mmu_req.eq(r0.mmu_req)

    # Fast path for load/store hits.
    # Set signals for the writeback controls.
    sync += r1.hit_way.eq(req_hit_way)
    sync += r1.hit_index.eq(req_index)

    with m.If(req_op == Op.OP_LOAD_HIT):
        sync += r1.hit_load_valid.eq(1)
    with m.Else():                       # (restored)
        sync += r1.hit_load_valid.eq(0)

    with m.If((req_op == Op.OP_LOAD_HIT) |
              (req_op == Op.OP_STORE_HIT)):
        sync += r1.cache_hit.eq(1)
    with m.Else():                       # (restored)
        sync += r1.cache_hit.eq(0)

    with m.If(req_op == Op.OP_BAD):
        # Display(f"Signalling ld/st error valid_ra={valid_ra}"
        #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
        sync += r1.ls_error.eq(~r0.mmu_req)
        sync += r1.mmu_error.eq(r0.mmu_req)
        sync += r1.cache_paradox.eq(access_ok)
    with m.Else():                       # (restored)
        sync += r1.ls_error.eq(0)
        sync += r1.mmu_error.eq(0)
        sync += r1.cache_paradox.eq(0)

    with m.If(req_op == Op.OP_STCX_FAIL):
        sync += r1.stcx_fail.eq(1)
    with m.Else():                       # (restored)
        sync += r1.stcx_fail.eq(0)

    # Record TLB hit information for updating TLB PLRU
    sync += r1.tlb_hit.eq(tlb_hit)
    sync += r1.tlb_hit_way.eq(tlb_hit_way)
    sync += r1.tlb_hit_index.eq(tlb_req_index)
1201 # Memory accesses are handled by this state machine:
1203 # * Cache load miss/reload (in conjunction with "rams")
1204 # * Load hits for non-cachable forms
1205 # * Stores (the collision case is handled in "rams")
1207 # All wishbone requests generation is done here.
1208 # This machine operates at stage 1.
def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                cache_valids, r0, replace_way,
                req_hit_way, req_same_tag,
                r0_valid, req_op, cache_tags, req_go, ra):
    """Memory access state machine (stage 1).

    Handles: cache load miss/reload (with "rams"), non-cacheable load
    hits, and stores (collision case handled in "rams").  All wishbone
    request generation is done here.

    NOTE(review): lines marked (restored) were dropped by extraction —
    verify against upstream libre-soc soc/experiment/dcache.py.
    """
    comb, sync = m.d.comb, m.d.sync      # (restored)
    wb_in, d_in = self.wb_in, self.d_in  # (restored)

    req = MemAccessRequest("mreq_ds")

    req_row = Signal(ROW_BITS)
    req_idx = Signal(INDEX_BITS)
    req_tag = Signal(TAG_BITS)
    comb += req_idx.eq(get_index(req.real_addr))
    comb += req_row.eq(get_row(req.real_addr))
    comb += req_tag.eq(get_tag(req.real_addr))

    sync += r1.use_forward1.eq(use_forward1_next)
    sync += r1.forward_sel.eq(0)

    with m.If(use_forward1_next):
        sync += r1.forward_sel.eq(r1.req.byte_sel)
    with m.Elif(use_forward2_next):
        sync += r1.forward_sel.eq(r1.forward_sel1)

    sync += r1.forward_data2.eq(r1.forward_data1)
    with m.If(r1.write_bram):
        sync += r1.forward_data1.eq(r1.req.data)
        sync += r1.forward_sel1.eq(r1.req.byte_sel)
        sync += r1.forward_way1.eq(r1.req.hit_way)
        sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
        sync += r1.forward_valid1.eq(1)
    with m.Else():                       # (restored)
        with m.If(r1.dcbz):              # (restored — confirm condition)
            sync += r1.forward_data1.eq(0)
        with m.Else():                   # (restored)
            sync += r1.forward_data1.eq(wb_in.dat)
        sync += r1.forward_sel1.eq(~0) # all 1s
        sync += r1.forward_way1.eq(replace_way)
        sync += r1.forward_row1.eq(r1.store_row)
        sync += r1.forward_valid1.eq(0)

    # One cycle pulses reset
    sync += r1.slow_valid.eq(0)
    sync += r1.write_bram.eq(0)
    sync += r1.inc_acks.eq(0)
    sync += r1.dec_acks.eq(0)

    sync += r1.ls_valid.eq(0)
    # complete tlbies and TLB loads in the third cycle
    sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

    with m.If((req_op == Op.OP_LOAD_HIT) |
              (req_op == Op.OP_STCX_FAIL)):
        with m.If(~r0.mmu_req):
            sync += r1.ls_valid.eq(1)
        with m.Else():                   # (restored)
            sync += r1.mmu_done.eq(1)

    with m.If(r1.write_tag):
        # Store new tag in selected way
        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                ct = Signal(TAG_RAM_WIDTH)
                comb += ct.eq(cache_tags[r1.store_index])
                # original microwatt VHDL, kept for reference:
                # cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1
                #     downto i * TAG_WIDTH) <=
                #   (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
                comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                sync += cache_tags[r1.store_index].eq(ct)
        sync += r1.store_way.eq(replace_way)
        sync += r1.write_tag.eq(0)

    # Take request from r1.req if there is one there,
    # else from req_op, ra, etc.
    with m.If(r1.full):                  # (restored)
        comb += req.eq(r1.req)
    with m.Else():                       # (restored)
        comb += req.op.eq(req_op)
        comb += req.valid.eq(req_go)
        comb += req.mmu_req.eq(r0.mmu_req)
        comb += req.dcbz.eq(r0.req.dcbz)
        comb += req.real_addr.eq(ra)

        with m.If(r0.req.dcbz):
            # force data to 0 for dcbz
            comb += req.data.eq(0)
        with m.Elif(r0.d_valid):
            comb += req.data.eq(r0.req.data)
        with m.Else():                   # (restored)
            comb += req.data.eq(d_in.data)

        # Select all bytes for dcbz
        # and for cacheable loads
        with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
            comb += req.byte_sel.eq(~0) # all 1s
        with m.Else():                   # (restored)
            comb += req.byte_sel.eq(r0.req.byte_sel)
        comb += req.hit_way.eq(req_hit_way)
        comb += req.same_tag.eq(req_same_tag)

    # Store the incoming request from r0,
    # if it is a slow request
    # Note that r1.full = 1 implies req_op = OP_NONE
    with m.If((req_op == Op.OP_LOAD_MISS)
              | (req_op == Op.OP_LOAD_NC)
              | (req_op == Op.OP_STORE_MISS)
              | (req_op == Op.OP_STORE_HIT)):
        sync += r1.req.eq(req)
        sync += r1.full.eq(1)

    # Main state machine
    with m.Switch(r1.state):

        with m.Case(State.IDLE):
            sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
            sync += r1.wb.sel.eq(req.byte_sel)
            sync += r1.wb.dat.eq(req.data)
            sync += r1.dcbz.eq(req.dcbz)

            # Keep track of our index and way
            # for subsequent stores.
            sync += r1.store_index.eq(req_idx)
            sync += r1.store_row.eq(req_row)
            sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
            sync += r1.reload_tag.eq(req_tag)
            sync += r1.req.same_tag.eq(1)

            with m.If(req.op == Op.OP_STORE_HIT):
                sync += r1.store_way.eq(req.hit_way)

            # Reset per-row valid bits,
            # ready for handling OP_LOAD_MISS
            for i in range(ROW_PER_LINE):
                sync += r1.rows_valid[i].eq(0)

            with m.If(req_op != Op.OP_NONE):
                sync += Display("cache op %d", req.op)

            with m.Switch(req.op):
                with m.Case(Op.OP_LOAD_HIT):
                    # stay in IDLE state
                    pass                 # (restored)

                with m.Case(Op.OP_LOAD_MISS):
                    sync += Display("cache miss real addr: %x " \
                         "idx: %x tag: %x",  # (restored continuation)
                         req.real_addr, req_row, req_tag)

                    # Start the wishbone cycle
                    sync += r1.wb.we.eq(0)
                    sync += r1.wb.cyc.eq(1)
                    sync += r1.wb.stb.eq(1)

                    # Track that we had one request sent
                    sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                    sync += r1.write_tag.eq(1)

                with m.Case(Op.OP_LOAD_NC):
                    sync += r1.wb.cyc.eq(1)
                    sync += r1.wb.stb.eq(1)
                    sync += r1.wb.we.eq(0)
                    sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                    with m.If(~req.dcbz):
                        sync += r1.state.eq(State.STORE_WAIT_ACK)
                        sync += r1.acks_pending.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        with m.If(~req.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():   # (restored)
                            sync += r1.mmu_done.eq(1)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                    with m.Else():       # (restored)
                        # dcbz is handled much like a load miss except
                        # that we are writing to memory instead of reading
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                        with m.If(req.op == Op.OP_STORE_MISS):
                            sync += r1.write_tag.eq(1)

                    sync += r1.wb.we.eq(1)
                    sync += r1.wb.cyc.eq(1)
                    sync += r1.wb.stb.eq(1)

                # OP_NONE and OP_BAD do nothing
                # OP_BAD & OP_STCX_FAIL were
                # handled above already
                with m.Case(Op.OP_NONE):
                    pass                 # (restored)

                with m.Case(Op.OP_BAD):
                    pass                 # (restored)

                with m.Case(Op.OP_STCX_FAIL):
                    pass                 # (restored)

        with m.Case(State.RELOAD_WAIT_ACK):
            ld_stbs_done = Signal()
            # Requests are all sent if stb is 0
            comb += ld_stbs_done.eq(~r1.wb.stb)

            # If we are still sending requests, was one accepted?
            with m.If((~wb_in.stall) & r1.wb.stb):
                # That was the last word? We are done sending.
                # Clear stb and set ld_stbs_done so we can handle an
                # eventual last ack on the same cycle.
                # sigh - reconstruct wb adr with 3 extra 0s at front
                wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                    sync += r1.wb.stb.eq(0)
                    comb += ld_stbs_done.eq(1)

                # Calculate the next row address in the current cache line
                row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                comb += row.eq(r1.wb.adr)
                sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

            # Incoming acks processing
            sync += r1.forward_valid1.eq(wb_in.ack)
            with m.If(wb_in.ack):
                srow = Signal(ROW_LINE_BITS)
                comb += srow.eq(r1.store_row)
                sync += r1.rows_valid[srow].eq(1)

                # If this is the data we were looking for,
                # we can complete the request next cycle.
                # Compare the whole address in case the
                # request in r1.req is not the one that
                # started this refill.
                with m.If(req.valid & r1.req.same_tag &
                          ((r1.dcbz & r1.req.dcbz) |
                           (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                          (r1.store_row == get_row(req.real_addr))):
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)
                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():       # (restored)
                        sync += r1.mmu_done.eq(1)
                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)

                # Check for completion
                with m.If(ld_stbs_done & is_last_row(r1.store_row,
                          r1.end_row_ix)):  # (restored continuation)
                    # Complete wishbone cycle
                    sync += r1.wb.cyc.eq(0)

                    # Cache line is now valid
                    cv = Signal(INDEX_BITS)
                    comb += cv.eq(cache_valids[r1.store_index])
                    comb += cv.bit_select(r1.store_way, 1).eq(1)
                    sync += cache_valids[r1.store_index].eq(cv)

                    sync += r1.state.eq(State.IDLE)

                # Increment store row counter
                sync += r1.store_row.eq(next_row(r1.store_row))

        with m.Case(State.STORE_WAIT_ACK):
            st_stbs_done = Signal()
            acks = Signal(3)             # (restored declaration)
            adjust_acks = Signal(3)

            comb += st_stbs_done.eq(~r1.wb.stb)
            comb += acks.eq(r1.acks_pending)

            with m.If(r1.inc_acks != r1.dec_acks):
                with m.If(r1.inc_acks):
                    comb += adjust_acks.eq(acks + 1)
                with m.Else():           # (restored)
                    comb += adjust_acks.eq(acks - 1)
            with m.Else():               # (restored)
                comb += adjust_acks.eq(acks)

            sync += r1.acks_pending.eq(adjust_acks)

            # Clear stb when slave accepted request
            with m.If(~wb_in.stall):
                # See if there is another store waiting
                # to be done which is in the same real page.
                with m.If(req.valid):
                    _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                    sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                    sync += r1.wb.dat.eq(req.data)
                    sync += r1.wb.sel.eq(req.byte_sel)

                with m.If((adjust_acks < 7) & req.same_tag &
                          ((req.op == Op.OP_STORE_MISS)
                           | (req.op == Op.OP_STORE_HIT))):
                    sync += r1.wb.stb.eq(1)
                    comb += st_stbs_done.eq(0)

                    with m.If(req.op == Op.OP_STORE_HIT):
                        sync += r1.write_bram.eq(1)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    # Store requests never come from the MMU
                    sync += r1.ls_valid.eq(1)
                    comb += st_stbs_done.eq(0)
                    sync += r1.inc_acks.eq(1)
                with m.Else():           # (restored)
                    sync += r1.wb.stb.eq(0)
                    comb += st_stbs_done.eq(1)

            # Got ack ? See if complete.
            with m.If(wb_in.ack):
                with m.If(st_stbs_done & (adjust_acks == 1)):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)
                sync += r1.dec_acks.eq(1)

        with m.Case(State.NC_LOAD_WAIT_ACK):
            # Clear stb when slave accepted request
            with m.If(~wb_in.stall):
                sync += r1.wb.stb.eq(0)

            # Got ack ? complete.
            with m.If(wb_in.ack):
                sync += r1.state.eq(State.IDLE)
                sync += r1.full.eq(0)
                sync += r1.slow_valid.eq(1)

                with m.If(~r1.mmu_req):
                    sync += r1.ls_valid.eq(1)
                with m.Else():           # (restored)
                    sync += r1.mmu_done.eq(1)

                sync += r1.forward_sel.eq(~0) # all 1s
                sync += r1.use_forward1.eq(1)
                sync += r1.wb.cyc.eq(0)
                sync += r1.wb.stb.eq(0)
def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
    """Pack internal debug state into log_out for log-data collection.

    NOTE(review): this function is disabled at its call site (the
    elaborate() call is commented out); `req_op` is referenced here but
    is not a parameter — pre-existing issue in the original, left as-is.
    Lines marked (restored) were dropped by extraction — confirm
    against upstream libre-soc soc/experiment/dcache.py.
    """
    sync = m.d.sync                      # (restored)
    d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

    sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                           stall_out, req_op[:3], d_out.valid, d_out.error,
                           r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                           r1.real_adr[3:6]))  # (restored tail — confirm)
def elaborate(self, platform):
    """Build the DCache module: storage, TLB, PLRUs, request decode,
    reservation handling, RAMs, fast-hit and slow state machines.

    NOTE(review): many declaration lines were dropped by extraction
    (Module construction, several Signal declarations, return) and are
    reconstructed here, marked (restored) — verify names and widths
    against upstream libre-soc soc/experiment/dcache.py.
    """
    m = Module()                         # (restored)
    comb = m.d.comb                      # (restored)
    d_in = self.d_in                     # (restored)

    # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
    cache_tags = CacheTagArray()
    cache_tag_set = Signal(TAG_RAM_WIDTH)
    cache_valids = CacheValidBitsArray()

    # TODO attribute ram_style : string;
    # TODO attribute ram_style of cache_tags : signal is "distributed";

    """note: these are passed to nmigen.hdl.Memory as "attributes".
       don't know how, just that they are.
    """
    dtlb_valid_bits = TLBValidBitsArray()
    dtlb_tags = TLBTagsArray()
    dtlb_ptes = TLBPtesArray()
    # TODO attribute ram_style of
    #  dtlb_tags : signal is "distributed";
    # TODO attribute ram_style of
    #  dtlb_ptes : signal is "distributed";

    r0 = RegStage0("r0")
    r0_full = Signal()                   # (restored)

    r1 = RegStage1("r1")

    reservation = Reservation()

    # Async signals on incoming request
    req_index = Signal(INDEX_BITS)
    req_row = Signal(ROW_BITS)
    req_hit_way = Signal(WAY_BITS)
    req_tag = Signal(TAG_BITS)
    req_op = Signal(Op)                  # (restored)
    req_data = Signal(64)
    req_same_tag = Signal()
    req_go = Signal()                    # (restored)

    early_req_row = Signal(ROW_BITS)

    cancel_store = Signal()
    set_rsrv = Signal()                  # (restored)
    clear_rsrv = Signal()

    r0_valid = Signal()                  # (restored)
    r0_stall = Signal()                  # (restored)

    use_forward1_next = Signal()
    use_forward2_next = Signal()

    cache_out_row = Signal(WB_DATA_BITS)

    plru_victim = PLRUOut()
    replace_way = Signal(WAY_BITS)

    # Wishbone read/write/cache write formatting signals
    bus_sel = Signal(8)                  # (restored — confirm)

    # TLB signals                        # (restored comment)
    tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
    tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
    tlb_valid_way = Signal(TLB_NUM_WAYS)
    tlb_req_index = Signal(TLB_SET_BITS)
    tlb_hit = Signal()                   # (restored)
    tlb_hit_way = Signal(TLB_WAY_BITS)
    pte = Signal(TLB_PTE_BITS)
    ra = Signal(REAL_ADDR_BITS)
    valid_ra = Signal()                  # (restored)
    perm_attr = PermAttr("dc_perms")
    rc_ok = Signal()                     # (restored)
    perm_ok = Signal()                   # (restored)
    access_ok = Signal()

    tlb_plru_victim = TLBPLRUOut()

    # we don't yet handle collisions between loadstore1 requests
    # and MMU requests
    comb += self.m_out.stall.eq(0)

    # Hold off the request in r0 when r1 has an uncompleted request
    comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
    comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
    comb += self.stall_out.eq(r0_stall)

    # Wire up wishbone request latch out of stage 1
    comb += self.wb_out.eq(r1.wb)

    # deal with litex not doing wishbone pipeline mode
    # XXX in wrong way. FIFOs are needed in the SRAM test
    # so that stb/ack match up
    comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)

    # call sub-functions putting everything together, using shared
    # signals established above
    self.stage_0(m, r0, r1, r0_full)
    self.tlb_read(m, r0_stall, tlb_valid_way,
                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                  dtlb_tags, dtlb_ptes)
    self.tlb_search(m, tlb_req_index, r0, r0_valid,
                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
    self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                    dtlb_tags, tlb_pte_way, dtlb_ptes)
    self.maybe_plrus(m, r1, plru_victim)
    self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
    self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
    self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                        r0_valid, r1, cache_valids, replace_way,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
                        tlb_pte_way,     # (restored — confirm argument)
                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                        cancel_store, req_same_tag, r0_stall, early_req_row)
    self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                          r0_valid, r0, reservation)
    self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                         reservation, r0)  # (restored continuation)
    self.writeback_control(m, r1, cache_out_row)
    self.rams(m, r1, early_req_row, cache_out_row, replace_way)
    self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
                         tlb_hit, tlb_hit_way, tlb_req_index)
    self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                     cache_valids, r0, replace_way,
                     req_hit_way, req_same_tag,
                     r0_valid, req_op, cache_tags, req_go, ra)
    #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

    return m                             # (restored)
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue a load at `addr` (non-cacheable if `nc`)
    and wait for d_out.valid, returning the loaded 64-bit data.

    NOTE(review): the `yield` settle statements and `return` were
    dropped by extraction and are restored here.
    """
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.valid.eq(1)
    yield                                # (restored)
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    while not (yield dut.d_out.valid):
        yield                            # (restored)
    # yield # data is valid one cycle AFTER valid goes hi? (no it isn't)
    data = yield dut.d_out.data
    return data                          # (restored)
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue a store of `data` to `addr`.

    IMPORTANT (per file header): the store data is sampled the cycle
    AFTER "valid", hence data is driven after the first yield.
    NOTE(review): dropped `yield` statements restored.
    """
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    yield                                # (restored)
    yield dut.d_in.data.eq(data)    # leave set, but the cycle AFTER
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    while not (yield dut.d_out.valid):
        yield                            # (restored)
def dcache_random_sim(dut, mem, nc=0):
    """Random load/store soak test against a software model of memory.

    NOTE(review): several lines (settle yields, row/addr bookkeeping)
    were dropped by extraction and are reconstructed, marked (restored)
    — verify against upstream libre-soc soc/experiment/dcache.py.
    """
    # start copy of mem
    sim_mem = deepcopy(mem)
    memsize = len(sim_mem)
    print ("mem len", memsize)

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield                                # (restored)
    yield                                # (restored)
    yield                                # (restored)
    yield                                # (restored)

    #for i in range(1024):
    #    sim_mem[i] = i

    for i in range(1024):
        addr = randint(0, memsize-1)
        data = randint(0, (1<<64)-1)
        sim_mem[addr] = data
        row = addr                       # (restored)
        addr *= 8                        # (restored)

        print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))

        yield from dcache_load(dut, addr, nc)
        yield from dcache_store(dut, addr, data, nc)

        addr = randint(0, memsize-1)
        sim_data = sim_mem[addr]
        row = addr                       # (restored)
        addr *= 8                        # (restored)

        print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
        data = yield from dcache_load(dut, addr, nc)
        assert data == sim_data, \
            "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)

    for addr in range(memsize):
        data = yield from dcache_load(dut, addr*8, nc)
        assert data == sim_mem[addr], \
            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
def dcache_regression_sim(dut, mem, nc=0):
    """Minimal single-address regression test against a software model.

    NOTE(review): the extraction dropped the setup lines between the
    init block and the first load (loop index / addr / row setup); they
    are reconstructed conservatively here, marked (restored) — verify
    against upstream libre-soc soc/experiment/dcache.py.
    """
    # start copy of mem
    sim_mem = deepcopy(mem)
    memsize = len(sim_mem)
    print ("mem len", memsize)

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield                                # (restored)
    yield                                # (restored)
    yield                                # (restored)
    yield                                # (restored)

    i = 0                                # (restored — confirm upstream)
    addr = 0                             # (restored — confirm upstream)
    row = addr                           # (restored)
    addr *= 8                            # (restored)

    print ("random testing %d 0x%x row %d" % (i, addr, row))

    yield from dcache_load(dut, addr, nc)

    addr = row                           # (restored — index back into model)
    sim_data = sim_mem[addr]
    row = addr                           # (restored)
    addr *= 8                            # (restored)

    print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
    data = yield from dcache_load(dut, addr, nc)
    assert data == sim_data, \
        "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
def dcache_sim(dut, mem):
    """Directed test: cacheable loads, a non-cacheable load, stores,
    and read-after-write checks at fixed addresses.

    NOTE(review): settle `yield`s after the init block were dropped by
    extraction and are restored.  The `f"..." % (...)` assert-message
    pattern is an original quirk (the f-strings contain no braces) and
    is preserved byte-for-byte.
    """
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield                                # (restored)
    yield                                # (restored)
    yield                                # (restored)
    yield                                # (restored)

    # Cacheable read of address 4
    data = yield from dcache_load(dut, 0x58)
    addr = yield dut.d_in.addr
    assert data == 0x0000001700000016, \
        f"data @%x=%x expected 0x0000001700000016" % (addr, data)

    # Cacheable read of address 20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        f"data @%x=%x expected 0x0000000900000008" % (addr, data)

    # Cacheable read of address 30
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        f"data @%x=%x expected 0000014D0000014C" % (addr, data)

    # 2nd Cacheable read of address 30
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        f"data @%x=%x expected 0000014D0000014C" % (addr, data)

    # Non-cacheable read of address 100
    data = yield from dcache_load(dut, 0x100, nc=1)
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        f"data @%x=%x expected 0000004100000040" % (addr, data)

    # Store at address 530
    yield from dcache_store(dut, 0x530, 0x121)

    # Store at address 30
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3nd Cacheable read of address 530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x12345678, \
        f"data @%x=%x expected 0x12345678" % (addr, data)

    # 4th Cacheable read of address 20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
def test_dcache(mem, test_fn, test_name):
    """Wire a DCache to an SRAM-backed wishbone slave and run `test_fn`
    as a sync process, dumping a VCD named after `test_name`.

    NOTE(review): lines marked (restored) were dropped by extraction —
    verify against upstream libre-soc soc/experiment/dcache.py.
    """
    dut = DCache()                       # (restored)

    memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()                         # (restored)
    m.submodules.dcache = dut
    m.submodules.sram = sram

    # master (dcache) -> slave (sram) wishbone wiring
    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    # slave -> master responses
    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    dcache_write_gtkw(test_name)

    # nmigen Simulation
    sim = Simulator(m)                   # (restored)
    sim.add_clock(1e-6)                  # (restored)

    sim.add_sync_process(wrap(test_fn(dut, mem)))
    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()                        # (restored)
def dcache_write_gtkw(test_name):
    """Emit a gtkwave save file grouping the dcache's port signals.

    NOTE(review): the grouping/list structure of `traces` was dropped
    by extraction and is reconstructed from the surviving signal-name
    strings — verify against upstream libre-soc soc/experiment/dcache.py.
    """
    traces = [
        'clk',
        ('d_in', [
            'd_in_load', 'd_in_nc', 'd_in_addr[63:0]', 'd_in_data[63:0]',
            'd_in_byte_sel[7:0]', 'd_in_valid'
        ]),
        ('d_out', [
            'd_out_valid', 'd_out_data[63:0]'
        ]),
        ('wb_out', [
            'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
            'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
        ]),
        ('wb_in', [
            'wb_in_stall', 'wb_in_ack', 'wb_in_dat[63:0]'
        ])
    ]
    write_gtkw('test_dcache%s.gtkw' % test_name,
               'test_dcache%s.vcd' % test_name,
               traces, module='top.dcache')
if __name__ == '__main__':
    # NOTE(review): several setup statements (seed, dut construction,
    # mem initialisation, f.write) were dropped by extraction and are
    # reconstructed here, marked (restored) — memsize values in
    # particular need confirming against upstream dcache.py.
    seed(0)                              # (restored)

    dut = DCache()                       # (restored)
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)                      # (restored)

    # run simple regression on a small memory
    mem = []                             # (restored)
    memsize = 16                         # (restored — confirm)
    for i in range(memsize):
        mem.append(i)                    # (restored — confirm init values)

    test_dcache(mem, dcache_regression_sim, "simpleregression")

    # random soak test
    mem = []                             # (restored)
    memsize = 256                        # (restored — confirm)
    for i in range(memsize):
        mem.append(i)                    # (restored — confirm init values)

    test_dcache(mem, dcache_random_sim, "random")

    # directed test: pairs of 32-bit counters packed into 64-bit words
    mem = []                             # (restored)
    for i in range(1024):
        mem.append((i*2)|((i*2+1)<<32))

    test_dcache(mem, dcache_sim, "")