1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11 """
12
13 import sys
14
15 from nmutil.gtkw import write_gtkw
16
17 sys.setrecursionlimit(1000000)
18
19 from enum import Enum, unique
20
21 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
22 from nmutil.util import Display
23
24 from copy import deepcopy
25 from random import randint, seed
26
27 from nmigen.cli import main
28 from nmutil.iocontrol import RecordObject
29 from nmigen.utils import log2_int
30 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
31 DCacheToLoadStore1Type,
32 MMUToDCacheType,
33 DCacheToMMUType)
34
35 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
36 WBAddrType, WBDataType, WBSelType,
37 WBMasterOut, WBSlaveOut,
38 WBMasterOutVector, WBSlaveOutVector,
39 WBIOMasterOut, WBIOSlaveOut)
40
41 from soc.experiment.cache_ram import CacheRam
42 #from soc.experiment.plru import PLRU
43 from nmutil.plru import PLRU
44
45 # for test
46 from soc.bus.sram import SRAM
47 from nmigen import Memory
48 from nmigen.cli import rtlil
49
50 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
51 # Also, check out the cxxsim nmigen branch, and latest yosys from git
52 from nmutil.sim_tmp_alternative import Simulator
53
54 from nmutil.util import wrap
55
56
57 # TODO: make these parameters of DCache at some point
58 LINE_SIZE = 64 # Line size in bytes
59 NUM_LINES = 16 # Number of lines in a set
60 NUM_WAYS = 4 # Number of ways
61 TLB_SET_SIZE = 64 # L1 DTLB number of sets
62 TLB_NUM_WAYS = 2 # L1 DTLB number of ways
63 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
64 LOG_LENGTH = 0 # Non-zero to enable log data collection
65
66 # BRAM organisation: We never access more than
67 # -- WB_DATA_BITS at a time so to save
68 # -- resources we make the array only that wide, and
69 # -- use consecutive indices to make a cache "line"
70 # --
71 # -- ROW_SIZE is the width in bytes of the BRAM
72 # -- (based on WB, so 64-bits)
73 ROW_SIZE = WB_DATA_BITS // 8;
74
75 # ROW_PER_LINE is the number of row (wishbone
76 # transactions) in a line
77 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
78
79 # BRAM_ROWS is the number of rows in BRAM needed
80 # to represent the full dcache
81 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
82
83 print ("ROW_SIZE", ROW_SIZE)
84 print ("ROW_PER_LINE", ROW_PER_LINE)
85 print ("BRAM_ROWS", BRAM_ROWS)
86 print ("NUM_WAYS", NUM_WAYS)
87
88 # Bit fields counts in the address
89
90 # REAL_ADDR_BITS is the number of real address
91 # bits that we store
92 REAL_ADDR_BITS = 56
93
94 # ROW_BITS is the number of bits to select a row
95 ROW_BITS = log2_int(BRAM_ROWS)
96
97 # ROW_LINE_BITS is the number of bits to select
98 # a row within a line
99 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
100
101 # LINE_OFF_BITS is the number of bits for
102 # the offset in a cache line
103 LINE_OFF_BITS = log2_int(LINE_SIZE)
104
105 # ROW_OFF_BITS is the number of bits for
106 # the offset in a row
107 ROW_OFF_BITS = log2_int(ROW_SIZE)
108
109 # INDEX_BITS is the number of bits to
110 # select a cache line
111 INDEX_BITS = log2_int(NUM_LINES)
112
113 # SET_SIZE_BITS is the log base 2 of the set size
114 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
115
116 # TAG_BITS is the number of bits of
117 # the tag part of the address
118 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
119
120 # TAG_WIDTH is the width in bits of each way of the tag RAM
121 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
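# e.g. with TAG_BITS = 46 (the default geometry): 46 + 7 = 53,
# 53 % 8 = 5, so TAG_WIDTH = 53 - 5 = 48, i.e. the tag rounded
# up to a whole number of bytes per way.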
122
123 # WAY_BITS is the number of bits to select a way
124 WAY_BITS = log2_int(NUM_WAYS)
125
126 # Example of layout for 32 lines of 64 bytes:
127 layout = """\
128 .. tag |index| line |
129 .. | row | |
130 .. | |---| | ROW_LINE_BITS (3)
131 .. | |--- - --| LINE_OFF_BITS (6)
132 .. | |- --| ROW_OFF_BITS (3)
133 .. |----- ---| | ROW_BITS (8)
134 .. |-----| | INDEX_BITS (5)
135 .. --------| | TAG_BITS (45)
136 """
137 print (layout)
138 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
139 (TAG_BITS, INDEX_BITS, ROW_BITS,
140 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
141 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
142 print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
143 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
144
145 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
146
147 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
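
# Worked example for the default constants above (a reading aid,
# derived purely from the arithmetic, not new parameters):
#   ROW_SIZE      = 8 bytes   ROW_PER_LINE  = 8     BRAM_ROWS     = 128
#   ROW_BITS      = 7         ROW_LINE_BITS = 3     ROW_OFF_BITS  = 3
#   LINE_OFF_BITS = 6         INDEX_BITS    = 4     SET_SIZE_BITS = 10
#   TAG_BITS      = 46        TAG_WIDTH     = 48    WAY_BITS      = 2
#   TAG_RAM_WIDTH = 192
# Each way is 1 KiB (16 lines x 64 bytes), so the set index sits
# entirely below the 4 KiB page offset; that is what the
# "SET_SIZE_BITS <= TLB_LG_PGSZ" assert further down relies on.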
148
149 def CacheTagArray():
150 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
151 for x in range(NUM_LINES))
152
153 def CacheValidBitsArray():
154 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
155 for x in range(NUM_LINES))
156
157 def RowPerLineValidArray():
158 return Array(Signal(name="rows_valid%d" % x) \
159 for x in range(ROW_PER_LINE))
160
161 # L1 TLB
162 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
163 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
164 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
165 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
166 TLB_PTE_BITS = 64
167 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
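
# With the defaults (TLB_SET_SIZE=64, TLB_NUM_WAYS=2, 4 KiB pages)
# these work out to: TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, TLB_TAG_WAY_BITS = 92 and
# TLB_PTE_WAY_BITS = 128 -- again just the arithmetic spelled out.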
168
169 def ispow2(x):
170 return (1<<log2_int(x, False)) == x
171
172 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
173 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
174 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
175 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
176 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
177 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
178 "geometry bits don't add up"
179 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
180 "geometry bits don't add up"
181 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
182 "geometry bits don't add up"
183 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
184 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
185
186
187 def TLBValidBitsArray():
188 return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
189 for x in range(TLB_SET_SIZE))
190
191 def TLBTagEAArray():
192 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
193 for x in range (TLB_NUM_WAYS))
194
195 def TLBTagsArray():
196 return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
197 for x in range (TLB_SET_SIZE))
198
199 def TLBPtesArray():
200 return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
201 for x in range(TLB_SET_SIZE))
202
203 def HitWaySet():
204 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
205 for x in range(TLB_NUM_WAYS))
206
207 # Cache RAM interface
208 def CacheRamOut():
209 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
210 for x in range(NUM_WAYS))
211
212 # PLRU output interface
213 def PLRUOut():
214 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
215 for x in range(NUM_LINES))
216
217 # TLB PLRU output interface
218 def TLBPLRUOut():
219 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
220 for x in range(TLB_SET_SIZE))
221
222 # Helper functions to decode incoming requests
223 #
224 # Return the cache line index (tag index) for an address
225 def get_index(addr):
226 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
227
228 # Return the cache row index (data memory) for an address
229 def get_row(addr):
230 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
231
232 # Return the index of a row within a line
233 def get_row_of_line(row):
234 return row[:ROW_BITS][:ROW_LINE_BITS]
235
236 # Returns whether this is the last row of a line
237 def is_last_row_addr(addr, last):
238 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
239
240 # Returns whether this is the last row of a line
241 def is_last_row(row, last):
242 return get_row_of_line(row) == last
243
244 # Return the next row in the current cache line. We use a
245 # dedicated function in order to limit the size of the
246 # generated adder to be only the bits within a cache line
247 # (3 bits with default settings)
248 def next_row(row):
249 row_v = row[0:ROW_LINE_BITS] + 1
250 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
251
252 # Get the tag value from the address
253 def get_tag(addr):
254 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
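
# Worked example of the address split, for the default geometry and
# address 0x530 (one of the addresses used by dcache_sim below):
#   get_index(0x530)     = bits [6:10]  = 4
#   get_row(0x530)       = bits [3:10]  = 38
#   get_row_of_line(38)  = lower 3 bits = 6
#   get_tag(0x530)       = bits [10:56] = 1
# next_row() only increments the lower ROW_LINE_BITS, so e.g.
# next_row(39) wraps back to 32, staying within the same cache line.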
255
256 # Read a tag from a tag memory row
257 def read_tag(way, tagset):
258 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
259
260 # Read a TLB tag from a TLB tag memory row
261 def read_tlb_tag(way, tags):
262 return tags.word_select(way, TLB_EA_TAG_BITS)
263
264 # Write a TLB tag to a TLB tag memory row
265 def write_tlb_tag(way, tags, tag):
266 return read_tlb_tag(way, tags).eq(tag)
267
268 # Read a PTE from a TLB PTE memory row
269 def read_tlb_pte(way, ptes):
270 return ptes.word_select(way, TLB_PTE_BITS)
271
272 def write_tlb_pte(way, ptes, newpte):
273 return read_tlb_pte(way, ptes).eq(newpte)
274
275
276 # Record for storing permission, attribute, etc. bits from a PTE
277 class PermAttr(RecordObject):
278 def __init__(self, name=None):
279 super().__init__(name=name)
280 self.reference = Signal()
281 self.changed = Signal()
282 self.nocache = Signal()
283 self.priv = Signal()
284 self.rd_perm = Signal()
285 self.wr_perm = Signal()
286
287
288 def extract_perm_attr(pte):
289 pa = PermAttr()
290 return pa;
291
292
293 # Type of operation on a "valid" input
294 @unique
295 class Op(Enum):
296 OP_NONE = 0
297 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
298 OP_STCX_FAIL = 2 # conditional store w/o reservation
299 OP_LOAD_HIT = 3 # Cache hit on load
300 OP_LOAD_MISS = 4 # Load missing cache
301 OP_LOAD_NC = 5 # Non-cachable load
302 OP_STORE_HIT = 6 # Store hitting cache
303 OP_STORE_MISS = 7 # Store missing cache
304
305
306 # Cache state machine
307 @unique
308 class State(Enum):
309 IDLE = 0 # Normal load hit processing
310 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
311 STORE_WAIT_ACK = 2 # Store wait ack
312 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
313
314
315 # Dcache operations:
316 #
317 # In order to make timing, we use the BRAMs with
318 # an output buffer, which means that the BRAM
319 # output is delayed by an extra cycle.
320 #
321 # Thus, the dcache has a 2-stage internal pipeline
322 # for cache hits with no stalls.
323 #
324 # All other operations are handled via stalling
325 # in the first stage.
326 #
327 # The second stage can thus complete a hit at the same
328 # time as the first stage emits a stall for a complex op.
329 #
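# Informal load-hit timing sketch (a reading of the code below, not a
# cycle-accurate specification):
#   cycle 0: loadstore1 presents d_in; the data BRAM read is already
#            issued at early_req_row, and stage_0 latches the request
#            into r0 at the clock edge
#   cycle 1: TLB and cache tags are looked up for r0; hit detection
#            latches hit_way / hit_load_valid into r1
#   cycle 2: the output-buffered BRAM data arrives and
#            writeback_control drives d_out.valid / d_out.data
#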
330 # Stage 0 register, basically contains just the latched request
331
332 class RegStage0(RecordObject):
333 def __init__(self, name=None):
334 super().__init__(name=name)
335 self.req = LoadStore1ToDCacheType(name="lsmem")
336 self.tlbie = Signal() # indicates a tlbie request (from MMU)
337 self.doall = Signal() # with tlbie, indicates flush whole TLB
338 self.tlbld = Signal() # indicates a TLB load request (from MMU)
339 self.mmu_req = Signal() # indicates source of request
340 self.d_valid = Signal() # indicates req.data is valid now
341
342
343 class MemAccessRequest(RecordObject):
344 def __init__(self, name=None):
345 super().__init__(name=name)
346 self.op = Signal(Op)
347 self.valid = Signal()
348 self.dcbz = Signal()
349 self.real_addr = Signal(REAL_ADDR_BITS)
350 self.data = Signal(64)
351 self.byte_sel = Signal(8)
352 self.hit_way = Signal(WAY_BITS)
353 self.same_tag = Signal()
354 self.mmu_req = Signal()
355
356
357 # First stage register, contains state for stage 1 of load hits
358 # and for the state machine used by all other operations
359 class RegStage1(RecordObject):
360 def __init__(self, name=None):
361 super().__init__(name=name)
362 # Info about the request
363 self.full = Signal() # have uncompleted request
364 self.mmu_req = Signal() # request is from MMU
365 self.req = MemAccessRequest(name="reqmem")
366
367 # Cache hit state
368 self.hit_way = Signal(WAY_BITS)
369 self.hit_load_valid = Signal()
370 self.hit_index = Signal(INDEX_BITS)
371 self.cache_hit = Signal()
372
373 # TLB hit state
374 self.tlb_hit = Signal()
375 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
376 self.tlb_hit_index = Signal(TLB_WAY_BITS)
377
378 # 2-stage data buffer for data forwarded from writes to reads
379 self.forward_data1 = Signal(64)
380 self.forward_data2 = Signal(64)
381 self.forward_sel1 = Signal(8)
382 self.forward_valid1 = Signal()
383 self.forward_way1 = Signal(WAY_BITS)
384 self.forward_row1 = Signal(ROW_BITS)
385 self.use_forward1 = Signal()
386 self.forward_sel = Signal(8)
387
388 # Cache miss state (reload state machine)
389 self.state = Signal(State)
390 self.dcbz = Signal()
391 self.write_bram = Signal()
392 self.write_tag = Signal()
393 self.slow_valid = Signal()
394 self.wb = WBMasterOut("wb")
395 self.reload_tag = Signal(TAG_BITS)
396 self.store_way = Signal(WAY_BITS)
397 self.store_row = Signal(ROW_BITS)
398 self.store_index = Signal(INDEX_BITS)
399 self.end_row_ix = Signal(ROW_LINE_BITS)
400 self.rows_valid = RowPerLineValidArray()
401 self.acks_pending = Signal(3)
402 self.inc_acks = Signal()
403 self.dec_acks = Signal()
404
405 # Signals to complete (possibly with error)
406 self.ls_valid = Signal()
407 self.ls_error = Signal()
408 self.mmu_done = Signal()
409 self.mmu_error = Signal()
410 self.cache_paradox = Signal()
411
412 # Signal to complete a failed stcx.
413 self.stcx_fail = Signal()
414
415
416 # Reservation information
417 class Reservation(RecordObject):
418 def __init__(self):
419 super().__init__()
420 self.valid = Signal()
421 self.addr = Signal(64-LINE_OFF_BITS)
422
423
424 class DTLBUpdate(Elaboratable):
425 def __init__(self):
426 self.tlbie = Signal()
427 self.tlbwe = Signal()
428 self.doall = Signal()
429 self.updated = Signal()
430 self.v_updated = Signal()
431 self.tlb_hit = Signal()
432 self.tlb_req_index = Signal(TLB_SET_BITS)
433
434 self.tlb_hit_way = Signal(TLB_WAY_BITS)
435 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
436 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
437 self.repl_way = Signal(TLB_WAY_BITS)
438 self.eatag = Signal(TLB_EA_TAG_BITS)
439 self.pte_data = Signal(TLB_PTE_BITS)
440
441 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
442
443 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
444 self.pb_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
445 self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
446
447 def elaborate(self, platform):
448 m = Module()
449 comb = m.d.comb
450 sync = m.d.sync
451
452 tagset = Signal(TLB_TAG_WAY_BITS)
453 pteset = Signal(TLB_PTE_WAY_BITS)
454
455 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
456 comb += db_out.eq(self.dv)
457
458 with m.If(self.tlbie & self.doall):
459 pass # clear all back in parent
460 with m.Elif(self.tlbie):
461 with m.If(self.tlb_hit):
462 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
463 comb += self.v_updated.eq(1)
464
465 with m.Elif(self.tlbwe):
466
467 comb += tagset.eq(self.tlb_tag_way)
468 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
469 comb += tb_out.eq(tagset)
470
471 comb += pteset.eq(self.tlb_pte_way)
472 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
473 comb += pb_out.eq(pteset)
474
475 comb += db_out.bit_select(self.repl_way, 1).eq(1)
476
477 comb += self.updated.eq(1)
478 comb += self.v_updated.eq(1)
479
480 return m
481
482
483 class DCachePendingHit(Elaboratable):
484
485 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
486 cache_valid_idx, cache_tag_set,
487 req_addr,
488 hit_set):
489
490 self.go = Signal()
491 self.virt_mode = Signal()
492 self.is_hit = Signal()
493 self.tlb_hit = Signal()
494 self.hit_way = Signal(WAY_BITS)
495 self.rel_match = Signal()
496 self.req_index = Signal(INDEX_BITS)
497 self.reload_tag = Signal(TAG_BITS)
498
499 self.tlb_hit_way = tlb_hit_way
500 self.tlb_pte_way = tlb_pte_way
501 self.tlb_valid_way = tlb_valid_way
502 self.cache_valid_idx = cache_valid_idx
503 self.cache_tag_set = cache_tag_set
504 self.req_addr = req_addr
505 self.hit_set = hit_set
506
507 def elaborate(self, platform):
508 m = Module()
509 comb = m.d.comb
510 sync = m.d.sync
511
512 go = self.go
513 virt_mode = self.virt_mode
514 is_hit = self.is_hit
515 tlb_pte_way = self.tlb_pte_way
516 tlb_valid_way = self.tlb_valid_way
517 cache_valid_idx = self.cache_valid_idx
518 cache_tag_set = self.cache_tag_set
519 req_addr = self.req_addr
520 tlb_hit_way = self.tlb_hit_way
521 tlb_hit = self.tlb_hit
522 hit_set = self.hit_set
523 hit_way = self.hit_way
524 rel_match = self.rel_match
525 req_index = self.req_index
526 reload_tag = self.reload_tag
527
528 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
529 for i in range(TLB_NUM_WAYS))
530 hit_way_set = HitWaySet()
531
532 # Test if pending request is a hit on any way
533 # In order to make timing in virtual mode,
534 # when we are using the TLB, we compare each
535 # way with each of the real addresses from each way of
536 # the TLB, and then decide later which match to use.
537
538 with m.If(virt_mode):
539 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
540 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
541 s_hit = Signal()
542 s_pte = Signal(TLB_PTE_BITS)
543 s_ra = Signal(REAL_ADDR_BITS)
544 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
545 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
546 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
547 comb += s_tag.eq(get_tag(s_ra))
548
549 for i in range(NUM_WAYS): # way_t
550 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
551 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
552 (read_tag(i, cache_tag_set) == s_tag)
553 & tlb_valid_way[j])
554 with m.If(is_tag_hit):
555 comb += hit_way_set[j].eq(i)
556 comb += s_hit.eq(1)
557 comb += hit_set[j].eq(s_hit)
558 with m.If(s_tag == reload_tag):
559 comb += rel_matches[j].eq(1)
560 with m.If(tlb_hit):
561 comb += is_hit.eq(hit_set[tlb_hit_way])
562 comb += hit_way.eq(hit_way_set[tlb_hit_way])
563 comb += rel_match.eq(rel_matches[tlb_hit_way])
564 with m.Else():
565 s_tag = Signal(TAG_BITS)
566 comb += s_tag.eq(get_tag(req_addr))
567 for i in range(NUM_WAYS): # way_t
568 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
569 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
570 (read_tag(i, cache_tag_set) == s_tag))
571 with m.If(is_tag_hit):
572 comb += hit_way.eq(i)
573 comb += is_hit.eq(1)
574 with m.If(s_tag == reload_tag):
575 comb += rel_match.eq(1)
576
577 return m
578
579
580 class DCache(Elaboratable):
581 """Set associative dcache write-through
582 TODO (in no specific order):
583 * See list in icache.vhdl
584 * Complete load misses on the cycle when WB data comes instead of
585 at the end of line (this requires dealing with requests coming in
586 while not idle...)
587 """
588 def __init__(self):
589 self.d_in = LoadStore1ToDCacheType("d_in")
590 self.d_out = DCacheToLoadStore1Type("d_out")
591
592 self.m_in = MMUToDCacheType("m_in")
593 self.m_out = DCacheToMMUType("m_out")
594
595 self.stall_out = Signal()
596
597 self.wb_out = WBMasterOut("wb_out")
598 self.wb_in = WBSlaveOut("wb_in")
599
600 self.log_out = Signal(20)
601
602 def stage_0(self, m, r0, r1, r0_full):
603 """Latch the request in r0.req as long as we're not stalling
604 """
605 comb = m.d.comb
606 sync = m.d.sync
607 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
608
609 r = RegStage0("stage0")
610
611 # TODO, this goes in unit tests and formal proofs
612 with m.If(d_in.valid & m_in.valid):
613 sync += Display("request collision loadstore vs MMU")
614
615 with m.If(m_in.valid):
616 comb += r.req.valid.eq(1)
617 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
618 comb += r.req.dcbz.eq(0)
619 comb += r.req.nc.eq(0)
620 comb += r.req.reserve.eq(0)
621 comb += r.req.virt_mode.eq(0)
622 comb += r.req.priv_mode.eq(1)
623 comb += r.req.addr.eq(m_in.addr)
624 comb += r.req.data.eq(m_in.pte)
625 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
626 comb += r.tlbie.eq(m_in.tlbie)
627 comb += r.doall.eq(m_in.doall)
628 comb += r.tlbld.eq(m_in.tlbld)
629 comb += r.mmu_req.eq(1)
630 with m.Else():
631 comb += r.req.eq(d_in)
632 comb += r.req.data.eq(0)
633 comb += r.tlbie.eq(0)
634 comb += r.doall.eq(0)
635 comb += r.tlbld.eq(0)
636 comb += r.mmu_req.eq(0)
637 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
638 sync += r0.eq(r)
639 sync += r0_full.eq(r.req.valid)
640 # Sample data the cycle after a request comes in from loadstore1.
641 # If another request has come in already then the data will get
642 # put directly into req.data below.
643 with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
644 ~r0.mmu_req):
645 sync += r0.req.data.eq(d_in.data)
646 sync += r0.d_valid.eq(1)
647
648 def tlb_read(self, m, r0_stall, tlb_valid_way,
649 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
650 dtlb_tags, dtlb_ptes):
651 """TLB
652 Operates in the second cycle on the request latched in r0.req.
653 TLB updates write the entry at the end of the second cycle.
654 """
655 comb = m.d.comb
656 sync = m.d.sync
657 m_in, d_in = self.m_in, self.d_in
658
659 index = Signal(TLB_SET_BITS)
660 addrbits = Signal(TLB_SET_BITS)
661
662 amin = TLB_LG_PGSZ
663 amax = TLB_LG_PGSZ + TLB_SET_BITS
664
665 with m.If(m_in.valid):
666 comb += addrbits.eq(m_in.addr[amin : amax])
667 with m.Else():
668 comb += addrbits.eq(d_in.addr[amin : amax])
669 comb += index.eq(addrbits)
670
671 # If we have any op and the previous op isn't finished,
672 # then keep the same output for next cycle.
673 with m.If(~r0_stall):
674 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
675 sync += tlb_tag_way.eq(dtlb_tags[index])
676 sync += tlb_pte_way.eq(dtlb_ptes[index])
677
678 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
679 """Generate TLB PLRUs
680 """
681 comb = m.d.comb
682 sync = m.d.sync
683
684 if TLB_NUM_WAYS == 0:
685 return
686 for i in range(TLB_SET_SIZE):
687 # TLB PLRU interface
688 tlb_plru = PLRU(TLB_WAY_BITS)
689 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
690 tlb_plru_acc_en = Signal()
691
692 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
693 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
694 comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
695 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
696
697 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
698 tlb_valid_way, tlb_tag_way, tlb_hit_way,
699 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
700
701 comb = m.d.comb
702
703 hitway = Signal(TLB_WAY_BITS)
704 hit = Signal()
705 eatag = Signal(TLB_EA_TAG_BITS)
706
707 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
708 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
709 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
710
711 for i in range(TLB_NUM_WAYS):
712 is_tag_hit = Signal()
713 comb += is_tag_hit.eq(tlb_valid_way[i]
714 & (read_tlb_tag(i, tlb_tag_way) == eatag))
715 with m.If(is_tag_hit):
716 comb += hitway.eq(i)
717 comb += hit.eq(1)
718
719 comb += tlb_hit.eq(hit & r0_valid)
720 comb += tlb_hit_way.eq(hitway)
721
722 with m.If(tlb_hit):
723 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
724 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
725
726 with m.If(r0.req.virt_mode):
727 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
728 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
729 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
730 comb += perm_attr.reference.eq(pte[8])
731 comb += perm_attr.changed.eq(pte[7])
732 comb += perm_attr.nocache.eq(pte[5])
733 comb += perm_attr.priv.eq(pte[3])
734 comb += perm_attr.rd_perm.eq(pte[2])
735 comb += perm_attr.wr_perm.eq(pte[1])
736 with m.Else():
737 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
738 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
739 comb += perm_attr.reference.eq(1)
740 comb += perm_attr.changed.eq(1)
741 comb += perm_attr.nocache.eq(0)
742 comb += perm_attr.priv.eq(1)
743 comb += perm_attr.rd_perm.eq(1)
744 comb += perm_attr.wr_perm.eq(1)
745
746 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
747 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
748 dtlb_tags, tlb_pte_way, dtlb_ptes):
749
750 dtlb_valids = TLBValidBitsArray()
751
752 comb = m.d.comb
753 sync = m.d.sync
754
755 tlbie = Signal()
756 tlbwe = Signal()
757
758 comb += tlbie.eq(r0_valid & r0.tlbie)
759 comb += tlbwe.eq(r0_valid & r0.tlbld)
760
761 m.submodules.tlb_update = d = DTLBUpdate()
762 with m.If(tlbie & r0.doall):
763 # clear all valid bits at once
764 for i in range(TLB_SET_SIZE):
765 sync += dtlb_valid_bits[i].eq(0)
766 with m.If(d.updated):
767 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
768 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
769 with m.If(d.v_updated):
770 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
771
772 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
773
774 comb += d.tlbie.eq(tlbie)
775 comb += d.tlbwe.eq(tlbwe)
776 comb += d.doall.eq(r0.doall)
777 comb += d.tlb_hit.eq(tlb_hit)
778 comb += d.tlb_hit_way.eq(tlb_hit_way)
779 comb += d.tlb_tag_way.eq(tlb_tag_way)
780 comb += d.tlb_pte_way.eq(tlb_pte_way)
781 comb += d.tlb_req_index.eq(tlb_req_index)
782
783 with m.If(tlb_hit):
784 comb += d.repl_way.eq(tlb_hit_way)
785 with m.Else():
786 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
787 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
788 comb += d.pte_data.eq(r0.req.data)
789
790 def maybe_plrus(self, m, r1, plru_victim):
791 """Generate PLRUs
792 """
793 comb = m.d.comb
794 sync = m.d.sync
795
796 if NUM_WAYS == 0:
797 return
798
799 for i in range(NUM_LINES):
800 # PLRU interface
801 plru = PLRU(WAY_BITS)
802 setattr(m.submodules, "plru%d" % i, plru)
803 plru_acc_en = Signal()
804
805 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
806 comb += plru.acc_en.eq(plru_acc_en)
807 comb += plru.acc_i.eq(r1.hit_way)
808 comb += plru_victim[i].eq(plru.lru_o)
809
810 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
811 """Cache tag RAM read port
812 """
813 comb = m.d.comb
814 sync = m.d.sync
815 m_in, d_in = self.m_in, self.d_in
816
817 index = Signal(INDEX_BITS)
818
819 with m.If(r0_stall):
820 comb += index.eq(req_index)
821 with m.Elif(m_in.valid):
822 comb += index.eq(get_index(m_in.addr))
823 with m.Else():
824 comb += index.eq(get_index(d_in.addr))
825 sync += cache_tag_set.eq(cache_tags[index])
826
827 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
828 r0_valid, r1, cache_valids, replace_way,
829 use_forward1_next, use_forward2_next,
830 req_hit_way, plru_victim, rc_ok, perm_attr,
831 valid_ra, perm_ok, access_ok, req_op, req_go,
832 tlb_pte_way,
833 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
834 cancel_store, req_same_tag, r0_stall, early_req_row):
835 """Cache request parsing and hit detection
836 """
837
838 comb = m.d.comb
839 m_in, d_in = self.m_in, self.d_in
840
841 is_hit = Signal()
842 hit_way = Signal(WAY_BITS)
843 op = Signal(Op)
844 opsel = Signal(3)
845 go = Signal()
846 nc = Signal()
847 hit_set = Array(Signal(name="hit_set_%d" % i) \
848 for i in range(TLB_NUM_WAYS))
849 cache_valid_idx = Signal(NUM_WAYS)
850
851 # Extract line, row and tag from request
852 comb += req_index.eq(get_index(r0.req.addr))
853 comb += req_row.eq(get_row(r0.req.addr))
854 comb += req_tag.eq(get_tag(ra))
855
856 if False: # display on comb is a bit... busy.
857 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
858 r0.req.addr, ra, req_index, req_tag, req_row)
859
860 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
861 comb += cache_valid_idx.eq(cache_valids[req_index])
862
863 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
864 tlb_valid_way, tlb_hit_way,
865 cache_valid_idx, cache_tag_set,
866 r0.req.addr,
867 hit_set)
868
869 comb += dc.tlb_hit.eq(tlb_hit)
870 comb += dc.reload_tag.eq(r1.reload_tag)
871 comb += dc.virt_mode.eq(r0.req.virt_mode)
872 comb += dc.go.eq(go)
873 comb += dc.req_index.eq(req_index)
874 comb += is_hit.eq(dc.is_hit)
875 comb += hit_way.eq(dc.hit_way)
876 comb += req_same_tag.eq(dc.rel_match)
877
878 # See if the request matches the line currently being reloaded
879 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
880 (req_index == r1.store_index) & req_same_tag):
881 # For a store, consider this a hit even if the row isn't
882 # valid since it will be by the time we perform the store.
883 # For a load, check the appropriate row valid bit.
884 rrow = Signal(ROW_LINE_BITS)
885 comb += rrow.eq(req_row)
886 valid = r1.rows_valid[rrow]
887 comb += is_hit.eq((~r0.req.load) | valid)
888 comb += hit_way.eq(replace_way)
889
890 # Whether to use forwarded data for a load or not
891 with m.If((get_row(r1.req.real_addr) == req_row) &
892 (r1.req.hit_way == hit_way)):
893 # Only need to consider r1.write_bram here, since if we
894 # are writing refill data here, then we don't have a
895 # cache hit this cycle on the line being refilled.
896 # (There is the possibility that the load following the
897 # load miss that started the refill could be to the old
898 # contents of the victim line, since it is a couple of
899 # cycles after the refill starts before we see the updated
900 # cache tag. In that case we don't use the bypass.)
901 comb += use_forward1_next.eq(r1.write_bram)
902 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
903 comb += use_forward2_next.eq(r1.forward_valid1)
904
905 # The way that matched on a hit
906 comb += req_hit_way.eq(hit_way)
907
908 # The way to replace on a miss
909 with m.If(r1.write_tag):
910 comb += replace_way.eq(plru_victim[r1.store_index])
911 with m.Else():
912 comb += replace_way.eq(r1.store_way)
913
914 # work out whether we have permission for this access
915 # NB we don't yet implement AMR, thus no KUAP
916 comb += rc_ok.eq(perm_attr.reference
917 & (r0.req.load | perm_attr.changed))
918 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
919 (perm_attr.wr_perm |
920 (r0.req.load & perm_attr.rd_perm)))
921 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
922 # Combine the request and cache hit status to decide what
923 # operation needs to be done
924 comb += nc.eq(r0.req.nc | perm_attr.nocache)
925 comb += op.eq(Op.OP_NONE)
926 with m.If(go):
927 with m.If(~access_ok):
928 comb += op.eq(Op.OP_BAD)
929 with m.Elif(cancel_store):
930 comb += op.eq(Op.OP_STCX_FAIL)
931 with m.Else():
932 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
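# opsel decode table (bit 0 = is_hit, bit 1 = nc, bit 2 = load, per
# the Cat() above) -- a reading aid for the Switch below:
#   0b101 cacheable load,  hit   -> OP_LOAD_HIT
#   0b100 cacheable load,  miss  -> OP_LOAD_MISS
#   0b110 non-cacheable load     -> OP_LOAD_NC
#   0b001 cacheable store, hit   -> OP_STORE_HIT
#   0b000 cacheable store, miss  -> OP_STORE_MISS
#   0b010 non-cacheable store    -> OP_STORE_MISS (straight to wishbone)
#   0b011 / 0b111 NC access that hits the cache -> OP_BAD (paradox)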
933 with m.Switch(opsel):
934 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
935 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
936 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
937 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
938 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
939 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
940 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
941 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
942 comb += req_op.eq(op)
943 comb += req_go.eq(go)
944
945 # Version of the row number that is valid one cycle earlier
946 # in the cases where we need to read the cache data BRAM.
947 # If we're stalling then we need to keep reading the last
948 # row requested.
949 with m.If(~r0_stall):
950 with m.If(m_in.valid):
951 comb += early_req_row.eq(get_row(m_in.addr))
952 with m.Else():
953 comb += early_req_row.eq(get_row(d_in.addr))
954 with m.Else():
955 comb += early_req_row.eq(req_row)
956
957 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
958 r0_valid, r0, reservation):
959 """Handle load-with-reservation and store-conditional instructions
960 """
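# In Power ISA terms (summarised): a load-and-reserve (larx) sets
# reservation.valid and records the cache-line address; a later
# store-conditional (stcx.) to the same line is allowed to proceed,
# while a stcx. with no valid reservation, or to a different line,
# asserts cancel_store and completes as OP_STCX_FAIL.  Only a single
# line-granular reservation is tracked here.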
961 comb = m.d.comb
962
963 with m.If(r0_valid & r0.req.reserve):
964 # XXX generate alignment interrupt if address
965 # is not aligned XXX or if r0.req.nc = '1'
966 with m.If(r0.req.load):
967 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
968 with m.Else():
969 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
970 with m.If((~reservation.valid) |
971 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
972 comb += cancel_store.eq(1)
973
974 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
975 reservation, r0):
976
977 comb = m.d.comb
978 sync = m.d.sync
979
980 with m.If(r0_valid & access_ok):
981 with m.If(clear_rsrv):
982 sync += reservation.valid.eq(0)
983 with m.Elif(set_rsrv):
984 sync += reservation.valid.eq(1)
985 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
986
987 def writeback_control(self, m, r1, cache_out_row):
988 """Return data for loads & completion control logic
989 """
990 comb = m.d.comb
991 sync = m.d.sync
992 d_out, m_out = self.d_out, self.m_out
993
994 data_out = Signal(64)
995 data_fwd = Signal(64)
996
997 # Use the bypass if are reading the row that was
998 # written 1 or 2 cycles ago, including for the
999 # slow_valid = 1 case (i.e. completing a load
1000 # miss or a non-cacheable load).
1001 with m.If(r1.use_forward1):
1002 comb += data_fwd.eq(r1.forward_data1)
1003 with m.Else():
1004 comb += data_fwd.eq(r1.forward_data2)
1005
1006 comb += data_out.eq(cache_out_row)
1007
1008 for i in range(8):
1009 with m.If(r1.forward_sel[i]):
1010 dsel = data_fwd.word_select(i, 8)
1011 comb += data_out.word_select(i, 8).eq(dsel)
1012
1013 comb += d_out.valid.eq(r1.ls_valid)
1014 comb += d_out.data.eq(data_out)
1015 comb += d_out.store_done.eq(~r1.stcx_fail)
1016 comb += d_out.error.eq(r1.ls_error)
1017 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1018
1019 # Outputs to MMU
1020 comb += m_out.done.eq(r1.mmu_done)
1021 comb += m_out.err.eq(r1.mmu_error)
1022 comb += m_out.data.eq(data_out)
1023
1024 # We have a valid load or store hit or we just completed
1025 # a slow op such as a load miss, a NC load or a store
1026 #
1027 # Note: the load hit is delayed by one cycle. However it
1028 # can still not collide with r.slow_valid (well unless I
1029 # miscalculated) because slow_valid can only be set on a
1030 # subsequent request and not on its first cycle (the state
1031 # machine must have advanced), which makes slow_valid
1032 # at least 2 cycles from the previous hit_load_valid.
1033
1034 # Sanity: Only one of these must be set in any given cycle
1035
1036 if False: # TODO: need Display to get this to work
1037 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1038 "unexpected slow_valid collision with stcx_fail"
1039
1040 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1041 "unexpected hit_load_delayed collision with slow_valid"
1042
1043 with m.If(~r1.mmu_req):
1044 # Request came from loadstore1...
1045 # Load hit case is the standard path
1046 with m.If(r1.hit_load_valid):
1047 sync += Display("completing load hit data=%x", data_out)
1048
1049 # error cases complete without stalling
1050 with m.If(r1.ls_error):
1051 sync += Display("completing ld/st with error")
1052
1053 # Slow ops (load miss, NC, stores)
1054 with m.If(r1.slow_valid):
1055 sync += Display("completing store or load miss adr=%x data=%x",
1056 r1.req.real_addr, data_out)
1057
1058 with m.Else():
1059 # Request came from MMU
1060 with m.If(r1.hit_load_valid):
1061 sync += Display("completing load hit to MMU, data=%x",
1062 m_out.data)
1063 # error cases complete without stalling
1064 with m.If(r1.mmu_error):
1065 sync += Display("completing MMU ld with error")
1066
1067 # Slow ops (i.e. load miss)
1068 with m.If(r1.slow_valid):
1069 sync += Display("completing MMU load miss, data=%x",
1070 m_out.data)
1071
1072 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1073 """rams
1074 Generate a cache RAM for each way. This handles the normal
1075 reads, writes from reloads and the special store-hit update
1076 path as well.
1077
1078 Note: the BRAMs have an extra read buffer, meaning the output
1079 is pipelined an extra cycle. This differs from the
1080 icache. The writeback logic needs to take that into
1081 account by using 1-cycle delayed signals for load hits.
1082 """
1083 comb = m.d.comb
1084 wb_in = self.wb_in
1085
1086 for i in range(NUM_WAYS):
1087 do_read = Signal(name="do_rd%d" % i)
1088 rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
1089 do_write = Signal(name="do_wr%d" % i)
1090 wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
1091 wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
1092 wr_sel = Signal(ROW_SIZE)
1093 wr_sel_m = Signal(ROW_SIZE)
1094 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1095
1096 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1097 setattr(m.submodules, "cacheram_%d" % i, way)
1098
1099 comb += way.rd_en.eq(do_read)
1100 comb += way.rd_addr.eq(rd_addr)
1101 comb += _d_out.eq(way.rd_data_o)
1102 comb += way.wr_sel.eq(wr_sel_m)
1103 comb += way.wr_addr.eq(wr_addr)
1104 comb += way.wr_data.eq(wr_data)
1105
1106 # Cache hit reads
1107 comb += do_read.eq(1)
1108 comb += rd_addr.eq(early_req_row)
1109 with m.If(r1.hit_way == i):
1110 comb += cache_out_row.eq(_d_out)
1111
1112 # Write mux:
1113 #
1114 # Defaults to wishbone read responses (cache refill)
1115 #
1116 # For timing, the mux on wr_data/sel/addr is not
1117 # dependent on anything other than the current state.
1118
1119 with m.If(r1.write_bram):
1120 # Write store data to BRAM. This happens one
1121 # cycle after the store is in r0.
1122 comb += wr_data.eq(r1.req.data)
1123 comb += wr_sel.eq(r1.req.byte_sel)
1124 comb += wr_addr.eq(get_row(r1.req.real_addr))
1125
1126 with m.If(i == r1.req.hit_way):
1127 comb += do_write.eq(1)
1128 with m.Else():
1129 # Otherwise, we might be doing a reload or a DCBZ
1130 with m.If(r1.dcbz):
1131 comb += wr_data.eq(0)
1132 with m.Else():
1133 comb += wr_data.eq(wb_in.dat)
1134 comb += wr_addr.eq(r1.store_row)
1135 comb += wr_sel.eq(~0) # all 1s
1136
1137 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1138 & wb_in.ack & (replace_way == i)):
1139 comb += do_write.eq(1)
1140
1141 # Mask write selects with do_write since BRAM
1142 # doesn't have a global write-enable
1143 with m.If(do_write):
1144 comb += wr_sel_m.eq(wr_sel)
1145
1146 # Cache hit synchronous machine for the easy case.
1147 # This handles load hits.
1148 # It also handles error cases (TLB miss, cache paradox)
1149 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1150 req_hit_way, req_index, req_tag, access_ok,
1151 tlb_hit, tlb_hit_way, tlb_req_index):
1152
1153 comb = m.d.comb
1154 sync = m.d.sync
1155
1156 with m.If(req_op != Op.OP_NONE):
1157 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1158 req_op, r0.req.addr, r0.req.nc,
1159 req_index, req_tag, req_hit_way)
1160
1161 with m.If(r0_valid):
1162 sync += r1.mmu_req.eq(r0.mmu_req)
1163
1164 # Fast path for load/store hits.
1165 # Set signals for the writeback controls.
1166 sync += r1.hit_way.eq(req_hit_way)
1167 sync += r1.hit_index.eq(req_index)
1168
1169 with m.If(req_op == Op.OP_LOAD_HIT):
1170 sync += r1.hit_load_valid.eq(1)
1171 with m.Else():
1172 sync += r1.hit_load_valid.eq(0)
1173
1174 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1175 sync += r1.cache_hit.eq(1)
1176 with m.Else():
1177 sync += r1.cache_hit.eq(0)
1178
1179 with m.If(req_op == Op.OP_BAD):
1180 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1181 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1182 sync += r1.ls_error.eq(~r0.mmu_req)
1183 sync += r1.mmu_error.eq(r0.mmu_req)
1184 sync += r1.cache_paradox.eq(access_ok)
1185
1186 with m.Else():
1187 sync += r1.ls_error.eq(0)
1188 sync += r1.mmu_error.eq(0)
1189 sync += r1.cache_paradox.eq(0)
1190
1191 with m.If(req_op == Op.OP_STCX_FAIL):
1192 sync += r1.stcx_fail.eq(1)
1193 with m.Else():
1194 sync += r1.stcx_fail.eq(0)
1195
1196 # Record TLB hit information for updating TLB PLRU
1197 sync += r1.tlb_hit.eq(tlb_hit)
1198 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1199 sync += r1.tlb_hit_index.eq(tlb_req_index)
1200
1201 # Memory accesses are handled by this state machine:
1202 #
1203 # * Cache load miss/reload (in conjunction with "rams")
1204 # * Load hits for non-cachable forms
1205 # * Stores (the collision case is handled in "rams")
1206 #
1207 # All wishbone requests generation is done here.
1208 # This machine operates at stage 1.
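# State transition summary (as implemented in the Switch(r1.state)
# below -- a reading aid, not additional behaviour):
#   IDLE             -> RELOAD_WAIT_ACK  on OP_LOAD_MISS, or on a dcbz
#                                        store (handled as a line reload
#                                        that writes zeroes)
#   IDLE             -> NC_LOAD_WAIT_ACK on OP_LOAD_NC
#   IDLE             -> STORE_WAIT_ACK   on non-dcbz OP_STORE_HIT/MISS
#   RELOAD_WAIT_ACK  -> IDLE             when the last row of the line
#                                        has been acked
#   STORE_WAIT_ACK   -> IDLE             when the final ack arrives and
#                                        no further store is queued
#   NC_LOAD_WAIT_ACK -> IDLE             on the single ack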
1209 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1210 cache_valids, r0, replace_way,
1211 req_hit_way, req_same_tag,
1212 r0_valid, req_op, cache_tags, req_go, ra):
1213
1214 comb = m.d.comb
1215 sync = m.d.sync
1216 wb_in = self.wb_in
1217 d_in = self.d_in
1218
1219 req = MemAccessRequest("mreq_ds")
1220
1221 req_row = Signal(ROW_BITS)
1222 req_idx = Signal(INDEX_BITS)
1223 req_tag = Signal(TAG_BITS)
1224 comb += req_idx.eq(get_index(req.real_addr))
1225 comb += req_row.eq(get_row(req.real_addr))
1226 comb += req_tag.eq(get_tag(req.real_addr))
1227
1228 sync += r1.use_forward1.eq(use_forward1_next)
1229 sync += r1.forward_sel.eq(0)
1230
1231 with m.If(use_forward1_next):
1232 sync += r1.forward_sel.eq(r1.req.byte_sel)
1233 with m.Elif(use_forward2_next):
1234 sync += r1.forward_sel.eq(r1.forward_sel1)
1235
1236 sync += r1.forward_data2.eq(r1.forward_data1)
1237 with m.If(r1.write_bram):
1238 sync += r1.forward_data1.eq(r1.req.data)
1239 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1240 sync += r1.forward_way1.eq(r1.req.hit_way)
1241 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1242 sync += r1.forward_valid1.eq(1)
1243 with m.Else():
1244 with m.If(r1.dcbz):
1245 sync += r1.forward_data1.eq(0)
1246 with m.Else():
1247 sync += r1.forward_data1.eq(wb_in.dat)
1248 sync += r1.forward_sel1.eq(~0) # all 1s
1249 sync += r1.forward_way1.eq(replace_way)
1250 sync += r1.forward_row1.eq(r1.store_row)
1251 sync += r1.forward_valid1.eq(0)
1252
1253 # One cycle pulses reset
1254 sync += r1.slow_valid.eq(0)
1255 sync += r1.write_bram.eq(0)
1256 sync += r1.inc_acks.eq(0)
1257 sync += r1.dec_acks.eq(0)
1258
1259 sync += r1.ls_valid.eq(0)
1260 # complete tlbies and TLB loads in the third cycle
1261 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1262
1263 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1264 with m.If(~r0.mmu_req):
1265 sync += r1.ls_valid.eq(1)
1266 with m.Else():
1267 sync += r1.mmu_done.eq(1)
1268
1269 with m.If(r1.write_tag):
1270 # Store new tag in selected way
1271 for i in range(NUM_WAYS):
1272 with m.If(i == replace_way):
1273 ct = Signal(TAG_RAM_WIDTH)
1274 comb += ct.eq(cache_tags[r1.store_index])
1275 """
1276 TODO: check this
1277 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1278 (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1279 """
1280 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1281 sync += cache_tags[r1.store_index].eq(ct)
1282 sync += r1.store_way.eq(replace_way)
1283 sync += r1.write_tag.eq(0)
1284
1285 # Take request from r1.req if there is one there,
1286 # else from req_op, ra, etc.
1287 with m.If(r1.full):
1288 comb += req.eq(r1.req)
1289 with m.Else():
1290 comb += req.op.eq(req_op)
1291 comb += req.valid.eq(req_go)
1292 comb += req.mmu_req.eq(r0.mmu_req)
1293 comb += req.dcbz.eq(r0.req.dcbz)
1294 comb += req.real_addr.eq(ra)
1295
1296 with m.If(r0.req.dcbz):
1297 # force data to 0 for dcbz
1298 comb += req.data.eq(0)
1299 with m.Elif(r0.d_valid):
1300 comb += req.data.eq(r0.req.data)
1301 with m.Else():
1302 comb += req.data.eq(d_in.data)
1303
1304 # Select all bytes for dcbz
1305 # and for cacheable loads
1306 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1307 comb += req.byte_sel.eq(~0) # all 1s
1308 with m.Else():
1309 comb += req.byte_sel.eq(r0.req.byte_sel)
1310 comb += req.hit_way.eq(req_hit_way)
1311 comb += req.same_tag.eq(req_same_tag)
1312
1313 # Store the incoming request from r0,
1314 # if it is a slow request
1315 # Note that r1.full = 1 implies req_op = OP_NONE
1316 with m.If((req_op == Op.OP_LOAD_MISS)
1317 | (req_op == Op.OP_LOAD_NC)
1318 | (req_op == Op.OP_STORE_MISS)
1319 | (req_op == Op.OP_STORE_HIT)):
1320 sync += r1.req.eq(req)
1321 sync += r1.full.eq(1)
1322
1323 # Main state machine
1324 with m.Switch(r1.state):
1325
1326 with m.Case(State.IDLE):
1327 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1328 sync += r1.wb.sel.eq(req.byte_sel)
1329 sync += r1.wb.dat.eq(req.data)
1330 sync += r1.dcbz.eq(req.dcbz)
1331
1332 # Keep track of our index and way
1333 # for subsequent stores.
1334 sync += r1.store_index.eq(req_idx)
1335 sync += r1.store_row.eq(req_row)
1336 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1337 sync += r1.reload_tag.eq(req_tag)
1338 sync += r1.req.same_tag.eq(1)
1339
1340 with m.If(req.op == Op.OP_STORE_HIT):
1341 sync += r1.store_way.eq(req.hit_way)
1342
1343 # Reset per-row valid bits,
1344 # ready for handling OP_LOAD_MISS
1345 for i in range(ROW_PER_LINE):
1346 sync += r1.rows_valid[i].eq(0)
1347
1348 with m.If(req_op != Op.OP_NONE):
1349 sync += Display("cache op %d", req.op)
1350
1351 with m.Switch(req.op):
1352 with m.Case(Op.OP_LOAD_HIT):
1353 # stay in IDLE state
1354 pass
1355
1356 with m.Case(Op.OP_LOAD_MISS):
1357 sync += Display("cache miss real addr: %x " \
1358 "idx: %x tag: %x",
1359 req.real_addr, req_row, req_tag)
1360
1361 # Start the wishbone cycle
1362 sync += r1.wb.we.eq(0)
1363 sync += r1.wb.cyc.eq(1)
1364 sync += r1.wb.stb.eq(1)
1365
1366 # Track that we had one request sent
1367 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1368 sync += r1.write_tag.eq(1)
1369
1370 with m.Case(Op.OP_LOAD_NC):
1371 sync += r1.wb.cyc.eq(1)
1372 sync += r1.wb.stb.eq(1)
1373 sync += r1.wb.we.eq(0)
1374 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1375
1376 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1377 with m.If(~req.dcbz):
1378 sync += r1.state.eq(State.STORE_WAIT_ACK)
1379 sync += r1.acks_pending.eq(1)
1380 sync += r1.full.eq(0)
1381 sync += r1.slow_valid.eq(1)
1382
1383 with m.If(~req.mmu_req):
1384 sync += r1.ls_valid.eq(1)
1385 with m.Else():
1386 sync += r1.mmu_done.eq(1)
1387
1388 with m.If(req.op == Op.OP_STORE_HIT):
1389 sync += r1.write_bram.eq(1)
1390 with m.Else():
1391 # dcbz is handled much like a load miss except
1392 # that we are writing to memory instead of reading
1393 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1394
1395 with m.If(req.op == Op.OP_STORE_MISS):
1396 sync += r1.write_tag.eq(1)
1397
1398 sync += r1.wb.we.eq(1)
1399 sync += r1.wb.cyc.eq(1)
1400 sync += r1.wb.stb.eq(1)
1401
1402 # OP_NONE and OP_BAD do nothing
1403 # OP_BAD & OP_STCX_FAIL were
1404 # handled above already
1405 with m.Case(Op.OP_NONE):
1406 pass
1407 with m.Case(Op.OP_BAD):
1408 pass
1409 with m.Case(Op.OP_STCX_FAIL):
1410 pass
1411
1412 with m.Case(State.RELOAD_WAIT_ACK):
1413 ld_stbs_done = Signal()
1414 # Requests are all sent if stb is 0
1415 comb += ld_stbs_done.eq(~r1.wb.stb)
1416
1417 # If we are still sending requests, was one accepted?
1418 with m.If((~wb_in.stall) & r1.wb.stb):
1419 # That was the last word? We are done sending.
1420 # Clear stb and set ld_stbs_done so we can handle an
1421 # eventual last ack on the same cycle.
1422 # sigh - reconstruct wb adr with 3 extra 0s at front
1423 wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1424 with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1425 sync += r1.wb.stb.eq(0)
1426 comb += ld_stbs_done.eq(1)
1427
1428 # Calculate the next row address in the current cache line
1429 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1430 comb += row.eq(r1.wb.adr)
1431 sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1432
1433 # Incoming acks processing
1434 sync += r1.forward_valid1.eq(wb_in.ack)
1435 with m.If(wb_in.ack):
1436 srow = Signal(ROW_LINE_BITS)
1437 comb += srow.eq(r1.store_row)
1438 sync += r1.rows_valid[srow].eq(1)
1439
1440 # If this is the data we were looking for,
1441 # we can complete the request next cycle.
1442 # Compare the whole address in case the
1443 # request in r1.req is not the one that
1444 # started this refill.
1445 with m.If(req.valid & r1.req.same_tag &
1446 ((r1.dcbz & r1.req.dcbz) |
1447 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1448 (r1.store_row == get_row(req.real_addr))):
1449 sync += r1.full.eq(0)
1450 sync += r1.slow_valid.eq(1)
1451 with m.If(~r1.mmu_req):
1452 sync += r1.ls_valid.eq(1)
1453 with m.Else():
1454 sync += r1.mmu_done.eq(1)
1455 sync += r1.forward_sel.eq(~0) # all 1s
1456 sync += r1.use_forward1.eq(1)
1457
1458 # Check for completion
1459 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1460 r1.end_row_ix)):
1461 # Complete wishbone cycle
1462 sync += r1.wb.cyc.eq(0)
1463
1464 # Cache line is now valid
1465 cv = Signal(NUM_WAYS) # one valid bit per way
1466 comb += cv.eq(cache_valids[r1.store_index])
1467 comb += cv.bit_select(r1.store_way, 1).eq(1)
1468 sync += cache_valids[r1.store_index].eq(cv)
1469
1470 sync += r1.state.eq(State.IDLE)
1471
1472 # Increment store row counter
1473 sync += r1.store_row.eq(next_row(r1.store_row))
1474
1475 with m.Case(State.STORE_WAIT_ACK):
1476 st_stbs_done = Signal()
1477 acks = Signal(3)
1478 adjust_acks = Signal(3)
1479
1480 comb += st_stbs_done.eq(~r1.wb.stb)
1481 comb += acks.eq(r1.acks_pending)
1482
1483 with m.If(r1.inc_acks != r1.dec_acks):
1484 with m.If(r1.inc_acks):
1485 comb += adjust_acks.eq(acks + 1)
1486 with m.Else():
1487 comb += adjust_acks.eq(acks - 1)
1488 with m.Else():
1489 comb += adjust_acks.eq(acks)
1490
1491 sync += r1.acks_pending.eq(adjust_acks)
1492
1493 # Clear stb when slave accepted request
1494 with m.If(~wb_in.stall):
1495 # See if there is another store waiting
1496 # to be done which is in the same real page.
1497 with m.If(req.valid):
1498 _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1499 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1500 sync += r1.wb.dat.eq(req.data)
1501 sync += r1.wb.sel.eq(req.byte_sel)
1502
1503 with m.If((adjust_acks < 7) & req.same_tag &
1504 ((req.op == Op.OP_STORE_MISS)
1505 | (req.op == Op.OP_STORE_HIT))):
1506 sync += r1.wb.stb.eq(1)
1507 comb += st_stbs_done.eq(0)
1508
1509 with m.If(req.op == Op.OP_STORE_HIT):
1510 sync += r1.write_bram.eq(1)
1511 sync += r1.full.eq(0)
1512 sync += r1.slow_valid.eq(1)
1513
1514 # Store requests never come from the MMU
1515 sync += r1.ls_valid.eq(1)
1516 comb += st_stbs_done.eq(0)
1517 sync += r1.inc_acks.eq(1)
1518 with m.Else():
1519 sync += r1.wb.stb.eq(0)
1520 comb += st_stbs_done.eq(1)
1521
1522 # Got ack ? See if complete.
1523 with m.If(wb_in.ack):
1524 with m.If(st_stbs_done & (adjust_acks == 1)):
1525 sync += r1.state.eq(State.IDLE)
1526 sync += r1.wb.cyc.eq(0)
1527 sync += r1.wb.stb.eq(0)
1528 sync += r1.dec_acks.eq(1)
1529
1530 with m.Case(State.NC_LOAD_WAIT_ACK):
1531 # Clear stb when slave accepted request
1532 with m.If(~wb_in.stall):
1533 sync += r1.wb.stb.eq(0)
1534
1535 # Got ack ? complete.
1536 with m.If(wb_in.ack):
1537 sync += r1.state.eq(State.IDLE)
1538 sync += r1.full.eq(0)
1539 sync += r1.slow_valid.eq(1)
1540
1541 with m.If(~r1.mmu_req):
1542 sync += r1.ls_valid.eq(1)
1543 with m.Else():
1544 sync += r1.mmu_done.eq(1)
1545
1546 sync += r1.forward_sel.eq(~0) # all 1s
1547 sync += r1.use_forward1.eq(1)
1548 sync += r1.wb.cyc.eq(0)
1549 sync += r1.wb.stb.eq(0)
1550
1551 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1552
1553 sync = m.d.sync
1554 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1555
1556 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1557 stall_out, req_op[:3], d_out.valid, d_out.error,
1558 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1559 r1.real_adr[3:6]))
1560
1561 def elaborate(self, platform):
1562
1563 m = Module()
1564 comb = m.d.comb
1565 d_in = self.d_in
1566
1567 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1568 cache_tags = CacheTagArray()
1569 cache_tag_set = Signal(TAG_RAM_WIDTH)
1570 cache_valids = CacheValidBitsArray()
1571
1572 # TODO attribute ram_style : string;
1573 # TODO attribute ram_style of cache_tags : signal is "distributed";
1574
1575 """note: these are passed to nmigen.hdl.Memory as "attributes".
1576 don't know how, just that they are.
1577 """
1578 dtlb_valid_bits = TLBValidBitsArray()
1579 dtlb_tags = TLBTagsArray()
1580 dtlb_ptes = TLBPtesArray()
1581 # TODO attribute ram_style of
1582 # dtlb_tags : signal is "distributed";
1583 # TODO attribute ram_style of
1584 # dtlb_ptes : signal is "distributed";
1585
1586 r0 = RegStage0("r0")
1587 r0_full = Signal()
1588
1589 r1 = RegStage1("r1")
1590
1591 reservation = Reservation()
1592
1593 # Async signals on incoming request
1594 req_index = Signal(INDEX_BITS)
1595 req_row = Signal(ROW_BITS)
1596 req_hit_way = Signal(WAY_BITS)
1597 req_tag = Signal(TAG_BITS)
1598 req_op = Signal(Op)
1599 req_data = Signal(64)
1600 req_same_tag = Signal()
1601 req_go = Signal()
1602
1603 early_req_row = Signal(ROW_BITS)
1604
1605 cancel_store = Signal()
1606 set_rsrv = Signal()
1607 clear_rsrv = Signal()
1608
1609 r0_valid = Signal()
1610 r0_stall = Signal()
1611
1612 use_forward1_next = Signal()
1613 use_forward2_next = Signal()
1614
1615 cache_out_row = Signal(WB_DATA_BITS)
1616
1617 plru_victim = PLRUOut()
1618 replace_way = Signal(WAY_BITS)
1619
1620 # Wishbone read/write/cache write formatting signals
1621 bus_sel = Signal(8)
1622
1623 # TLB signals
1624 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1625 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1626 tlb_valid_way = Signal(TLB_NUM_WAYS)
1627 tlb_req_index = Signal(TLB_SET_BITS)
1628 tlb_hit = Signal()
1629 tlb_hit_way = Signal(TLB_WAY_BITS)
1630 pte = Signal(TLB_PTE_BITS)
1631 ra = Signal(REAL_ADDR_BITS)
1632 valid_ra = Signal()
1633 perm_attr = PermAttr("dc_perms")
1634 rc_ok = Signal()
1635 perm_ok = Signal()
1636 access_ok = Signal()
1637
1638 tlb_plru_victim = TLBPLRUOut()
1639
1640 # we don't yet handle collisions between loadstore1 requests
1641 # and MMU requests
1642 comb += self.m_out.stall.eq(0)
1643
1644 # Hold off the request in r0 when r1 has an uncompleted request
1645 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1646 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1647 comb += self.stall_out.eq(r0_stall)
1648
1649 # Wire up wishbone request latch out of stage 1
1650 comb += self.wb_out.eq(r1.wb)
1651
1652 # deal with litex not doing wishbone pipeline mode
1653 # XXX in wrong way. FIFOs are needed in the SRAM test
1654 # so that stb/ack match up
1655 comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
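# i.e. emulate the stall signal of WB pipelined mode from a classic
# slave: stall whenever a cycle is open and the ack has not yet
# arrived, so at most one request is in flight (see the note in the
# module docstring and WB4 spec section 5.2.1).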
1656
1657 # call sub-functions putting everything together, using shared
1658 # signals established above
1659 self.stage_0(m, r0, r1, r0_full)
1660 self.tlb_read(m, r0_stall, tlb_valid_way,
1661 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1662 dtlb_tags, dtlb_ptes)
1663 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1664 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1665 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1666 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1667 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1668 dtlb_tags, tlb_pte_way, dtlb_ptes)
1669 self.maybe_plrus(m, r1, plru_victim)
1670 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1671 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1672 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1673 r0_valid, r1, cache_valids, replace_way,
1674 use_forward1_next, use_forward2_next,
1675 req_hit_way, plru_victim, rc_ok, perm_attr,
1676 valid_ra, perm_ok, access_ok, req_op, req_go,
1677 tlb_pte_way,
1678 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1679 cancel_store, req_same_tag, r0_stall, early_req_row)
1680 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1681 r0_valid, r0, reservation)
1682 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1683 reservation, r0)
1684 self.writeback_control(m, r1, cache_out_row)
1685 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1686 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1687 req_hit_way, req_index, req_tag, access_ok,
1688 tlb_hit, tlb_hit_way, tlb_req_index)
1689 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1690 cache_valids, r0, replace_way,
1691 req_hit_way, req_same_tag,
1692 r0_valid, req_op, cache_tags, req_go, ra)
1693 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1694
1695 return m
1696
1697 def dcache_load(dut, addr, nc=0):
1698 yield dut.d_in.load.eq(1)
1699 yield dut.d_in.nc.eq(nc)
1700 yield dut.d_in.addr.eq(addr)
1701 yield dut.d_in.byte_sel.eq(~0)
1702 yield dut.d_in.valid.eq(1)
1703 yield
1704 yield dut.d_in.valid.eq(0)
1705 yield dut.d_in.byte_sel.eq(0)
1706 while not (yield dut.d_out.valid):
1707 yield
1708 # yield # data is valid one cycle AFTER valid goes hi? (no it isn't)
1709 data = yield dut.d_out.data
1710 return data
1711
1712
1713 def dcache_store(dut, addr, data, nc=0):
1714 yield dut.d_in.load.eq(0)
1715 yield dut.d_in.nc.eq(nc)
1716 yield dut.d_in.byte_sel.eq(~0)
1717 yield dut.d_in.addr.eq(addr)
1718 yield dut.d_in.valid.eq(1)
1719 yield
1720 yield dut.d_in.data.eq(data) # leave set, but the cycle AFTER
1721 yield dut.d_in.valid.eq(0)
1722 yield dut.d_in.byte_sel.eq(0)
1723 while not (yield dut.d_out.valid):
1724 yield
1725
1726
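# Minimal sketch of using the nc argument to dcache_load/dcache_store to
# exercise the non-cacheable (wishbone-direct) path.  The helper name
# dcache_nc_sim and the address/value chosen are illustrative only and
# are not wired into the test list at the bottom of the file; with the
# SRAM model used by test_dcache the stored value is expected to
# round-trip unchanged.
def dcache_nc_sim(dut, mem, nc=1):
    yield dut.d_in.priv_mode.eq(1)
    yield
    yield
    # non-cacheable store, then read the same doubleword back via an
    # NC load and check the value survived the trip over the wishbone
    yield from dcache_store(dut, 0x40, 0x123456789abcdef0, nc)
    data = yield from dcache_load(dut, 0x40, nc)
    assert data == 0x123456789abcdef0, \
        "nc round-trip failed: got %x" % data

# hypothetical invocation: test_dcache(mem, dcache_nc_sim, "nc")
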
1727 def dcache_random_sim(dut, mem, nc=0):
1728
1729 # start copy of mem
1730 sim_mem = deepcopy(mem)
1731 memsize = len(sim_mem)
1732 print ("mem len", memsize)
1733
1734 # clear stuff
1735 yield dut.d_in.valid.eq(0)
1736 yield dut.d_in.load.eq(0)
1737 yield dut.d_in.priv_mode.eq(1)
1738 yield dut.d_in.nc.eq(0)
1739 yield dut.d_in.addr.eq(0)
1740 yield dut.d_in.data.eq(0)
1741 yield dut.m_in.valid.eq(0)
1742 yield dut.m_in.addr.eq(0)
1743 yield dut.m_in.pte.eq(0)
1744 # wait 4 * clk_period
1745 yield
1746 yield
1747 yield
1748 yield
1749
1750 print ()
1751
1752 #for i in range(1024):
1753 # sim_mem[i] = i
1754
1755 for i in range(1024):
1756 addr = randint(0, memsize-1)
1757 data = randint(0, (1<<64)-1)
1758 sim_mem[addr] = data
1759 row = addr
1760 addr *= 8
1761
1762 print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
1763
1764 yield from dcache_load(dut, addr, nc)
1765 yield from dcache_store(dut, addr, data, nc)
1766
1767 addr = randint(0, memsize-1)
1768 sim_data = sim_mem[addr]
1769 row = addr
1770 addr *= 8
1771
1772 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1773 data = yield from dcache_load(dut, addr, nc)
1774 assert data == sim_data, \
1775 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1776
1777 for addr in range(memsize):
1778 data = yield from dcache_load(dut, addr*8, nc)
1779 assert data == sim_mem[addr], \
1780 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1781
1782
1783 def dcache_regression_sim(dut, mem, nc=0):
1784
1785 # start copy of mem
1786 sim_mem = deepcopy(mem)
1787 memsize = len(sim_mem)
1788 print ("mem len", memsize)
1789
1790 # clear stuff
1791 yield dut.d_in.valid.eq(0)
1792 yield dut.d_in.load.eq(0)
1793 yield dut.d_in.priv_mode.eq(1)
1794 yield dut.d_in.nc.eq(0)
1795 yield dut.d_in.addr.eq(0)
1796 yield dut.d_in.data.eq(0)
1797 yield dut.m_in.valid.eq(0)
1798 yield dut.m_in.addr.eq(0)
1799 yield dut.m_in.pte.eq(0)
1800 # wait 4 * clk_period
1801 yield
1802 yield
1803 yield
1804 yield
1805
1806 addr = 0
1807 row = addr
1808 addr *= 8
1809
1810 print ("regression testing 0x%x row %d" % (addr, row))
1811
1812 yield from dcache_load(dut, addr, nc)
1813
1814 addr = 2
1815 sim_data = sim_mem[addr]
1816 row = addr
1817 addr *= 8
1818
1819 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1820 data = yield from dcache_load(dut, addr, nc)
1821 assert data == sim_data, \
1822 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1823
1824
1825
1826 def dcache_sim(dut, mem):
1827 # clear stuff
1828 yield dut.d_in.valid.eq(0)
1829 yield dut.d_in.load.eq(0)
1830 yield dut.d_in.priv_mode.eq(1)
1831 yield dut.d_in.nc.eq(0)
1832 yield dut.d_in.addr.eq(0)
1833 yield dut.d_in.data.eq(0)
1834 yield dut.m_in.valid.eq(0)
1835 yield dut.m_in.addr.eq(0)
1836 yield dut.m_in.pte.eq(0)
1837 # wait 4 * clk_period
1838 yield
1839 yield
1840 yield
1841 yield
1842
1843 # Cacheable read of address 0x58
1844 data = yield from dcache_load(dut, 0x58)
1845 addr = yield dut.d_in.addr
1846 assert data == 0x0000001700000016, \
1847 "data @%x=%x expected 0x0000001700000016" % (addr, data)
1848
1849 # Cacheable read of address 0x20
1850 data = yield from dcache_load(dut, 0x20)
1851 addr = yield dut.d_in.addr
1852 assert data == 0x0000000900000008, \
1853 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1854
1855 # Cacheable read of address 0x530
1856 data = yield from dcache_load(dut, 0x530)
1857 addr = yield dut.d_in.addr
1858 assert data == 0x0000014D0000014C, \
1859 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1860
1861 # 2nd Cacheable read of address 0x530
1862 data = yield from dcache_load(dut, 0x530)
1863 addr = yield dut.d_in.addr
1864 assert data == 0x0000014D0000014C, \
1865 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1866
1867 # Non-cacheable read of address 0x100
1868 data = yield from dcache_load(dut, 0x100, nc=1)
1869 addr = yield dut.d_in.addr
1870 assert data == 0x0000004100000040, \
1871 "data @%x=%x expected 0x0000004100000040" % (addr, data)
1872
1873 # Store at address 0x530
1874 yield from dcache_store(dut, 0x530, 0x121)
1875
1876 # Store at address 0x530 again, overwriting the previous value
1877 yield from dcache_store(dut, 0x530, 0x12345678)
1878
1879 # 3rd Cacheable read of address 0x530
1880 data = yield from dcache_load(dut, 0x530)
1881 addr = yield dut.d_in.addr
1882 assert data == 0x12345678, \
1883 "data @%x=%x expected 0x12345678" % (addr, data)
1884
1885 # 2nd Cacheable read of address 0x20
1886 data = yield from dcache_load(dut, 0x20)
1887 addr = yield dut.d_in.addr
1888 assert data == 0x0000000900000008, \
1889 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1890
1891 yield
1892 yield
1893 yield
1894 yield
1895
1896
1897 def test_dcache(mem, test_fn, test_name):
1898 dut = DCache()
1899
1900 memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
1901 sram = SRAM(memory=memory, granularity=8)
1902
1903 m = Module()
1904 m.submodules.dcache = dut
1905 m.submodules.sram = sram
1906
1907 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1908 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1909 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1910 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1911 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1912 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1913
1914 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1915 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1916
1917 dcache_write_gtkw(test_name)
1918
1919 # nmigen Simulation
1920 sim = Simulator(m)
1921 sim.add_clock(1e-6)
1922
1923 sim.add_sync_process(wrap(test_fn(dut, mem)))
1924 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1925 sim.run()
1926
1927
1928 def dcache_write_gtkw(test_name):
1929 traces = [
1930 'clk',
1931 ('d_in', [
1932 'd_in_load', 'd_in_nc', 'd_in_addr[63:0]', 'd_in_data[63:0]',
1933 'd_in_byte_sel[7:0]', 'd_in_valid'
1934 ]),
1935 ('d_out', [
1936 'd_out_valid', 'd_out_data[63:0]'
1937 ]),
1938 ('wb_out', [
1939 'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
1940 'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
1941 ]),
1942 ('wb_in', [
1943 'wb_in_stall', 'wb_in_ack', 'wb_in_dat[63:0]'
1944 ])
1945 ]
1946 write_gtkw('test_dcache%s.gtkw' % test_name,
1947 'test_dcache%s.vcd' % test_name,
1948 traces, module='top.dcache')
1949
1950
1951 if __name__ == '__main__':
1952 seed(0)
1953 dut = DCache()
1954 vl = rtlil.convert(dut, ports=[])
1955 with open("test_dcache.il", "w") as f:
1956 f.write(vl)
1957
1958 mem = []
1959 memsize = 16
1960 for i in range(memsize):
1961 mem.append(i)
1962
1963 test_dcache(mem, dcache_regression_sim, "simpleregression")
1964
1965 mem = []
1966 memsize = 256
1967 for i in range(memsize):
1968 mem.append(i)
1969
1970 test_dcache(mem, dcache_random_sim, "random")
1971
1972 mem = []
1973 for i in range(1024):
1974 mem.append((i*2)| ((i*2+1)<<32))
1975
1976 test_dcache(mem, dcache_sim, "")
1977