gram.phy.ecp5ddrphy: Fix DQSBUFM's pause signal (fixes #51)
[gram.git] / gram / phy / ecp5ddrphy.py
1 # This file is Copyright (c) 2019 David Shah <dave@ds0.me>
2 # This file is Copyright (c) 2019-2020 Florent Kermarrec <florent@enjoy-digital.fr>
3 # This file is Copyright (c) 2020 LambdaConcept <contact@lambdaconcept.com>
4 # License: BSD
5
6 # 1:2 frequency-ratio DDR3 PHY for Lattice's ECP5
7 # DDR3: 800 MT/s
8
9 import math
10
11 from nmigen import *
12 from nmigen.lib.cdc import FFSynchronizer
13 from nmigen.utils import log2_int
14
15 from lambdasoc.periph import Peripheral
16
17 import gram.stream as stream
18 from gram.common import *
19 from gram.phy.dfi import Interface
20 from gram.compat import Timeline
21
22 __all__ = ["ECP5DDRPHY"]
23
24
class ECP5DDRPHYInit(Elaboratable):
    """DDRDLLA / DQSBUFM / ECLK initialization sequencer.

    Instantiates the ``DDRDLLA`` primitive and, each time a rising edge is
    seen on its (synchronized) lock output, triggers a fixed timeline that
    toggles the freeze/stop/reset/pause/update controls in order.

    Attributes
    ----------
    pause : Signal, out
        Raised by the timeline around the DDRDLLA update step.
    stop : Signal, out
        Raised by the timeline while the ECLK domain is to be stopped.
    delay : Signal, out
        Mirrors the ``DDRDEL`` output of the DDRDLLA instance.
    reset : Signal, out
        Raised by the timeline while the ECLK domain is to be reset.
    """

    def __init__(self):
        self.pause = Signal()
        self.stop = Signal()
        self.delay = Signal()
        self.reset = Signal()

    def elaborate(self, platform):
        m = Module()

        # NOTE: nMigen derives net names from the Python variable names, so
        # the Signal variables below keep their historical names.
        new_lock = Signal()
        update = Signal()
        freeze = Signal()

        # DDRDLLA instance -------------------------------------------------------------------------
        _lock = Signal()
        delay = Signal()
        ddrdlla = Instance(
            "DDRDLLA",
            i_CLK=ClockSignal("sync2x"),
            i_RST=ResetSignal("init"),
            i_UDDCNTLN=~update,  # active-low on the primitive, hence the inversion
            i_FREEZE=freeze,
            o_DDRDEL=delay,
            o_LOCK=_lock,
        )
        m.submodules += ddrdlla

        # Bring LOCK into the "init" domain and detect its rising edge.
        lock = Signal()
        lock_d = Signal()
        m.submodules += FFSynchronizer(_lock, lock, o_domain="init")
        m.d.init += lock_d.eq(lock)
        # NOTE(review): the edge is registered in "sync" although lock/lock_d
        # live in "init" -- confirm the two domains are related as intended.
        m.d.sync += new_lock.eq(lock & ~lock_d)

        # DDRDLLA/DQSBUFM/ECLK initialization sequence ---------------------------------------------
        step_cycles = 8  # spacing between two consecutive steps, in cycles
        steps = [
            freeze.eq(1),      # Freeze DDRDLLA
            self.stop.eq(1),   # Stop ECLK domain
            self.reset.eq(1),  # Reset ECLK domain
            self.reset.eq(0),  # Release ECLK domain reset
            self.stop.eq(0),   # Release ECLK domain stop
            freeze.eq(0),      # Release DDRDLLA freeze
            self.pause.eq(1),  # Pause DQSBUFM
            update.eq(1),      # Update DDRDLLA
            update.eq(0),      # Release DDRDLLA update
            self.pause.eq(0),  # Release DQSBUFM pause
        ]
        # Step k (1-based) is scheduled at k*step_cycles; presumably counted
        # from the trigger -- see gram.compat.Timeline to confirm.
        sequencer = Timeline([
            (index * step_cycles, [action])
            for index, action in enumerate(steps, start=1)
        ])
        m.submodules += sequencer
        # Wait DDRDLLA Lock
        m.d.comb += sequencer.trigger.eq(new_lock)

        m.d.comb += self.delay.eq(delay)

        return m
76
77
class _DQSBUFMSettingManager(Elaboratable):
    """Apply a read-delay CSR write to a DQSBUFM while holding it paused.

    A write strobe on ``rdly_csr`` starts a three-state sequence: ``pause``
    is raised, ``readclksel`` is loaded from the CSR write data, then
    ``pause`` is dropped again. All assignments are registered in the
    ``sync`` domain, so ``pause`` is high for two cycles and ``readclksel``
    changes in the middle of that window.

    Parameters
    ----------
    rdly_csr
        CSR element providing ``w_stb`` and ``w_data``.

    Attributes
    ----------
    pause : Signal, out
        High while the setting is being changed.
    readclksel : Signal(3), out
        Last value latched from ``rdly_csr.w_data``.
    """

    def __init__(self, rdly_csr):
        self.rdly_csr = rdly_csr

        self.pause = Signal()
        self.readclksel = Signal(3)

    def elaborate(self, platform):
        m = Module()

        csr = self.rdly_csr
        pause = self.pause
        readclksel = self.readclksel

        with m.FSM():
            # Wait for software to write the CSR, then start pausing.
            with m.State("Idle"):
                with m.If(csr.w_stb):
                    m.next = "RdlyUpdateRequested"
                    m.d.sync += pause.eq(1)

            # pause is now high: latch the new selection.
            # NOTE(review): w_data is sampled one cycle after w_stb; this
            # assumes the CSR keeps w_data stable after the strobe -- confirm.
            with m.State("RdlyUpdateRequested"):
                m.next = "ResetPause"
                m.d.sync += readclksel.eq(csr.w_data)

            # Selection updated: release the pause and go back to waiting.
            with m.State("ResetPause"):
                m.next = "Idle"
                m.d.sync += pause.eq(0)

        return m
103
104
class ECP5DDRPHY(Peripheral, Elaboratable):
    """1:2 frequency-ratio DDR3 PHY for Lattice's ECP5, exposed as a CSR peripheral.

    Parameters
    ----------
    pads
        Pad bundle. The code reads ``dq.io``, ``dqs.p``, ``dm.o`` and, for
        ``clk``, ``a``, ``ba``, ``ras``, ``cas``, ``we``, ``clk_en``, ``odt``
        (plus ``reset`` and ``cs`` when present), the fields ``o_clk``,
        ``o_fclk`` and ``o0`` .. ``o3``.
    sys_clk_freq : float
        System clock frequency in Hz; the DRAM clock period is derived as
        ``1/(2*sys_clk_freq)``.

    Attributes
    ----------
    bus
        Bus interface of the CSR bridge.
    dfi : Interface
        DFI interface driven by the memory controller.
    settings : PhySettings
        Timing/geometry settings derived from the pads and clock frequency.
    burstdet
        CSR, one bit per byte lane; reads the sticky burst-detect flags and
        clears them on write.
    rdly : list
        Read-delay CSRs (3 bits each), one per byte lane, named
        ``rdly_p<N>``. At least two are always created so the register map
        of 8- and 16-bit configurations stays unchanged.

    Raises
    ------
    ValueError
        If the DQ bus width is zero or not a multiple of 8.

    Notes
    -----
    Uses the ``sync``, ``sync2x`` and ``dramsync`` clock domains (and
    ``init`` through :class:`ECP5DDRPHYInit`).
    """

    def __init__(self, pads, sys_clk_freq=100e6):
        super().__init__(name="phy")

        self.pads = pads
        self._sys_clk_freq = sys_clk_freq

        databits = len(self.pads.dq.io)
        # A zero-width bus would pass the modulo test but cannot be elaborated.
        if databits == 0 or databits % 8 != 0:
            raise ValueError("DQ pads should come in a multiple of 8")
        nlanes = databits//8

        # CSR
        bank = self.csr_bank()

        self.burstdet = bank.csr(nlanes, "rw")

        # One read-delay CSR per byte lane: elaborate() indexes this list by
        # lane, so a fixed pair of CSRs breaks on buses wider than 16 bits.
        # Keep a minimum of two to preserve the historical register map.
        self.rdly = [bank.csr(3, "rw", name=f"rdly_p{lane}")
                     for lane in range(max(2, nlanes))]

        self._bridge = self.bridge(data_width=32, granularity=8, alignment=2)
        self.bus = self._bridge.bus

        addressbits = len(self.pads.a.o0)
        bankbits = len(self.pads.ba.o0)
        nranks = 1 if not hasattr(self.pads, "cs") else len(self.pads.cs.o0)
        self.dfi = Interface(addressbits, bankbits, nranks, 4*databits, 4)

        # PHY settings -----------------------------------------------------------------------------
        tck = 1/(2*self._sys_clk_freq)
        nphases = 2
        cl, cwl = get_cl_cw("DDR3", tck)
        cl_sys_latency = get_sys_latency(nphases, cl)
        cwl_sys_latency = get_sys_latency(nphases, cwl)
        rdcmdphase, rdphase = get_sys_phases(nphases, cl_sys_latency, cl)
        wrcmdphase, wrphase = get_sys_phases(nphases, cwl_sys_latency, cwl)
        self.settings = PhySettings(
            phytype="ECP5DDRPHY",
            memtype="DDR3",
            databits=databits,
            dfi_databits=4*databits,
            nranks=nranks,
            nphases=nphases,
            rdphase=rdphase,
            wrphase=wrphase,
            rdcmdphase=rdcmdphase,
            wrcmdphase=wrcmdphase,
            cl=cl,
            cwl=cwl,
            read_latency=2 + cl_sys_latency + 2 + log2_int(4//nphases) + 4,
            write_latency=cwl_sys_latency
        )

    def elaborate(self, platform):
        """Build the PHY: CSR bridge, init sequencer, pad drivers and DQ/DQS lanes."""
        m = Module()

        m.submodules.bridge = self._bridge

        tck = 1/(2*self._sys_clk_freq)
        nphases = 2
        databits = len(self.pads.dq.io)

        burstdet_reg = Signal(databits//8, reset_less=True)
        m.d.comb += self.burstdet.r_data.eq(burstdet_reg)

        # Burstdet clear (the per-lane "set" statements are added later in
        # this method, so they take priority over this clear in nMigen).
        with m.If(self.burstdet.w_stb):
            m.d.sync += burstdet_reg.eq(0)

        # Init -------------------------------------------------------------------------------------
        m.submodules.init = init = ECP5DDRPHYInit()

        # Parameters -------------------------------------------------------------------------------
        cl, cwl = get_cl_cw("DDR3", tck)
        cl_sys_latency = get_sys_latency(nphases, cl)
        cwl_sys_latency = get_sys_latency(nphases, cwl)

        # DFI Interface ----------------------------------------------------------------------------
        dfi = self.dfi

        bl8_chunk = Signal()

        # Clock --------------------------------------------------------------------------------
        m.d.comb += [
            self.pads.clk.o_clk.eq(ClockSignal("dramsync")),
            self.pads.clk.o_fclk.eq(ClockSignal("sync2x")),
        ]
        # Constant 0/1/0/1 pattern on the four output slots of each clock pad.
        for i in range(len(self.pads.clk.o0)):
            m.d.comb += [
                self.pads.clk.o0[i].eq(0),
                self.pads.clk.o1[i].eq(1),
                self.pads.clk.o2[i].eq(0),
                self.pads.clk.o3[i].eq(1),
            ]

        # Addresses and Commands ---------------------------------------------------------------
        m.d.comb += [
            self.pads.a.o_clk.eq(ClockSignal("dramsync")),
            self.pads.a.o_fclk.eq(ClockSignal("sync2x")),
            self.pads.ba.o_clk.eq(ClockSignal("dramsync")),
            self.pads.ba.o_fclk.eq(ClockSignal("sync2x")),
        ]
        # Each DFI phase value is held on two consecutive output slots.
        for i in range(len(self.pads.a.o0)):
            m.d.comb += [
                self.pads.a.o0[i].eq(dfi.phases[0].address[i]),
                self.pads.a.o1[i].eq(dfi.phases[0].address[i]),
                self.pads.a.o2[i].eq(dfi.phases[1].address[i]),
                self.pads.a.o3[i].eq(dfi.phases[1].address[i]),
            ]
        for i in range(len(self.pads.ba.o0)):
            m.d.comb += [
                self.pads.ba.o0[i].eq(dfi.phases[0].bank[i]),
                self.pads.ba.o1[i].eq(dfi.phases[0].bank[i]),
                self.pads.ba.o2[i].eq(dfi.phases[1].bank[i]),
                self.pads.ba.o3[i].eq(dfi.phases[1].bank[i]),
            ]

        # Control pins
        controls = ["ras", "cas", "we", "clk_en", "odt"]
        if hasattr(self.pads, "reset"):
            controls.append("reset")
        if hasattr(self.pads, "cs"):
            controls.append("cs")
        for name in controls:
            pad = getattr(self.pads, name)
            phase0_sig = getattr(dfi.phases[0], name)
            phase1_sig = getattr(dfi.phases[1], name)
            m.d.comb += [
                pad.o_clk.eq(ClockSignal("dramsync")),
                pad.o_fclk.eq(ClockSignal("sync2x")),
            ]
            for i in range(len(pad.o0)):
                m.d.comb += [
                    pad.o0[i].eq(phase0_sig[i]),
                    pad.o1[i].eq(phase0_sig[i]),
                    pad.o2[i].eq(phase1_sig[i]),
                    pad.o3[i].eq(phase1_sig[i]),
                ]

        # DQ ---------------------------------------------------------------------------------------
        dq_oe = Signal()
        dqs_re = Signal()
        dqs_oe = Signal()
        dqs_postamble = Signal()
        dqs_preamble = Signal()
        for i in range(databits//8):
            # DQSBUFM
            dqs_i = Signal()
            dqsr90 = Signal()
            dqsw270 = Signal()
            dqsw = Signal()
            rdpntr = Signal(3)
            wrpntr = Signal(3)
            burstdet = Signal()
            datavalid = Signal()
            datavalid_prev = Signal()
            m.d.sync += datavalid_prev.eq(datavalid)

            # Per-lane manager that pauses the DQSBUFM while READCLKSEL changes.
            dqsbufm_manager = _DQSBUFMSettingManager(self.rdly[i])
            setattr(m.submodules, f"dqsbufm_manager{i}", dqsbufm_manager)

            m.submodules += Instance("DQSBUFM",
                p_DQS_LI_DEL_ADJ="MINUS",
                p_DQS_LI_DEL_VAL=1,
                p_DQS_LO_DEL_ADJ="MINUS",
                p_DQS_LO_DEL_VAL=4,

                # Delay
                i_DYNDELAY0=0,
                i_DYNDELAY1=0,
                i_DYNDELAY2=0,
                i_DYNDELAY3=0,
                i_DYNDELAY4=0,
                i_DYNDELAY5=0,
                i_DYNDELAY6=0,
                i_DYNDELAY7=0,

                # Clocks / Reset
                i_SCLK=ClockSignal("sync"),
                i_ECLK=ClockSignal("sync2x"),
                i_RST=ResetSignal("dramsync"),
                i_DDRDEL=init.delay,
                # Paused either by the init sequence or by a read-delay update.
                i_PAUSE=init.pause | dqsbufm_manager.pause,

                # Control
                # Assert LOADNs to use DDRDEL control
                i_RDLOADN=0,
                i_RDMOVE=0,
                i_RDDIRECTION=1,
                i_WRLOADN=0,
                i_WRMOVE=0,
                i_WRDIRECTION=1,

                # Reads (generate shifted DQS clock for reads)
                i_READ0=dqs_re,
                i_READ1=dqs_re,
                i_READCLKSEL0=dqsbufm_manager.readclksel[0],
                i_READCLKSEL1=dqsbufm_manager.readclksel[1],
                i_READCLKSEL2=dqsbufm_manager.readclksel[2],
                i_DQSI=dqs_i,
                o_DQSR90=dqsr90,
                o_RDPNTR0=rdpntr[0],
                o_RDPNTR1=rdpntr[1],
                o_RDPNTR2=rdpntr[2],
                o_WRPNTR0=wrpntr[0],
                o_WRPNTR1=wrpntr[1],
                o_WRPNTR2=wrpntr[2],
                o_BURSTDET=burstdet,
                o_DATAVALID=datavalid,

                # Writes (generate shifted ECLK clock for writes)
                o_DQSW270=dqsw270,
                o_DQSW=dqsw)

            # Sticky per-lane burst-detect flag, cleared by a CSR write.
            with m.If(burstdet):
                m.d.sync += burstdet_reg[i].eq(1)

            # DQS and DM ---------------------------------------------------------------------------
            dm_o_data = Signal(8)
            dm_o_data_d = Signal(8)
            dm_o_data_muxed = Signal(4)
            # Bits 0-3 come from DFI phase 0, bits 4-7 from DFI phase 1.
            m.d.comb += dm_o_data.eq(Cat(*[
                dfi.phases[phase].wrdata_mask[beat*databits//8+i]
                for phase in range(2)
                for beat in range(4)
            ]))
            m.d.sync += dm_o_data_d.eq(dm_o_data)

            # Second half of the BL8 burst is taken from the delayed copy.
            with m.If(bl8_chunk):
                m.d.sync += dm_o_data_muxed.eq(dm_o_data_d[4:])
            with m.Else():
                m.d.sync += dm_o_data_muxed.eq(dm_o_data[:4])

            # NOTE(review): SCLK is "dramsync" here whereas the DQ/DQS
            # primitives below use the default domain -- confirm intended.
            m.submodules += Instance("ODDRX2DQA",
                i_RST=ResetSignal("dramsync"),
                i_ECLK=ClockSignal("sync2x"),
                i_SCLK=ClockSignal("dramsync"),
                i_DQSW270=dqsw270,
                i_D0=dm_o_data_muxed[0],
                i_D1=dm_o_data_muxed[1],
                i_D2=dm_o_data_muxed[2],
                i_D3=dm_o_data_muxed[3],
                o_Q=self.pads.dm.o[i])

            dqs = Signal()
            dqs_oe_n = Signal()
            m.submodules += [
                Instance("ODDRX2DQSB",
                    i_RST=ResetSignal("dramsync"),
                    i_ECLK=ClockSignal("sync2x"),
                    i_SCLK=ClockSignal(),
                    i_DQSW=dqsw,
                    i_D0=0,
                    i_D1=1,
                    i_D2=0,
                    i_D3=1,
                    o_Q=dqs),
                Instance("TSHX2DQSA",
                    i_RST=ResetSignal("dramsync"),
                    i_ECLK=ClockSignal("sync2x"),
                    i_SCLK=ClockSignal(),
                    i_DQSW=dqsw,
                    i_T0=~(dqs_oe | dqs_postamble),
                    i_T1=~(dqs_oe | dqs_preamble),
                    o_Q=dqs_oe_n),
                Instance("BB",
                    i_I=dqs,
                    i_T=dqs_oe_n,
                    o_O=dqs_i,
                    io_B=self.pads.dqs.p[i]),
            ]

            for j in range(8*i, 8*(i+1)):
                dq_o = Signal()
                dq_i = Signal()
                dq_oe_n = Signal()
                dq_i_delayed = Signal()
                dq_i_data = Signal(4)
                dq_o_data = Signal(8)
                dq_o_data_d = Signal(8)
                dq_o_data_muxed = Signal(4)
                # Bits 0-3 come from DFI phase 0, bits 4-7 from DFI phase 1.
                m.d.comb += dq_o_data.eq(Cat(*[
                    dfi.phases[phase].wrdata[beat*databits+j]
                    for phase in range(2)
                    for beat in range(4)
                ]))

                m.d.sync += dq_o_data_d.eq(dq_o_data)
                with m.If(bl8_chunk):
                    m.d.sync += dq_o_data_muxed.eq(dq_o_data_d[4:])
                with m.Else():
                    m.d.sync += dq_o_data_muxed.eq(dq_o_data[:4])

                m.submodules += [
                    Instance("ODDRX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSW270=dqsw270,
                        i_D0=dq_o_data_muxed[0],
                        i_D1=dq_o_data_muxed[1],
                        i_D2=dq_o_data_muxed[2],
                        i_D3=dq_o_data_muxed[3],
                        o_Q=dq_o),
                    Instance("DELAYF",
                        p_DEL_MODE="DQS_ALIGNED_X2",
                        i_LOADN=1,
                        i_MOVE=0,
                        i_DIRECTION=0,
                        i_A=dq_i,
                        o_Z=dq_i_delayed),
                    Instance("IDDRX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSR90=dqsr90,
                        i_RDPNTR0=rdpntr[0],
                        i_RDPNTR1=rdpntr[1],
                        i_RDPNTR2=rdpntr[2],
                        i_WRPNTR0=wrpntr[0],
                        i_WRPNTR1=wrpntr[1],
                        i_WRPNTR2=wrpntr[2],
                        i_D=dq_i_delayed,
                        o_Q0=dq_i_data[0],
                        o_Q1=dq_i_data[1],
                        o_Q2=dq_i_data[2],
                        o_Q3=dq_i_data[3]),
                    Instance("TSHX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSW270=dqsw270,
                        i_T0=~dq_oe,
                        i_T1=~dq_oe,
                        o_Q=dq_oe_n),
                    Instance("BB",
                        i_I=dq_o,
                        i_T=dq_oe_n,
                        o_O=dq_i,
                        io_B=self.pads.dq.io[j])
                ]
                # First cycle with DATAVALID high fills DFI phase 0, any
                # following cycle with DATAVALID high fills DFI phase 1.
                with m.If(~datavalid_prev & datavalid):
                    m.d.sync += [
                        dfi.phases[0].rddata[beat*databits+j].eq(dq_i_data[beat])
                        for beat in range(4)
                    ]
                with m.Elif(datavalid):
                    m.d.sync += [
                        dfi.phases[1].rddata[beat*databits+j].eq(dq_i_data[beat])
                        for beat in range(4)
                    ]

        # Read Control Path ------------------------------------------------------------------------
        # Creates a shift register of read commands coming from the DFI interface. This shift register
        # is used to control DQS read (internal read pulse of the DQSBUF) and to indicate to the
        # DFI interface that the read data is valid.
        #
        # The DQS read must be asserted for 2 sys_clk cycles before the read data is coming back from
        # the DRAM (see 6.2.4 READ Pulse Positioning Optimization of FPGA-TN-02035-1.2)
        #
        # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
        # interface, the latency is the sum of the ODDRX2DQA, CAS, IDDRX2DQA latencies.
        rddata_en = Signal(self.settings.read_latency)
        rddata_en_last = Signal.like(rddata_en)
        m.d.comb += rddata_en.eq(Cat(dfi.phases[self.settings.rdphase].rddata_en, rddata_en_last))
        m.d.sync += rddata_en_last.eq(rddata_en)
        m.d.comb += dqs_re.eq(
            rddata_en[cl_sys_latency + 0]
            | rddata_en[cl_sys_latency + 1]
            | rddata_en[cl_sys_latency + 2]
        )

        # datavalid / datavalid_prev are the ones left bound by the last
        # iteration of the byte-lane loop above: the valid strobe is derived
        # from the falling edge of the last lane's DATAVALID.
        rddata_valid = Signal()
        m.d.sync += rddata_valid.eq(datavalid_prev & ~datavalid)
        for phase in dfi.phases:
            m.d.comb += phase.rddata_valid.eq(rddata_valid)

        # Write Control Path -----------------------------------------------------------------------
        # Creates a shift register of write commands coming from the DFI interface. This shift register
        # is used to control DQ/DQS tristates and to select write data of the DRAM burst from the DFI
        # interface: The PHY is operating in halfrate mode (so provide 4 datas every sys_clk cycles:
        # 2x for DDR, 2x for halfrate) but DDR3 requires a burst of 8 datas (BL8) for best efficiency.
        # Writes are then performed in 2 sys_clk cycles and data needs to be selected for each cycle.
        # FIXME: understand the extra stages (the historical note says "+2", the code uses +4)
        wrdata_en = Signal(cwl_sys_latency + 4)
        wrdata_en_last = Signal.like(wrdata_en)
        m.d.comb += wrdata_en.eq(Cat(dfi.phases[self.settings.wrphase].wrdata_en, wrdata_en_last))
        m.d.sync += wrdata_en_last.eq(wrdata_en)
        m.d.comb += dq_oe.eq(wrdata_en[cwl_sys_latency + 1] | wrdata_en[cwl_sys_latency + 2])
        m.d.comb += bl8_chunk.eq(wrdata_en[cwl_sys_latency + 1])
        m.d.comb += dqs_oe.eq(dq_oe)

        # Write DQS Postamble/Preamble Control Path ------------------------------------------------
        # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
        # write. During writes, DQS tristate is configured as output for at least 4 sys_clk cycles:
        # 1 for Preamble, 2 for the Write and 1 for the Postamble.
        m.d.comb += dqs_preamble.eq(wrdata_en[cwl_sys_latency + 0] & ~wrdata_en[cwl_sys_latency + 1])
        m.d.comb += dqs_postamble.eq(wrdata_en[cwl_sys_latency + 3] & ~wrdata_en[cwl_sys_latency + 2])

        return m