Sample data based on datavalid signal (fixes #47)
[gram.git] / gram / phy / ecp5ddrphy.py
1 # This file is Copyright (c) 2019 David Shah <dave@ds0.me>
2 # This file is Copyright (c) 2019-2020 Florent Kermarrec <florent@enjoy-digital.fr>
3 # This file is Copyright (c) 2020 LambdaConcept <contact@lambdaconcept.com>
4 # License: BSD
5
6 # 1:2 frequency-ratio DDR3 PHY for Lattice's ECP5
7 # DDR3: 800 MT/s
8
9 import math
10
11 from nmigen import *
12 from nmigen.lib.cdc import FFSynchronizer
13 from nmigen.utils import log2_int
14
15 from lambdasoc.periph import Peripheral
16
17 import gram.stream as stream
18 from gram.common import *
19 from gram.phy.dfi import Interface
20 from gram.compat import Timeline
21
22 # Lattice ECP5 DDR PHY Initialization --------------------------------------------------------------
23
24
class ECP5DDRPHYInit(Elaboratable):
    """DDRDLLA bring-up sequencer for the ECP5 DDR PHY.

    Instantiates the ``DDRDLLA`` primitive and, whenever a rising edge is seen
    on its (synchronized) ``LOCK`` output, triggers a fixed timeline that
    freezes the DLL, stops and resets the ECLK domain, and finally pauses
    DQSBUFM while the DLL update is pulsed.

    Attributes
    ----------
    pause : Signal
        Driven high by the timeline while DQSBUFM must be paused.
    stop : Signal
        Driven high by the timeline while the ECLK domain must be stopped.
    delay : Signal
        Mirrors the ``DDRDEL`` output of the DDRDLLA instance.
    reset : Signal
        Driven high by the timeline while the ECLK domain must be in reset.
    """

    def __init__(self):
        self.pause = Signal()
        self.stop = Signal()
        self.delay = Signal()
        self.reset = Signal()

    def elaborate(self, platform):
        """Build the DDRDLLA instance, lock edge detector and init timeline."""
        m = Module()

        # Internal controls driven by the timeline / lock detector.
        new_lock = Signal()
        update = Signal()
        freeze = Signal()

        # DDRDLLA instance -------------------------------------------------------------------------
        _lock = Signal()
        delay = Signal()
        ddrdlla = Instance(
            "DDRDLLA",
            i_CLK=ClockSignal("sync2x"),
            i_RST=ResetSignal("init"),
            i_UDDCNTLN=~update,  # the primitive input is the inverse of `update`
            i_FREEZE=freeze,
            o_DDRDEL=delay,
            o_LOCK=_lock,
        )
        m.submodules += ddrdlla

        # Bring LOCK into the "init" domain and detect its rising edge.
        lock = Signal()
        lock_d = Signal()
        m.submodules += FFSynchronizer(_lock, lock, o_domain="init")
        m.d.init += lock_d.eq(lock)
        m.d.sync += new_lock.eq(lock & ~lock_d)

        # DDRDLLA/DDQBUFM/ECLK initialization sequence ---------------------------------------------
        # One action per slot, slots being `step` cycles apart and numbered from 1.
        step = 8  # in cycles
        actions = [
            freeze.eq(1),      # Freeze DDRDLLA
            self.stop.eq(1),   # Stop ECLK domain
            self.reset.eq(1),  # Reset ECLK domain
            self.reset.eq(0),  # Release ECLK domain reset
            self.stop.eq(0),   # Release ECLK domain stop
            freeze.eq(0),      # Release DDRDLLA freeze
            self.pause.eq(1),  # Pause DQSBUFM
            update.eq(1),      # Update DDRDLLA
            update.eq(0),      # Release DDRDLLA update
            self.pause.eq(0),  # Release DQSBUFM pause
        ]
        schedule = [(slot*step, [action]) for slot, action in enumerate(actions, start=1)]
        sequencer = Timeline(schedule)
        m.submodules += sequencer

        m.d.comb += [
            # Wait DDRDLLA Lock
            sequencer.trigger.eq(new_lock),
            # Export the DLL delay code.
            self.delay.eq(delay),
        ]

        return m
76
77 # Lattice ECP5 DDR PHY -----------------------------------------------------------------------------
78
79
class ECP5DDRPHY(Peripheral, Elaboratable):
    """1:2 frequency-ratio DDR3 PHY for Lattice ECP5.

    The PHY exposes a two-phase DFI interface (``self.dfi``), a CSR bus
    (``self.bus``) and the computed ``PhySettings`` (``self.settings``).

    Parameters
    ----------
    pads
        DRAM pad object. The code reads ``dq.io``, ``dqs.p``, ``dm.o`` and, for
        ``clk``, ``a``, ``ba``, ``ras``, ``cas``, ``we``, ``clk_en``, ``odt`` (plus the
        optional ``reset`` and ``cs``), the ``o_clk``/``o_fclk``/``o0``..``o3`` members.
        ``len(pads.dq.io)`` must be a multiple of 8.
    sys_clk_freq : float
        System clock frequency in Hz; the DRAM clock period is derived from it
        as ``1/(2*sys_clk_freq)``.

    Attributes
    ----------
    burstdet : CSR, read/write, one bit per byte lane
        Reads back the sticky BURSTDET flags; any write strobe clears them.
    rdly : list of CSR, read/write, 3 bits each
        ``rdly[i]`` drives READCLKSEL of byte lane ``i`` and pauses its DQSBUFM
        while being written. One register per byte lane, never fewer than two.
    """

    # Number of rdly_p* registers that have always been present in the CSR map.
    _MIN_RDLY_CSRS = 2

    def __init__(self, pads, sys_clk_freq=100e6):
        super().__init__(name="phy")

        self.pads = pads
        self._sys_clk_freq = sys_clk_freq

        databits = len(self.pads.dq.io)
        assert databits % 8 == 0, "DQ width must be a multiple of 8"
        nlanes = databits//8

        # CSR
        bank = self.csr_bank()

        self.burstdet = bank.csr(nlanes, "rw")

        # elaborate() indexes self.rdly with every byte-lane number, so there must be
        # one register per lane. At least two are always created so that the register
        # map of narrower configurations stays what it has always been.
        self.rdly = []
        for lane in range(max(self._MIN_RDLY_CSRS, nlanes)):
            self.rdly.append(bank.csr(3, "rw", name=f"rdly_p{lane}"))

        self._bridge = self.bridge(data_width=32, granularity=8, alignment=2)
        self.bus = self._bridge.bus

        addressbits = len(self.pads.a.o0)
        bankbits = len(self.pads.ba.o0)
        nranks = 1 if not hasattr(self.pads, "cs") else len(self.pads.cs.o0)
        self.dfi = Interface(addressbits, bankbits, nranks, 4*databits, 4)

        # PHY settings -----------------------------------------------------------------------------
        tck = 1/(2*self._sys_clk_freq)
        nphases = 2
        cl, cwl = get_cl_cw("DDR3", tck)
        cl_sys_latency = get_sys_latency(nphases, cl)
        cwl_sys_latency = get_sys_latency(nphases, cwl)
        rdcmdphase, rdphase = get_sys_phases(nphases, cl_sys_latency, cl)
        wrcmdphase, wrphase = get_sys_phases(nphases, cwl_sys_latency, cwl)
        self.settings = PhySettings(
            phytype="ECP5DDRPHY",
            memtype="DDR3",
            databits=databits,
            dfi_databits=4*databits,
            nranks=nranks,
            nphases=nphases,
            rdphase=rdphase,
            wrphase=wrphase,
            rdcmdphase=rdcmdphase,
            wrcmdphase=wrcmdphase,
            cl=cl,
            cwl=cwl,
            read_latency=2 + cl_sys_latency + 2 + log2_int(4//nphases) + 4,
            write_latency=cwl_sys_latency
        )

    def elaborate(self, platform):
        """Build the PHY: CSR bridge, init sequencer, pad serializers and DFI glue."""
        m = Module()

        m.submodules.bridge = self._bridge

        tck = 1/(2*self._sys_clk_freq)
        nphases = 2
        databits = len(self.pads.dq.io)

        burstdet_reg = Signal(databits//8, reset_less=True)
        m.d.comb += self.burstdet.r_data.eq(burstdet_reg)

        # Burstdet clear. This statement must stay ahead of the per-lane "set" statements
        # added in the DQ loop: later assignments win, so a burst detected in the same
        # cycle as a clear is not lost.
        with m.If(self.burstdet.w_stb):
            m.d.sync += burstdet_reg.eq(0)

        # Init -------------------------------------------------------------------------------------
        m.submodules.init = init = ECP5DDRPHYInit()

        # Parameters -------------------------------------------------------------------------------
        cl, cwl = get_cl_cw("DDR3", tck)
        cl_sys_latency = get_sys_latency(nphases, cl)
        cwl_sys_latency = get_sys_latency(nphases, cwl)

        # DFI Interface ----------------------------------------------------------------------------
        dfi = self.dfi

        # Selects which half of a BL8 write burst is sent to the output serializers.
        bl8_chunk = Signal()

        # Clock --------------------------------------------------------------------------------
        m.d.comb += [
            self.pads.clk.o_clk.eq(ClockSignal("dramsync")),
            self.pads.clk.o_fclk.eq(ClockSignal("sync2x")),
        ]
        for i in range(len(self.pads.clk.o0)):
            # Constant 0/1/0/1 pattern on the four serializer inputs.
            m.d.comb += [
                self.pads.clk.o0[i].eq(0),
                self.pads.clk.o1[i].eq(1),
                self.pads.clk.o2[i].eq(0),
                self.pads.clk.o3[i].eq(1),
            ]

        # Addresses and Commands ---------------------------------------------------------------
        m.d.comb += [
            self.pads.a.o_clk.eq(ClockSignal("dramsync")),
            self.pads.a.o_fclk.eq(ClockSignal("sync2x")),
            self.pads.ba.o_clk.eq(ClockSignal("dramsync")),
            self.pads.ba.o_fclk.eq(ClockSignal("sync2x")),
        ]
        # Each DFI phase is presented on two consecutive serializer inputs.
        for i in range(len(self.pads.a.o0)):
            m.d.comb += [
                self.pads.a.o0[i].eq(dfi.phases[0].address[i]),
                self.pads.a.o1[i].eq(dfi.phases[0].address[i]),
                self.pads.a.o2[i].eq(dfi.phases[1].address[i]),
                self.pads.a.o3[i].eq(dfi.phases[1].address[i]),
            ]
        for i in range(len(self.pads.ba.o0)):
            m.d.comb += [
                self.pads.ba.o0[i].eq(dfi.phases[0].bank[i]),
                self.pads.ba.o1[i].eq(dfi.phases[0].bank[i]),
                self.pads.ba.o2[i].eq(dfi.phases[1].bank[i]),
                self.pads.ba.o3[i].eq(dfi.phases[1].bank[i]),
            ]

        # Control pins
        controls = ["ras", "cas", "we", "clk_en", "odt"]
        if hasattr(self.pads, "reset"):
            controls.append("reset")
        if hasattr(self.pads, "cs"):
            controls.append("cs")
        for name in controls:
            pad = getattr(self.pads, name)
            m.d.comb += [
                pad.o_clk.eq(ClockSignal("dramsync")),
                pad.o_fclk.eq(ClockSignal("sync2x")),
            ]
            for i in range(len(pad.o0)):
                m.d.comb += [
                    pad.o0[i].eq(getattr(dfi.phases[0], name)[i]),
                    pad.o1[i].eq(getattr(dfi.phases[0], name)[i]),
                    pad.o2[i].eq(getattr(dfi.phases[1], name)[i]),
                    pad.o3[i].eq(getattr(dfi.phases[1], name)[i]),
                ]

        # DQ ---------------------------------------------------------------------------------------
        dq_oe = Signal()
        dqs_re = Signal()
        dqs_oe = Signal()
        dqs_postamble = Signal()
        dqs_preamble = Signal()

        # DATAVALID (and its one-cycle delayed copy) of every byte lane, in lane order.
        lane_datavalid = []
        lane_datavalid_prev = []

        for i in range(databits//8):
            # DQSBUFM
            dqs_i = Signal()
            dqsr90 = Signal()
            dqsw270 = Signal()
            dqsw = Signal()
            rdpntr = Signal(3)
            wrpntr = Signal(3)
            burstdet = Signal()
            datavalid = Signal()
            datavalid_prev = Signal()
            m.d.sync += datavalid_prev.eq(datavalid)
            lane_datavalid.append(datavalid)
            lane_datavalid_prev.append(datavalid_prev)

            m.submodules += Instance("DQSBUFM",
                p_DQS_LI_DEL_ADJ="MINUS",
                p_DQS_LI_DEL_VAL=1,
                p_DQS_LO_DEL_ADJ="MINUS",
                p_DQS_LO_DEL_VAL=4,

                # Delay
                i_DYNDELAY0=0,
                i_DYNDELAY1=0,
                i_DYNDELAY2=0,
                i_DYNDELAY3=0,
                i_DYNDELAY4=0,
                i_DYNDELAY5=0,
                i_DYNDELAY6=0,
                i_DYNDELAY7=0,

                # Clocks / Reset
                i_SCLK=ClockSignal("sync"),
                i_ECLK=ClockSignal("sync2x"),
                i_RST=ResetSignal("dramsync"),
                i_DDRDEL=init.delay,
                # Paused by the init sequencer and whenever this lane's rdly CSR is written.
                i_PAUSE=init.pause | self.rdly[i].w_stb,

                # Control
                # Assert LOADNs to use DDRDEL control
                i_RDLOADN=0,
                i_RDMOVE=0,
                i_RDDIRECTION=1,
                i_WRLOADN=0,
                i_WRMOVE=0,
                i_WRDIRECTION=1,

                # Reads (generate shifted DQS clock for reads)
                i_READ0=dqs_re,
                i_READ1=dqs_re,
                i_READCLKSEL0=self.rdly[i].w_data[0],
                i_READCLKSEL1=self.rdly[i].w_data[1],
                i_READCLKSEL2=self.rdly[i].w_data[2],
                i_DQSI=dqs_i,
                o_DQSR90=dqsr90,
                o_RDPNTR0=rdpntr[0],
                o_RDPNTR1=rdpntr[1],
                o_RDPNTR2=rdpntr[2],
                o_WRPNTR0=wrpntr[0],
                o_WRPNTR1=wrpntr[1],
                o_WRPNTR2=wrpntr[2],
                o_BURSTDET=burstdet,
                o_DATAVALID=datavalid,

                # Writes (generate shifted ECLK clock for writes)
                o_DQSW270=dqsw270,
                o_DQSW=dqsw)

            # Sticky burst-detect flag for this lane (cleared through the burstdet CSR).
            with m.If(burstdet):
                m.d.sync += burstdet_reg[i].eq(1)

            # DQS and DM ---------------------------------------------------------------------------
            dm_o_data = Signal(8)
            dm_o_data_d = Signal(8)
            dm_o_data_muxed = Signal(4)
            m.d.comb += dm_o_data.eq(Cat(
                dfi.phases[0].wrdata_mask[0*databits//8+i],
                dfi.phases[0].wrdata_mask[1*databits//8+i],
                dfi.phases[0].wrdata_mask[2*databits//8+i],
                dfi.phases[0].wrdata_mask[3*databits//8+i],

                dfi.phases[1].wrdata_mask[0*databits//8+i],
                dfi.phases[1].wrdata_mask[1*databits//8+i],
                dfi.phases[1].wrdata_mask[2*databits//8+i],
                dfi.phases[1].wrdata_mask[3*databits//8+i]),
            )
            m.d.sync += dm_o_data_d.eq(dm_o_data)

            # First half of the burst comes straight from the DFI, second half from
            # the copy registered one cycle earlier.
            with m.If(bl8_chunk):
                m.d.sync += dm_o_data_muxed.eq(dm_o_data_d[4:])
            with m.Else():
                m.d.sync += dm_o_data_muxed.eq(dm_o_data[:4])

            # NOTE(review): SCLK is taken from "dramsync" here whereas the DQ
            # ODDRX2DQA instances further down use the default domain - confirm
            # this difference is intentional.
            m.submodules += Instance("ODDRX2DQA",
                i_RST=ResetSignal("dramsync"),
                i_ECLK=ClockSignal("sync2x"),
                i_SCLK=ClockSignal("dramsync"),
                i_DQSW270=dqsw270,
                i_D0=dm_o_data_muxed[0],
                i_D1=dm_o_data_muxed[1],
                i_D2=dm_o_data_muxed[2],
                i_D3=dm_o_data_muxed[3],
                o_Q=self.pads.dm.o[i])

            dqs = Signal()
            dqs_oe_n = Signal()
            m.submodules += [
                Instance("ODDRX2DQSB",
                    i_RST=ResetSignal("dramsync"),
                    i_ECLK=ClockSignal("sync2x"),
                    i_SCLK=ClockSignal(),
                    i_DQSW=dqsw,
                    i_D0=0,
                    i_D1=1,
                    i_D2=0,
                    i_D3=1,
                    o_Q=dqs),
                Instance("TSHX2DQSA",
                    i_RST=ResetSignal("dramsync"),
                    i_ECLK=ClockSignal("sync2x"),
                    i_SCLK=ClockSignal(),
                    i_DQSW=dqsw,
                    i_T0=~(dqs_oe | dqs_postamble),
                    i_T1=~(dqs_oe | dqs_preamble),
                    o_Q=dqs_oe_n),
                Instance("BB",
                    i_I=dqs,
                    i_T=dqs_oe_n,
                    o_O=dqs_i,
                    io_B=self.pads.dqs.p[i]),
            ]

            for j in range(8*i, 8*(i+1)):
                dq_o = Signal()
                dq_i = Signal()
                dq_oe_n = Signal()
                dq_i_delayed = Signal()
                dq_i_data = Signal(4)
                dq_o_data = Signal(8)
                dq_o_data_d = Signal(8)
                dq_o_data_muxed = Signal(4)
                m.d.comb += dq_o_data.eq(Cat(
                    dfi.phases[0].wrdata[0*databits+j],
                    dfi.phases[0].wrdata[1*databits+j],
                    dfi.phases[0].wrdata[2*databits+j],
                    dfi.phases[0].wrdata[3*databits+j],
                    dfi.phases[1].wrdata[0*databits+j],
                    dfi.phases[1].wrdata[1*databits+j],
                    dfi.phases[1].wrdata[2*databits+j],
                    dfi.phases[1].wrdata[3*databits+j])
                )

                m.d.sync += dq_o_data_d.eq(dq_o_data)
                with m.If(bl8_chunk):
                    m.d.sync += dq_o_data_muxed.eq(dq_o_data_d[4:])
                with m.Else():
                    m.d.sync += dq_o_data_muxed.eq(dq_o_data[:4])

                m.submodules += [
                    Instance("ODDRX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSW270=dqsw270,
                        i_D0=dq_o_data_muxed[0],
                        i_D1=dq_o_data_muxed[1],
                        i_D2=dq_o_data_muxed[2],
                        i_D3=dq_o_data_muxed[3],
                        o_Q=dq_o),
                    Instance("DELAYF",
                        p_DEL_MODE="DQS_ALIGNED_X2",
                        i_LOADN=1,
                        i_MOVE=0,
                        i_DIRECTION=0,
                        i_A=dq_i,
                        o_Z=dq_i_delayed),
                    Instance("IDDRX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSR90=dqsr90,
                        i_RDPNTR0=rdpntr[0],
                        i_RDPNTR1=rdpntr[1],
                        i_RDPNTR2=rdpntr[2],
                        i_WRPNTR0=wrpntr[0],
                        i_WRPNTR1=wrpntr[1],
                        i_WRPNTR2=wrpntr[2],
                        i_D=dq_i_delayed,
                        o_Q0=dq_i_data[0],
                        o_Q1=dq_i_data[1],
                        o_Q2=dq_i_data[2],
                        o_Q3=dq_i_data[3]),
                ]
                m.submodules += [
                    Instance("TSHX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSW270=dqsw270,
                        i_T0=~dq_oe,
                        i_T1=~dq_oe,
                        o_Q=dq_oe_n),
                    Instance("BB",
                        i_I=dq_o,
                        i_T=dq_oe_n,
                        o_O=dq_i,
                        io_B=self.pads.dq.io[j])
                ]

                # Sample read data based on this lane's DATAVALID: the cycle in which it
                # rises fills DFI phase 0, every following cycle in which it is still
                # high fills DFI phase 1.
                with m.If(~datavalid_prev & datavalid):
                    m.d.sync += [
                        dfi.phases[0].rddata[0*databits+j].eq(dq_i_data[0]),
                        dfi.phases[0].rddata[1*databits+j].eq(dq_i_data[1]),
                        dfi.phases[0].rddata[2*databits+j].eq(dq_i_data[2]),
                        dfi.phases[0].rddata[3*databits+j].eq(dq_i_data[3]),
                    ]
                with m.Elif(datavalid):
                    m.d.sync += [
                        dfi.phases[1].rddata[0*databits+j].eq(dq_i_data[0]),
                        dfi.phases[1].rddata[1*databits+j].eq(dq_i_data[1]),
                        dfi.phases[1].rddata[2*databits+j].eq(dq_i_data[2]),
                        dfi.phases[1].rddata[3*databits+j].eq(dq_i_data[3]),
                    ]

        # Read Control Path ------------------------------------------------------------------------
        # Creates a shift register of read commands coming from the DFI interface. This shift register
        # is used to control DQS read (internal read pulse of the DQSBUF).
        #
        # The DQS read must be asserted for 2 sys_clk cycles before the read data is coming back from
        # the DRAM (see 6.2.4 READ Pulse Positioning Optimization of FPGA-TN-02035-1.2)
        # NOTE(review): three consecutive taps of the shift register are ORed below.
        rddata_en = Signal(self.settings.read_latency)
        rddata_en_last = Signal.like(rddata_en)
        m.d.comb += rddata_en.eq(Cat(dfi.phases[self.settings.rdphase].rddata_en, rddata_en_last))
        m.d.sync += rddata_en_last.eq(rddata_en)
        m.d.comb += dqs_re.eq(rddata_en[cl_sys_latency + 0] | rddata_en[cl_sys_latency + 1] | rddata_en[cl_sys_latency + 2])

        # The read data valid flag is registered from the falling edge of DATAVALID. Only
        # the last byte lane is observed, and the same flag is reported on every DFI phase.
        rddata_valid = Signal()
        m.d.sync += rddata_valid.eq(lane_datavalid_prev[-1] & ~lane_datavalid[-1])
        for phase in dfi.phases:
            m.d.comb += phase.rddata_valid.eq(rddata_valid)

        # Write Control Path -----------------------------------------------------------------------
        # Creates a shift register of write commands coming from the DFI interface. This shift register
        # is used to control DQ/DQS tristates and to select write data of the DRAM burst from the DFI
        # interface: The PHY is operating in halfrate mode (so provide 4 datas every sys_clk cycles:
        # 2x for DDR, 2x for halfrate) but DDR3 requires a burst of 8 datas (BL8) for best efficiency.
        # Writes are then performed in 2 sys_clk cycles and data needs to be selected for each cycle.
        # FIXME: understand the extra stages (this note used to say "+2"; the register is
        # cwl_sys_latency + 4 bits wide and taps up to cwl_sys_latency + 3 are used below).
        wrdata_en = Signal(cwl_sys_latency + 4)
        wrdata_en_last = Signal.like(wrdata_en)
        m.d.comb += wrdata_en.eq(Cat(dfi.phases[self.settings.wrphase].wrdata_en, wrdata_en_last))
        m.d.sync += wrdata_en_last.eq(wrdata_en)
        m.d.comb += dq_oe.eq(wrdata_en[cwl_sys_latency + 1] | wrdata_en[cwl_sys_latency + 2])
        m.d.comb += bl8_chunk.eq(wrdata_en[cwl_sys_latency + 1])
        m.d.comb += dqs_oe.eq(dq_oe)

        # Write DQS Postamble/Preamble Control Path ------------------------------------------------
        # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
        # write. During writes, DQS tristate is configured as output for at least 4 sys_clk cycles:
        # 1 for Preamble, 2 for the Write and 1 for the Postamble.
        m.d.comb += dqs_preamble.eq(wrdata_en[cwl_sys_latency + 0] & ~wrdata_en[cwl_sys_latency + 1])
        m.d.comb += dqs_postamble.eq(wrdata_en[cwl_sys_latency + 3] & ~wrdata_en[cwl_sys_latency + 2])

        return m