22cd0eb7d741114eb78fae8c72dbc89a521e81b5
[pyelftools.git] / elftools / dwarf / dwarfinfo.py
1 #-------------------------------------------------------------------------------
2 # elftools: dwarf/dwarfinfo.py
3 #
4 # DWARFInfo - Main class for accessing DWARF debug information
5 #
6 # Eli Bendersky (eliben@gmail.com)
7 # This code is in the public domain
8 #-------------------------------------------------------------------------------
9 import os
10 from collections import namedtuple
11 from bisect import bisect_right
12
13 from ..construct.lib.container import Container
14 from ..common.exceptions import DWARFError
15 from ..common.utils import (struct_parse, dwarf_assert,
16 parse_cstring_from_stream)
17 from .structs import DWARFStructs
18 from .compileunit import CompileUnit
19 from .abbrevtable import AbbrevTable
20 from .lineprogram import LineProgram
21 from .callframe import CallFrameInfo
22 from .locationlists import LocationLists, LocationListsPair
23 from .ranges import RangeLists, RangeListsPair
24 from .aranges import ARanges
25 from .namelut import NameLUT
26 from .dwarf_util import _get_base_offset
27
28
# Descriptor for a single debug section handed to DWARFInfo.
#
# Fields:
#   stream:        stream object holding the data of this section
#   name:          name of the section in the container file
#   global_offset: offset of the section within its container file
#   size:          size of the section's data, in bytes
#   address:       virtual address of the section's data
#
# 'name' and 'global_offset' are purely descriptive and aren't strictly
# required for the DWARF parsing to work. 'address' is required to properly
# decode the special '.eh_frame' format.
#
DebugSectionDescriptor = namedtuple(
    'DebugSectionDescriptor',
    ['stream', 'name', 'global_offset', 'size', 'address'])
43
44
# Configuration parameters for the DWARF reader. Keeping them in their own
# object lets DWARFInfo stay independent of any specific file
# format/container.
#
# Fields:
#   little_endian:
#       boolean flag specifying whether the data in the file is little endian
#
#   machine_arch:
#       machine architecture as a string, for example 'x86' or 'x64'
#
#   default_address_size:
#       default address size for the container file (sizeof pointer, in bytes)
#
DwarfConfig = namedtuple(
    'DwarfConfig',
    ['little_endian', 'machine_arch', 'default_address_size'])
59
60
class DWARFInfo(object):
    """ Main class for accessing DWARF debug information.

        Acts also as a "context" to other major objects, bridging between
        various parts of the debug information.
    """
    def __init__(self,
            config,
            debug_info_sec,
            debug_aranges_sec,
            debug_abbrev_sec,
            debug_frame_sec,
            eh_frame_sec,
            debug_str_sec,
            debug_loc_sec,
            debug_ranges_sec,
            debug_line_sec,
            debug_pubtypes_sec,
            debug_pubnames_sec,
            debug_addr_sec,
            debug_str_offsets_sec,
            debug_line_str_sec,
            debug_loclists_sec,
            debug_rnglists_sec):
        """ config:
                A DwarfConfig object

            debug_*_sec, eh_frame_sec:
                DebugSectionDescriptor for a section. Pass None for sections
                that don't exist. These arguments are best given with
                keyword syntax.
        """
        self.config = config
        self.debug_info_sec = debug_info_sec
        self.debug_aranges_sec = debug_aranges_sec
        self.debug_abbrev_sec = debug_abbrev_sec
        self.debug_frame_sec = debug_frame_sec
        self.eh_frame_sec = eh_frame_sec
        self.debug_str_sec = debug_str_sec
        self.debug_loc_sec = debug_loc_sec
        self.debug_ranges_sec = debug_ranges_sec
        self.debug_line_sec = debug_line_sec
        self.debug_addr_sec = debug_addr_sec
        self.debug_str_offsets_sec = debug_str_offsets_sec
        self.debug_line_str_sec = debug_line_str_sec
        self.debug_pubtypes_sec = debug_pubtypes_sec
        self.debug_pubnames_sec = debug_pubnames_sec
        self.debug_loclists_sec = debug_loclists_sec
        self.debug_rnglists_sec = debug_rnglists_sec

        # This is the DWARFStructs the context uses, so it doesn't depend on
        # DWARF format and address_size (these are determined per CU) - set them
        # to default values.
        self.structs = DWARFStructs(
            little_endian=self.config.little_endian,
            dwarf_format=32,
            address_size=self.config.default_address_size)

        # Cache for abbrev tables: a dict keyed by offset
        self._abbrevtable_cache = {}

        # Cache of compile units and map of their offsets for bisect lookup.
        # Access with .iter_CUs(), .get_CU_containing(), and/or .get_CU_at().
        # The two lists are parallel: _cu_offsets_map[i] is the offset of
        # _cu_cache[i], and both are kept sorted by offset.
        self._cu_cache = []
        self._cu_offsets_map = []

    @property
    def has_debug_info(self):
        """ Return whether this contains debug information.

            It can be not the case when the ELF only contains .eh_frame, which is
            encoded DWARF but not actually for debugging.
        """
        return bool(self.debug_info_sec)

    def get_DIE_from_lut_entry(self, lut_entry):
        """ Get the DIE from the pubnames or pubtypes lookup table entry.

            lut_entry:
                A NameLUTEntry object from a NameLUT instance (see
                .get_pubnames and .get_pubtypes methods).
        """
        cu = self.get_CU_at(lut_entry.cu_ofs)
        return self.get_DIE_from_refaddr(lut_entry.die_ofs, cu)

    def get_DIE_from_refaddr(self, refaddr, cu=None):
        """ Given a .debug_info section offset of a DIE, return the DIE.

            refaddr:
                The refaddr may come from a DW_FORM_ref_addr attribute.

            cu:
                The compile unit object, if known.  If None a search
                from the closest offset less than refaddr will be performed.
        """
        if cu is None:
            cu = self.get_CU_containing(refaddr)
        return cu.get_DIE_from_refaddr(refaddr)

    def get_CU_containing(self, refaddr):
        """ Find the CU that includes the given reference address in the
            .debug_info section.

            refaddr:
                Either a refaddr of a DIE (possibly from a DW_FORM_ref_addr
                attribute) or the section offset of a CU (possibly from an
                aranges table).

            This function will parse and cache CUs until the search criteria
            is met, starting from the closest known offset less than or equal
            to the given address.

            Raises a DWARF error (via dwarf_assert) if there is no debug info
            section or refaddr is outside of it, and ValueError if no CU
            covering refaddr is found.
        """
        dwarf_assert(
            self.has_debug_info,
            'CU lookup but no debug info section')
        dwarf_assert(
            0 <= refaddr < self.debug_info_sec.size,
            "refaddr %s beyond .debug_info size" % refaddr)

        # The CU containing the DIE we desire will be to the right of the
        # DIE insert point.  If we have a CU address, then it will be a
        # match but the right insert minus one will still be the item.
        # The first CU starts at offset 0, so start there if cache is empty.
        i = bisect_right(self._cu_offsets_map, refaddr)
        start = self._cu_offsets_map[i - 1] if i > 0 else 0

        # parse CUs until we find one containing the desired address
        for cu in self._parse_CUs_iter(start):
            if cu.cu_offset <= refaddr < cu.cu_offset + cu.size:
                return cu

        raise ValueError("CU for reference address %s not found" % refaddr)

    def get_CU_at(self, offset):
        """ Given a CU header offset, return the parsed CU.

            offset:
                The offset may be from an accelerated access table such as
                the public names, public types, address range table, or
                prior use.

            This function will directly parse the CU (or fetch it from the
            CU cache) doing no validation of the offset beyond checking the
            size of the .debug_info section.
        """
        dwarf_assert(
            self.has_debug_info,
            'CU lookup but no debug info section')
        dwarf_assert(
            0 <= offset < self.debug_info_sec.size,
            "offset %s beyond .debug_info size" % offset)

        return self._cached_CU_at_offset(offset)

    def iter_CUs(self):
        """ Yield all the compile units (CompileUnit objects) in the debug info
        """
        return self._parse_CUs_iter()

    def get_abbrev_table(self, offset):
        """ Get an AbbrevTable from the given offset in the debug_abbrev
            section.

            The only verification done on the offset is that it's within the
            bounds of the section (if not, an exception is raised).
            It is the caller's responsibility to make sure the offset actually
            points to a valid abbreviation table.

            AbbrevTable objects are cached internally (two calls for the same
            offset will return the same object).
        """
        dwarf_assert(
            offset < self.debug_abbrev_sec.size,
            "Offset '0x%x' to abbrev table out of section bounds" % offset)
        if offset not in self._abbrevtable_cache:
            self._abbrevtable_cache[offset] = AbbrevTable(
                structs=self.structs,
                stream=self.debug_abbrev_sec.stream,
                offset=offset)
        return self._abbrevtable_cache[offset]

    def get_string_from_table(self, offset):
        """ Obtain a string from the string table section (debug_str), given
            an offset relative to the section.
        """
        return parse_cstring_from_stream(self.debug_str_sec.stream, offset)

    def get_string_from_linetable(self, offset):
        """ Obtain a string from the line string table section
            (debug_line_str), given an offset relative to the section.
        """
        return parse_cstring_from_stream(self.debug_line_str_sec.stream, offset)

    def line_program_for_CU(self, CU):
        """ Given a CU object, fetch the line program it points to from the
            .debug_line section.
            If the CU doesn't point to a line program, return None.

            Note about directory and file names. They are returned as two collections
            in the lineprogram object's header - include_directory and file_entry.

            In DWARFv5, they have introduced a different, extensible format for those
            collections. So in a lineprogram v5+, there are two more collections in
            the header - directories and file_names. Those might contain extra DWARFv5
            information that is not exposed in include_directory and file_entry.
        """
        # The line program is pointed to by the DW_AT_stmt_list attribute of
        # the top DIE of a CU.
        top_DIE = CU.get_top_DIE()
        if 'DW_AT_stmt_list' in top_DIE.attributes:
            return self._parse_line_program_at_offset(
                    top_DIE.attributes['DW_AT_stmt_list'].value, CU.structs)
        else:
            return None

    def has_CFI(self):
        """ Does this dwarf info have a dwarf_frame CFI section?
        """
        return self.debug_frame_sec is not None

    def CFI_entries(self):
        """ Get a list of dwarf_frame CFI entries from the .debug_frame section.
        """
        cfi = CallFrameInfo(
            stream=self.debug_frame_sec.stream,
            size=self.debug_frame_sec.size,
            address=self.debug_frame_sec.address,
            base_structs=self.structs)
        return cfi.get_entries()

    def has_EH_CFI(self):
        """ Does this dwarf info have a eh_frame CFI section?
        """
        return self.eh_frame_sec is not None

    def EH_CFI_entries(self):
        """ Get a list of eh_frame CFI entries from the .eh_frame section.
        """
        cfi = CallFrameInfo(
            stream=self.eh_frame_sec.stream,
            size=self.eh_frame_sec.size,
            address=self.eh_frame_sec.address,
            base_structs=self.structs,
            for_eh_frame=True)
        return cfi.get_entries()

    def get_pubtypes(self):
        """
        Returns a NameLUT object that contains information read from the
        .debug_pubtypes section in the ELF file, or None if the section
        doesn't exist.

        NameLUT is essentially a dictionary containing the CU/DIE offsets of
        each symbol. See the NameLUT doc string for more details.
        """

        if self.debug_pubtypes_sec:
            return NameLUT(self.debug_pubtypes_sec.stream,
                    self.debug_pubtypes_sec.size,
                    self.structs)
        else:
            return None

    def get_pubnames(self):
        """
        Returns a NameLUT object that contains information read from the
        .debug_pubnames section in the ELF file, or None if the section
        doesn't exist.

        NameLUT is essentially a dictionary containing the CU/DIE offsets of
        each symbol. See the NameLUT doc string for more details.
        """

        if self.debug_pubnames_sec:
            return NameLUT(self.debug_pubnames_sec.stream,
                    self.debug_pubnames_sec.size,
                    self.structs)
        else:
            return None

    def get_aranges(self):
        """ Get an ARanges object representing the .debug_aranges section of
            the DWARF data, or None if the section doesn't exist
        """
        if self.debug_aranges_sec:
            return ARanges(self.debug_aranges_sec.stream,
                    self.debug_aranges_sec.size,
                    self.structs)
        else:
            return None

    def location_lists(self):
        """ Get a LocationLists object representing the .debug_loc/debug_loclists section of
            the DWARF data, or None if this section doesn't exist.

            If both sections exist, it returns a LocationListsPair.
        """
        if self.debug_loclists_sec and self.debug_loc_sec is None:
            return LocationLists(self.debug_loclists_sec.stream, self.structs, 5, self)
        elif self.debug_loc_sec and self.debug_loclists_sec is None:
            return LocationLists(self.debug_loc_sec.stream, self.structs, 4, self)
        elif self.debug_loc_sec and self.debug_loclists_sec:
            # The pair takes the .debug_loc stream first and the
            # .debug_loclists stream second, mirroring RangeListsPair in
            # range_lists() below.
            return LocationListsPair(self.debug_loc_sec.stream, self.debug_loclists_sec.stream, self.structs, self)
        else:
            return None

    def range_lists(self):
        """ Get a RangeLists object representing the .debug_ranges/.debug_rnglists section of
            the DWARF data, or None if this section doesn't exist.

            If both sections exist, it returns a RangeListsPair.
        """
        if self.debug_rnglists_sec and self.debug_ranges_sec is None:
            return RangeLists(self.debug_rnglists_sec.stream, self.structs, 5, self)
        elif self.debug_ranges_sec and self.debug_rnglists_sec is None:
            return RangeLists(self.debug_ranges_sec.stream, self.structs, 4, self)
        elif self.debug_ranges_sec and self.debug_rnglists_sec:
            return RangeListsPair(self.debug_ranges_sec.stream, self.debug_rnglists_sec.stream, self.structs, self)
        else:
            return None

    def get_addr(self, cu, addr_index):
        """Provided a CU and an index, retrieves an address from the debug_addr section

            Raises DWARFError if there is no debug_addr section.
        """
        if not self.debug_addr_sec:
            raise DWARFError('The file does not contain a debug_addr section for indirect address access')
        # Selectors are not supported, but no assert on that. TODO?
        cu_addr_base = _get_base_offset(cu, 'DW_AT_addr_base')
        # Entries are address_size bytes each, laid out from the CU's base.
        return struct_parse(
            cu.structs.Dwarf_target_addr(''),
            self.debug_addr_sec.stream,
            cu_addr_base + addr_index*cu.header.address_size)

    #------ PRIVATE ------#

    def _parse_CUs_iter(self, offset=0):
        """ Iterate CU objects in order of appearance in the debug_info section.

            offset:
                The offset of the first CU to yield.  Additional iterations
                will return the sequential unit objects.

            See .iter_CUs(), .get_CU_containing(), and .get_CU_at().
        """
        if self.debug_info_sec is None:
            return

        while offset < self.debug_info_sec.size:
            cu = self._cached_CU_at_offset(offset)
            # Compute the offset of the next CU in the section. The unit_length
            # field of the CU header contains its size not including the length
            # field itself.
            offset = (  offset +
                        cu['unit_length'] +
                        cu.structs.initial_length_field_size())
            yield cu

    def _cached_CU_at_offset(self, offset):
        """ Return the CU with unit header at the given offset into the
            debug_info section from the cache.  If not present, the unit
            header is parsed and the object is installed in the cache.

            offset:
                The offset of the unit header in the .debug_info section
                of the unit to fetch from the cache.

            See get_CU_at().
        """
        # Find the insert point for the requested offset.  With bisect_right,
        # if this entry is present in the cache it will be the prior entry.
        i = bisect_right(self._cu_offsets_map, offset)
        if i >= 1 and offset == self._cu_offsets_map[i - 1]:
            return self._cu_cache[i - 1]

        # Parse the CU and insert the offset and object into the cache.
        # The ._cu_offsets_map[] contains just the numeric offsets for the
        # bisect_right search while the parallel indexed ._cu_cache[] holds
        # the object references.
        cu = self._parse_CU_at_offset(offset)
        self._cu_offsets_map.insert(i, offset)
        self._cu_cache.insert(i, cu)
        return cu

    def _parse_CU_at_offset(self, offset):
        """ Parse and return a CU at the given offset in the debug_info stream.
        """
        # Section 7.4 (32-bit and 64-bit DWARF Formats) of the DWARF spec v3
        # states that the first 32-bit word of the CU header determines
        # whether the CU is represented with 32-bit or 64-bit DWARF format.
        #
        # So we peek at the first word in the CU header to determine its
        # dwarf format. Based on it, we then create a new DWARFStructs
        # instance suitable for this CU and use it to parse the rest.
        #
        initial_length = struct_parse(
            self.structs.Dwarf_uint32(''), self.debug_info_sec.stream, offset)
        dwarf_format = 64 if initial_length == 0xFFFFFFFF else 32

        # Temporary structs for parsing the header
        # The structs for the rest of the CU depend on the header data.
        #
        cu_structs = DWARFStructs(
            little_endian=self.config.little_endian,
            dwarf_format=dwarf_format,
            address_size=4,
            dwarf_version=2)

        cu_header = struct_parse(
            cu_structs.Dwarf_CU_header, self.debug_info_sec.stream, offset)

        # structs for the rest of the CU, taking into account bitness and DWARF version
        cu_structs = DWARFStructs(
            little_endian=self.config.little_endian,
            dwarf_format=dwarf_format,
            address_size=cu_header['address_size'],
            dwarf_version=cu_header['version'])

        # The stream is positioned right after the header, i.e. at the
        # top DIE of the CU.
        cu_die_offset = self.debug_info_sec.stream.tell()
        dwarf_assert(
            self._is_supported_version(cu_header['version']),
            "Expected supported DWARF version. Got '%s'" % cu_header['version'])
        return CompileUnit(
                header=cu_header,
                dwarfinfo=self,
                structs=cu_structs,
                cu_offset=offset,
                cu_die_offset=cu_die_offset)

    def _is_supported_version(self, version):
        """ DWARF version supported by this parser
        """
        return 2 <= version <= 5

    def _parse_line_program_at_offset(self, debug_line_offset, structs):
        """ Given an offset to the .debug_line section, parse the line program
            starting at this offset in the section and return it.
            structs is the DWARFStructs object used to do this parsing.

            Raises NotImplementedError if a DWARFv5 directory/file name entry
            uses a string form that is not supported here.
        """
        lineprog_header = struct_parse(
            structs.Dwarf_lineprog_header,
            self.debug_line_sec.stream,
            debug_line_offset)

        # The program itself starts right after the header. Record the
        # position now, before any further work, so that it cannot be
        # disturbed by later stream accesses.
        program_start_offset = self.debug_line_sec.stream.tell()

        def replace_value(data, content_type, replacer):
            # Replace, in place, the content_type field of every entry.
            for entry in data:
                entry[content_type] = replacer(entry[content_type])

        # DWARF5: resolve names
        def resolve_strings(format_field, data_field):
            # Swap string-table offsets in header[data_field] for the
            # strings they refer to, driven by header[format_field].
            if lineprog_header.get(format_field, False):
                data = lineprog_header[data_field]
                for field in lineprog_header[format_field]:
                    if field.form == 'DW_FORM_line_strp':
                        replace_value(data, field.content_type, self.get_string_from_linetable)
                    elif field.form == 'DW_FORM_strp':
                        replace_value(data, field.content_type, self.get_string_from_table)
                    elif field.form in ('DW_FORM_strp_sup', 'DW_FORM_strx', 'DW_FORM_strx1', 'DW_FORM_strx2', 'DW_FORM_strx3', 'DW_FORM_strx4'):
                        raise NotImplementedError(
                            "Unsupported string form %s in line program header" % field.form)

        resolve_strings('directory_entry_format', 'directories')
        resolve_strings('file_name_entry_format', 'file_names')

        # DWARF5: provide compatible file/directory name arrays for legacy lineprogram consumers
        if lineprog_header.get('directories', False):
            lineprog_header.include_directory = tuple(d.DW_LNCT_path for d in lineprog_header.directories)
        if lineprog_header.get('file_names', False):
            lineprog_header.file_entry = tuple(
                Container(**{
                    'name': e.get('DW_LNCT_path'),
                    'dir_index': e.get('DW_LNCT_directory_index'),
                    'mtime': e.get('DW_LNCT_timestamp'),
                    'length': e.get('DW_LNCT_size')})
                for e in lineprog_header.file_names)

        # Calculate the offset to the next line program (see DWARF 6.2.4)
        end_offset = (  debug_line_offset + lineprog_header['unit_length'] +
                        structs.initial_length_field_size())

        return LineProgram(
            header=lineprog_header,
            stream=self.debug_line_sec.stream,
            structs=structs,
            program_start_offset=program_start_offset,
            program_end_offset=end_offset)
538