Version 5 lineprogram header (#411)
authorSeva Alekseyev <sevaa@yarxi.ru>
Mon, 13 Jun 2022 12:44:44 +0000 (08:44 -0400)
committerGitHub <noreply@github.com>
Mon, 13 Jun 2022 12:44:44 +0000 (05:44 -0700)
* Version 5 lineprogram header, take 1

* Readelf/decodedline formatting fix

* DWARF 5 fields None, not missing

* Comment

* Sample binary

* Dump unit type in readelf info

* More languages described

* Describing form_line_strp

* Basic support for GNU_PROPERTY_X86_ISA_1

* Readelf decodedline format fixes to match with DWARF5

* Readelf test shorted out for the file/test where a bug in GNU readelf manifests, see PR #411.

* Newline :)

* Readelf's language names matched against binutils

* Comment about lineprogram files and directories

* GNU binutils bug worked around in a slightly less disturbing way - patched the binary, left a comment in the test script.

* Examples autotest no longer fails on Windows over expected path format

* Autotest fix

* Typo

* Windows compatibility, take 2

* No pathlib on Python 2

Co-authored-by: Seva Alekseyev <sevaa@nih.gov>
13 files changed:
elftools/common/py3compat.py
elftools/dwarf/descriptions.py
elftools/dwarf/dwarfinfo.py
elftools/dwarf/enums.py
elftools/dwarf/structs.py
elftools/elf/descriptions.py
elftools/elf/enums.py
examples/dwarf_die_tree.py
examples/dwarf_lineprogram_filenames.py
examples/examine_dwarf_info.py
scripts/readelf.py
test/run_readelf_tests.py
test/testfiles_for_readelf/dwarf_lineprogramv5.elf [new file with mode: 0644]

index 259926440a4340b81812f21f845d830567685f7f..c701cc0d4a19c3a6663a8b6a53dafda228a17e9a 100644 (file)
@@ -12,6 +12,7 @@ PY3 = sys.version_info[0] == 3
 
 if PY3:
     import io
+    from pathlib import Path
     StringIO = io.StringIO
     BytesIO = io.BytesIO
 
@@ -42,8 +43,15 @@ if PY3:
     ifilter = filter
 
     maxint = sys.maxsize
+
+    def path_to_posix(s):
+        return Path(s).as_posix()
+
 else:
     import cStringIO
+    import os
+    import posixpath
+
     StringIO = BytesIO = cStringIO.StringIO
 
     def bytes2hex(b, sep=''):
@@ -63,6 +71,9 @@ else:
 
     maxint = sys.maxint
 
+    def path_to_posix(s):
+        return posixpath.join(*os.path.split(s))
+
 
 def iterkeys(d):
     """Return an iterator over the keys of a dictionary."""
index ef6ac19fb32b51bde5fb64fa2fb9a762f624e661..f51f69b02ae3b5930f312be6e2e05477b4faa82a 100644 (file)
@@ -206,6 +206,10 @@ def _describe_attr_strp(attr, die, section_offset):
     return '(indirect string, offset: 0x%x): %s' % (
         attr.raw_value, bytes2str(attr.value))
 
+def _describe_attr_line_strp(attr, die, section_offset):
+    return '(indirect line string, offset: 0x%x): %s' % (
+        attr.raw_value, bytes2str(attr.value))        
+
 def _describe_attr_string(attr, die, section_offset):
     return bytes2str(attr.value)
 
@@ -247,6 +251,7 @@ _ATTR_DESCRIPTION_MAP = defaultdict(
     DW_FORM_udata=_describe_attr_value_passthrough,
     DW_FORM_string=_describe_attr_string,
     DW_FORM_strp=_describe_attr_strp,
+    DW_FORM_line_strp=_describe_attr_line_strp,
     DW_FORM_block1=_describe_attr_block,
     DW_FORM_block2=_describe_attr_block,
     DW_FORM_block4=_describe_attr_block,
@@ -312,12 +317,29 @@ _DESCR_DW_LANG = {
     DW_LANG_UPC: '(Unified Parallel C)',
     DW_LANG_D: '(D)',
     DW_LANG_Python: '(Python)',
+    DW_LANG_OpenCL: '(OpenCL)',
+    DW_LANG_Go: '(Go)',
+    DW_LANG_Modula3: '(Modula 3)',
+    DW_LANG_Haskell: '(Haskell)',
+    DW_LANG_C_plus_plus_03: '(C++03)',
+    DW_LANG_C_plus_plus_11: '(C++11)',
+    DW_LANG_OCaml: '(OCaml)',
+    DW_LANG_Rust: '(Rust)',
+    DW_LANG_C11: '(C11)',
+    DW_LANG_Swift: '(Swift)',
+    DW_LANG_Julia: '(Julia)',
+    DW_LANG_Dylan: '(Dylan)',
+    DW_LANG_C_plus_plus_14: '(C++14)',
+    DW_LANG_Fortran03: '(Fortran 03)',
+    DW_LANG_Fortran08: '(Fortran 08)',
+    DW_LANG_RenderScript: '(RenderScript)',
+    DW_LANG_BLISS: '(Bliss)', # Not in binutils
     DW_LANG_Mips_Assembler: '(MIPS assembler)',
     DW_LANG_HP_Bliss: '(HP Bliss)',
     DW_LANG_HP_Basic91: '(HP Basic 91)',
     DW_LANG_HP_Pascal91: '(HP Pascal 91)',
     DW_LANG_HP_IMacro: '(HP IMacro)',
-    DW_LANG_HP_Assembler: '(HP assembler)',
+    DW_LANG_HP_Assembler: '(HP assembler)'
 }
 
 _DESCR_DW_ATE = {
index a300e7b9a07476c6096e9c50240536ce2fce2355..69d7b91b32888967129351989e055e39dc05d431 100644 (file)
@@ -243,6 +243,14 @@ class DWARFInfo(object):
         """ Given a CU object, fetch the line program it points to from the
             .debug_line section.
             If the CU doesn't point to a line program, return None.
+            
+            Note about directory and file names. They are returned as two collections
+            in the lineprogram object's header - include_directory and file_entry.
+
+            In DWARFv5, they have introduced a different, extensible format for those
+            collections. So in a lineprogram v5+, there are two more collections in
+            the header - directories and file_names. Those might contain extra DWARFv5
+            information that is not exposed in include_directory and file_entry.
         """
         # The line program is pointed to by the DW_AT_stmt_list attribute of
         # the top DIE of a CU.
@@ -456,6 +464,34 @@ class DWARFInfo(object):
             self.debug_line_sec.stream,
             debug_line_offset)
 
+        # DWARF5: resolve names
+        def resolve_strings(self, lineprog_header, format_field, data_field):
+            if lineprog_header.get(format_field, False):
+                data = lineprog_header[data_field]
+                for field in lineprog_header[format_field]:
+                    def replace_value(data, content_type, replacer):
+                        for entry in data:
+                            entry[content_type] = replacer(entry[content_type])
+
+                    if field.form == 'DW_FORM_line_strp':
+                        replace_value(data, field.content_type, self.get_string_from_linetable)
+                    elif field.form == 'DW_FORM_strp':
+                        replace_value(data, field.content_type, self.get_string_from_table)
+                    elif field.form in ('DW_FORM_strp_sup', 'DW_FORM_strx', 'DW_FORM_strx1', 'DW_FORM_strx2', 'DW_FORM_strx3', 'DW_FORM_strx4'):
+                        raise NotImplementedError()
+
+        resolve_strings(self, lineprog_header, 'directory_entry_format', 'directories')
+        resolve_strings(self, lineprog_header, 'file_name_entry_format', 'file_names')
+
+        # DWARF5: provide compatible file/directory name arrays for legacy lineprogram consumers
+        if lineprog_header.get('directories', False):
+            lineprog_header.include_directory = tuple(d.DW_LNCT_path for d in lineprog_header.directories)
+        if lineprog_header.get('file_names', False):
+            translate = namedtuple("file_entry", "name dir_index mtime length")
+            lineprog_header.file_entry = tuple(
+                translate(e.get('DW_LNCT_path'), e.get('DW_LNCT_directory_index'), e.get('DW_LNCT_timestamp'), e.get('DW_LNCT_size'))
+                for e in lineprog_header.file_names)
+        
         # Calculate the offset to the next line program (see DWARF 6.2.4)
         end_offset = (  debug_line_offset + lineprog_header['unit_length'] +
                         structs.initial_length_field_size())
index bfeca5907f84889cafaf7f7ac16700f3d6dbee2c..a52e8034413350d07aa402145ac1aa668dcd3f9f 100644 (file)
@@ -394,3 +394,24 @@ DW_EH_encoding_flags = dict(
 
     DW_EH_PE_omit     = 0xff,
 )
+
+ENUM_DW_LNCT = dict(
+    DW_LNCT_path             = 0x1,
+    DW_LNCT_directory_index  = 0x2,
+    DW_LNCT_timestamp        = 0x3,
+    DW_LNCT_size             = 0x4,
+    DW_LNCT_MD5              = 0x5,
+    DW_LNCT_lo_user          = 0x2000,
+    DW_LNCT_hi_user          = 0x3fff
+)
+
+ENUM_DW_UT = dict(
+    DW_UT_compile       = 0x01,
+    DW_UT_type          = 0x02,
+    DW_UT_partial       = 0x03,
+    DW_UT_skeleton      = 0x04,
+    DW_UT_split_compile = 0x05,
+    DW_UT_split_type    = 0x06,
+    DW_UT_lo_user       = 0x80,
+    DW_UT_hi_user       = 0xff
+)
index 79e0d8f230ac10648b6d2161eae3bd0bb31cce8e..46cd81777d15c19951246bdc674c3dd6d9a9dae9 100644 (file)
@@ -7,11 +7,13 @@
 # Eli Bendersky (eliben@gmail.com)
 # This code is in the public domain
 #-------------------------------------------------------------------------------
+from elftools.construct.core import Subconstruct
+from elftools.construct.macros import Embedded
 from ..construct import (
     UBInt8, UBInt16, UBInt32, UBInt64, ULInt8, ULInt16, ULInt32, ULInt64,
     SBInt8, SBInt16, SBInt32, SBInt64, SLInt8, SLInt16, SLInt32, SLInt64,
     Adapter, Struct, ConstructError, If, Enum, Array, PrefixedArray,
-    CString, Embed, StaticField, IfThenElse
+    CString, Embed, StaticField, IfThenElse, Construct, Rename, Sequence
     )
 from ..common.construct_utils import RepeatUntilExcluding, ULEB128, SLEB128
 from .enums import *
@@ -282,27 +284,80 @@ class DWARFStructs(object):
                     self.Dwarf_uleb128('mtime'),
                     self.Dwarf_uleb128('length')))))
 
+        class FormattedEntry(Construct):
+            # Generates a parser based on a previously parsed piece,
+            # similar to deprecated Dynamic.
+            # Strings are resolved later, since it potentially requires
+            # looking at another section.
+            def __init__(self, name, structs, format_field):
+                Construct.__init__(self, name)
+                self.structs = structs
+                self.format_field = format_field
+
+            def _parse(self, stream, context):
+                # Somewhat tricky technique here, explicitly writing back to the context
+                if self.format_field + "_parser" in context:
+                    parser = context[self.format_field + "_parser"]
+                else:
+                    fields = tuple(
+                        Rename(f.content_type, self.structs.Dwarf_dw_form[f.form])
+                        for f in context[self.format_field])
+                    parser = Struct('formatted_entry', *fields)
+                    context[self.format_field + "_parser"] = parser
+                return parser._parse(stream, context)
+        ver5 = lambda ctx: ctx.version >= 5
+        
         self.Dwarf_lineprog_header = Struct('Dwarf_lineprog_header',
             self.Dwarf_initial_length('unit_length'),
             self.Dwarf_uint16('version'),
+            If(ver5,
+                self.Dwarf_uint8("address_size"),
+                None),
+            If(ver5,
+                self.Dwarf_uint8("segment_selector_size"),
+                None),
             self.Dwarf_offset('header_length'),
             self.Dwarf_uint8('minimum_instruction_length'),
-            If(lambda ctx: ctx['version'] >= 4,
+            If(lambda ctx: ctx.version >= 4,
                 self.Dwarf_uint8("maximum_operations_per_instruction"),
                 1),
             self.Dwarf_uint8('default_is_stmt'),
             self.Dwarf_int8('line_base'),
             self.Dwarf_uint8('line_range'),
             self.Dwarf_uint8('opcode_base'),
-            Array(lambda ctx: ctx['opcode_base'] - 1,
+            Array(lambda ctx: ctx.opcode_base - 1,
                   self.Dwarf_uint8('standard_opcode_lengths')),
-            RepeatUntilExcluding(
-                lambda obj, ctx: obj == b'',
-                CString('include_directory')),
-            RepeatUntilExcluding(
-                lambda obj, ctx: len(obj.name) == 0,
-                self.Dwarf_lineprog_file_entry),
-            )
+            If(ver5,
+                PrefixedArray(
+                    Struct('directory_entry_format',
+                        Enum(self.Dwarf_uleb128('content_type'), **ENUM_DW_LNCT),
+                        Enum(self.Dwarf_uleb128('form'), **ENUM_DW_FORM)),
+                    self.Dwarf_uint8("directory_entry_format_count"))),
+            If(ver5, # Name deliberately doesn't match the legacy object, since the format can't be made compatible
+                PrefixedArray(
+                    FormattedEntry('directories', self, "directory_entry_format"),
+                    self.Dwarf_uleb128('directories_count'))),
+            If(ver5,
+                PrefixedArray(
+                    Struct('file_name_entry_format',
+                        Enum(self.Dwarf_uleb128('content_type'), **ENUM_DW_LNCT),
+                        Enum(self.Dwarf_uleb128('form'), **ENUM_DW_FORM)),
+                    self.Dwarf_uint8("file_name_entry_format_count"))),
+            If(ver5,
+                PrefixedArray(
+                    FormattedEntry('file_names', self, "file_name_entry_format"),
+                    self.Dwarf_uleb128('file_names_count'))),
+            # Legacy  directories/files - DWARF < 5 only
+            If(lambda ctx: ctx.version < 5,
+                RepeatUntilExcluding(
+                    lambda obj, ctx: obj == b'',
+                    CString('include_directory'))),
+            If(lambda ctx: ctx.version < 5, 
+                RepeatUntilExcluding(
+                    lambda obj, ctx: len(obj.name) == 0,
+                    self.Dwarf_lineprog_file_entry)) # array name is file_entry 
+        )
 
     def _create_callframe_entry_headers(self):
         self.Dwarf_CIE_header = Struct('Dwarf_CIE_header',
index 0ccc9a19e6472ef708619b360aaed2fb856e000e..9314a62093b09f1daa93455566e5bae509fc2451 100644 (file)
@@ -259,6 +259,13 @@ def describe_note_gnu_property_x86_feature_1(value):
             descs.append(desc)
     return 'x86 feature: ' + ', '.join(descs)
 
+def describe_note_gnu_property_x86_isa_1(value):
+    descs = []
+    for mask, desc in _DESCR_NOTE_GNU_PROPERTY_X86_ISA_1_FLAGS:
+        if value & mask:
+            descs.append(desc)
+    return 'x86 ISA needed: ' + ', '.join(descs)    
+
 def describe_note_gnu_properties(properties):
     descriptions = []
     for prop in properties:
@@ -278,6 +285,11 @@ def describe_note_gnu_properties(properties):
                 prop_desc = ' <corrupt length: 0x%x>' % sz
             else:
                 prop_desc = describe_note_gnu_property_x86_feature_1(d)
+        elif t == 'GNU_PROPERTY_X86_ISA_1_NEEDED':
+            if sz != 4:
+                prop_desc = ' <corrupt length: 0x%x>' % sz
+            else:
+                prop_desc = describe_note_gnu_property_x86_isa_1(d)
         elif _DESCR_NOTE_GNU_PROPERTY_TYPE_LOPROC <= t <= _DESCR_NOTE_GNU_PROPERTY_TYPE_HIPROC:
             prop_desc = '<processor-specific type 0x%x data: %s >' % (t, bytes2hex(d, sep=' '))
         elif _DESCR_NOTE_GNU_PROPERTY_TYPE_LOUSER <= t <= _DESCR_NOTE_GNU_PROPERTY_TYPE_HIUSER:
@@ -603,6 +615,12 @@ _DESCR_NOTE_GNU_PROPERTY_X86_FEATURE_1_FLAGS = (
     (8, 'LAM_U57'),
 )
 
+# Same for GNU_PROPERTY_X86_ISA_1_xxx
+_DESCR_NOTE_GNU_PROPERTY_X86_ISA_1_FLAGS = (
+    (1, 'x86-64-baseline'),
+    # TODO: there is a long list
+)
+
 
 def _reverse_dict(d, low_priority=()):
     """
index 8519f4e63e2a546c0dfdef850f40fe7f9d31c9ab..a5855c2677e7c8eef23148b68400ba4f61488c72 100644 (file)
@@ -877,6 +877,7 @@ ENUM_NOTE_GNU_PROPERTY_TYPE = dict(
     GNU_PROPERTY_STACK_SIZE=1,
     GNU_PROPERTY_NO_COPY_ON_PROTECTED=2,
     GNU_PROPERTY_X86_FEATURE_1_AND=0xc0000002,
+    GNU_PROPERTY_X86_ISA_1_NEEDED=0xc0008002,
     _default_=Pass,
 )
 
index 0e46d9302771b7bfb38169a26de49fcfc9aa7104..afda68cca7456da2eacc9be074a8f3cc33edc5be 100644 (file)
@@ -15,6 +15,7 @@ import sys
 sys.path[0:0] = ['.', '..']
 
 from elftools.elf.elffile import ELFFile
+from elftools.common.py3compat import path_to_posix
 
 
 def process_file(filename):
@@ -44,7 +45,7 @@ def process_file(filename):
             print('    Top DIE with tag=%s' % top_DIE.tag)
 
             # We're interested in the filename...
-            print('    name=%s' % top_DIE.get_full_path())
+            print('    name=%s' % path_to_posix(top_DIE.get_full_path()))
 
             # Display DIEs recursively starting with top_DIE
             die_info_rec(top_DIE)
index 2dd0e70496cda3e231c9f117c63573ffce3faadf..6098f45ee45fc973478a30675297b1eeb427025e 100644 (file)
@@ -12,6 +12,7 @@ from __future__ import print_function
 from collections import defaultdict
 import os
 import sys
+import posixpath
 
 # If pyelftools is not installed, the example can also run from the root or
 # examples/ dir of the source distribution.
@@ -86,7 +87,7 @@ def lpe_filename(line_program, file_index):
         return file_entry.name.decode()
 
     directory = lp_header["include_directory"][dir_index - 1]
-    return os.path.join(directory, file_entry.name).decode()
+    return posixpath.join(directory, file_entry.name).decode()
 
 
 if __name__ == '__main__':
index bb43bcc2f760d02aacfbd63ac45673a7f59ab42e..fdf10dabb34838255b9a39f3b49616b443a92e3f 100644 (file)
@@ -14,6 +14,7 @@ import sys
 sys.path[0:0] = ['.', '..']
 
 from elftools.elf.elffile import ELFFile
+from elftools.common.py3compat import path_to_posix
 
 
 def process_file(filename):
@@ -43,7 +44,7 @@ def process_file(filename):
             print('    Top DIE with tag=%s' % top_DIE.tag)
 
             # We're interested in the filename...
-            print('    name=%s' % top_DIE.get_full_path())
+            print('    name=%s' % path_to_posix(top_DIE.get_full_path()))
 
 if __name__ == '__main__':
     if sys.argv[1] == '--test':
index 9b98d0237a0c329b375121e712632c9d9429cd06..a45ec3019556d8533bb5515cfa0e359d5db3e689 100755 (executable)
@@ -65,6 +65,7 @@ from elftools.dwarf.constants import (
 from elftools.dwarf.locationlists import LocationParser, LocationEntry
 from elftools.dwarf.callframe import CIE, FDE, ZERO
 from elftools.ehabi.ehabiinfo import CorruptEHABIEntry, CannotUnwindEHABIEntry, GenericEHABIEntry
+from elftools.dwarf.enums import ENUM_DW_UT
 
 
 class ReadElf(object):
@@ -1061,7 +1062,10 @@ class ReadElf(object):
             self._emitline('   Length:        %s (%s)' % (
                 self._format_hex(cu['unit_length']),
                 '%s-bit' % cu.dwarf_format()))
-            self._emitline('   Version:       %s' % cu['version']),
+            self._emitline('   Version:       %s' % cu['version'])
+            if cu.header.get("unit_type", False):
+                ut = next((key for key, value in ENUM_DW_UT.items() if value == cu.header.unit_type), '?')
+                self._emitline('   Unit Type:     %s (%d)' % (ut, cu.header.unit_type))
             self._emitline('   Abbrev Offset: %s' % (
                 self._format_hex(cu['debug_abbrev_offset']))),
             self._emitline('   Pointer Size:  %s' % cu['address_size'])
@@ -1121,6 +1125,7 @@ class ReadElf(object):
 
         for cu in self._dwarfinfo.iter_CUs():
             lineprogram = self._dwarfinfo.line_program_for_CU(cu)
+            ver5 = lineprogram.header.version >= 5
 
             cu_filename = bytes2str(lineprogram['file_entry'][0].name)
             if len(lineprogram['include_directory']) > 0:
@@ -1132,7 +1137,9 @@ class ReadElf(object):
                 cu_filename = '%s/%s' % (bytes2str(dir), cu_filename)
 
             self._emitline('CU: %s:' % cu_filename)
-            self._emitline('File name                            Line number    Starting address    Stmt')
+            self._emitline('File name                            Line number    Starting address    View    Stmt' if ver5
+                else 'File name                            Line number    Starting address    Stmt')
+            # What goes into View on V5? To be seen...
 
             # Print each state's file, line and address information. For some
             # instructions other output is needed to be compatible with
@@ -1161,11 +1168,14 @@ class ReadElf(object):
                         '0' if state.address == 0 else self._format_hex(state.address),
                         'x' if state.is_stmt and not state.end_sequence else ''))
                 else:
-                    self._emitline('%-35s  %11d  %18s[%d] %s' % (
+                    # What's the deal with op_index after address on DWARF 5? Is omitting it
+                    # a function of DWARF version, or ISA, or what?
+                    # Used to be unconditional, even on non-VLIW machines.
+                    self._emitline('%-35s  %s  %18s%s %s' % (
                         bytes2str(lineprogram['file_entry'][state.file - 1].name),
-                        state.line if not state.end_sequence else '-',
+                        "%11d" % (state.line,) if not state.end_sequence else '-',
                         '0' if state.address == 0 else self._format_hex(state.address),
-                        state.op_index,
+                        '' if ver5 else '[%d]' % (state.op_index,),
                         'x' if state.is_stmt and not state.end_sequence else ''))
                 if entry.command == DW_LNS_copy:
                     # Another readelf oddity...
index 59a039c5e214d25fb5b98cd5e1819b26d4cfd426..ceca7ec06c7809f4d4a3eb3a7a7cd679c22006df 100755 (executable)
@@ -80,6 +80,12 @@ def run_test_on_file(filename, verbose=False, opt=None):
                 testlog.info('.......................SKIPPED')
             continue
 
+        # sevaa says: there is another shorted out test; in dwarf_lineprogramv5.elf, the two bytes at 0x2072 were
+        # patched from 0x07 0x10 to 0x00 0x00.
+        # Those represented the second instruction in the first FDE in .eh_frame. This changed the instruction
+        # from "DW_CFA_undefined 16" to two NOPs.
+        # GNU readelf had a bug here, had to work around. See PR #411.
+
         # stdouts will be a 2-element list: output of readelf and output
         # of scripts/readelf.py
         stdouts = []
diff --git a/test/testfiles_for_readelf/dwarf_lineprogramv5.elf b/test/testfiles_for_readelf/dwarf_lineprogramv5.elf
new file mode 100644 (file)
index 0000000..33f051b
Binary files /dev/null and b/test/testfiles_for_readelf/dwarf_lineprogramv5.elf differ