gen-order-file [plain text]

#!/usr/bin/env python

import bisect
import random
import os
import subprocess
import sys
import pprint
from optparse import OptionParser, OptionGroup

import commands

def check_output(*popen_args, **popen_kwargs):
    p = subprocess.Popen(stdout=subprocess.PIPE, *popen_args, **popen_kwargs)
    stdout,stderr = p.communicate()
    if p.wait() != 0:
        raise RuntimeError("process failed")
    return stdout

def uniq(list):
    seen = set()
    for item in list:
        if item not in seen:
            yield item
            seen.add(item)

def form_by_call_order(symbol_lists):
   # Simply strategy, just return symbols in order of occurrence, even across
   # multiple runs.
   return uniq(s
               for symbols in symbol_lists
               for s in symbols)

def form_by_call_order_fair(symbol_lists):
    # More complicated strategy that tries to respect the call order across all
    # of the test cases, instead of giving a huge preference to the first test
    # case.
 
    # First, uniq all the lists.
    uniq_lists = [list(uniq(symbols))
                  for symbols in symbol_lists]
 
    # Compute the successors for each list.
    succs = {}
    for symbols in uniq_lists:
        for a,b in zip(symbols[:-1], symbols[1:]):
            succs[a] = items = succs.get(a, [])
            if b not in items:
                items.append(b)
    
    # Emit all the symbols, but make sure to always emit all successors from any
    # call list whenever we see a symbol.
    #
    # There isn't much science here, but this sometimes works better than the
    # more naive strategy. Then again, sometimes it doesn't so more research is
    # probably needed.
    return uniq(s
                for symbols in symbol_lists
                for node in symbols
                for s in ([node] + succs.get(node,[])))
 
def form_by_frequency(symbol_lists):
    # Form the order file by just putting the most commonly occurring symbols
    # first. This assumes the data files didn't use the oneshot dtrace method.
 
    counts = {}
    for symbols in symbol_lists:
        for a in symbols:
            counts[a] = counts.get(a,0) + 1
   
    by_count = counts.items()
    by_count.sort(key = lambda (_,n): -n)
 
    return [s for s,n in by_count]
 
def form_by_random(symbol_lists):
    # Randomize the symbols.
    merged_symbols = uniq(s for symbols in symbol_lists
                          for s in symbols)
    random.shuffle(merged_symbols)
    return merged_symbols
 
def form_by_alphabetical(symbol_lists):
    # Alphabetize the symbols.
    merged_symbols = list(set(s for symbols in symbol_lists
                           for s in symbols))
    merged_symbols.sort()
    return merged_symbols
 
###

methods = dict((name[len("form_by_"):],value)
               for name,value in locals().items()
               if name.startswith("form_by_"))

def parse_dtrace_symbol_file(path, all_symbols, all_symbols_set,
                             missing_symbols, opts):
    def fix_mangling(symbol):
        if symbol[0] != '_' and symbol != 'start':
            symbol = '_' + symbol
        return symbol
    def get_symbols_with_prefix(symbol):
        start_index = bisect.bisect_left(all_symbols, symbol)
        for s in all_symbols[start_index:]:
            if not s.startswith(symbol):
                break
            yield s

    # Extract the list of symbols from the given file, which is assumed to be
    # the output of a dtrace run logging either probefunc or ustack(1) and
    # nothing else. The dtrace -xdemangle option needs to be used.
    #
    # This is particular to OS X at the moment, because of the '_' handling.
    with open(path) as f:
        current_timestamp = None
        for ln in f:
            # Drop leading and trailing whitespace.
            ln = ln.strip()
            if not ln:
                continue

            # If this is a timestamp specifier, extract it.
            if ln.startswith("TS: "):
                _,data = ln.split(': ', 1)
                if not data.isdigit():
                    print >>sys.stderr, (
                        "warning: unrecognized timestamp line %r, ignoring" % (
                            ln,))
                    continue
                current_timestamp = int(data)
                continue

            # If there is a '`' in the line, assume it is a ustack(1) entry in
            # the form of <modulename>`<modulefunc>, where <modulefunc> is never
            # truncated (but does need the mangling patched).
            if '`' in ln:
                yield (current_timestamp, fix_mangling(ln.split('`',1)[1]))
                continue

            # Otherwise, assume this is a probefunc printout. DTrace on OS X
            # seems to have a bug where it prints the mangled version of symbols
            # which aren't C++ mangled. We just add a '_' to anything but start
            # which doesn't already have a '_'.
            symbol = fix_mangling(ln)

            # If we don't know all the symbols, or the symbol is one of them,
            # just return it.
            if not all_symbols_set or symbol in all_symbols_set:
                yield (current_timestamp, symbol)
                continue

            # Otherwise, we have a symbol name which isn't present in the
            # binary. We assume it is truncated, and try to extend it.

            # Get all the symbols with this prefix.
            possible_symbols = list(get_symbols_with_prefix(symbol))
            if not possible_symbols:
               return
 
            # If we found too many possible symbols, ignore this as a prefix.
            if len(possible_symbols) > 100:
               print >>sys.stderr, (
                  "warning: ignoring symbol %r (no match and too many "
                  "possible suffixes") % (symbol,)
               return
 
            # Report that we resolved a missing symbol.
            if opts.show_missing_symbols and symbol not in missing_symbols:
               print >>sys.stderr, (
                  "warning: resolved missing symbol %r" % (symbol,))
               missing_symbols.add(symbol)

            # Otherwise, treat all the possible matches as having occurred. This
            # is an over-approximation, but it should be ok in practice.
            for s in possible_symbols:
               yield (current_timestmap, s)

def main():
    parser = OptionParser("%prog  [options] <dtrace data files>*]")
    parser.add_option("", "--binary", dest="binary_path", type='str',
                      help=("Path to the binary being ordered "
                            "(for getting all symbols)"),
                      action="store", default=None, metavar="PATH")
    parser.add_option("", "--output", dest="output_path",
                      help="path to output order file to write",
                      default=None, metavar="PATH")
    parser.add_option("", "--show-missing-symbols", dest="show_missing_symbols",
                      help=("show symbols which are 'fixed up' to a valid name "
                            "(requires --binary)"),
                      action="store_true", default=None)
    parser.add_option("", "--output-unordered-symbols",
                      dest="output_unordered_symbols_path",
                      help=("write a list of the unordered symbols to PATH "
                            "(requires --binary)"),
                      default=None, metavar="PATH")
    parser.add_option("", "--method", dest="method",
                      help="order file generation method to use",
                      choices=methods.keys(), default='call_order')
    (opts, args) = parser.parse_args()

    if opts.output_path is None:
       parser.error("--output is required")
    if len(args) < 1:
       parser.error("invalid number of arguments")

    # If the user gave us a binary, get all the symbols in the binary by
    # snarfing 'nm' output.
    if opts.binary_path is not None:
       output = check_output(['nm', '-P', opts.binary_path])
       lines = output.split("\n")
       all_symbols = [ln.split(' ',1)[0]
                      for ln in lines
                      if ln.strip()]
       commands.note("found %d symbols in binary" % (len(all_symbols),))
       all_symbols.sort()
    else:
       all_symbols = []
    all_symbols_set = set(all_symbols)

    # Compute the list of input files.
    input_files = []
    for path in args:
        # If this is a directory, read any files ending with .log.
        if os.path.isdir(path):
            for name in os.listdir(path):
                if name.endswith('.log'):
                    input_files.append(os.path.join(path, name))
        else:
            input_files.append(path)

    # Load all of the input files.
    commands.note("loading from %d data files" % (len(input_files),))
    missing_symbols = set()
    timestamped_symbol_lists = [
        list(parse_dtrace_symbol_file(path, all_symbols, all_symbols_set,
                                      missing_symbols, opts))
        for path in input_files]

    # Reorder each symbol list.
    symbol_lists = []
    for timestamped_symbols_list in timestamped_symbol_lists:
        timestamped_symbols_list.sort()
        symbol_lists.append([symbol for _,symbol in timestamped_symbols_list])

    # Execute the desire order file generation method.
    method = methods.get(opts.method)
    result = list(method(symbol_lists))

    # Report to the user on what percentage of symbols are present in the order
    # file.
    num_ordered_symbols = len(result)
    if all_symbols:
       print >>sys.stderr, "note: order file contains %d/%d symbols (%.2f%%)" %(
          num_ordered_symbols, len(all_symbols),
          100.*num_ordered_symbols/len(all_symbols))

       if opts.output_unordered_symbols_path:
          ordered_symbols_set = set(result)
          with open(opts.output_unordered_symbols_path, 'w') as f:
             f.write("\n".join(s for s in all_symbols
                               if s not in ordered_symbols_set))

    # Write the order file.
    with open(opts.output_path, 'w') as f:
        f.write("\n".join(result))
        f.write("\n")

if __name__ == '__main__':
   main()