#!/usr/bin/env python import bisect import random import os import subprocess import sys import pprint from optparse import OptionParser, OptionGroup import commands def check_output(*popen_args, **popen_kwargs): p = subprocess.Popen(stdout=subprocess.PIPE, *popen_args, **popen_kwargs) stdout,stderr = p.communicate() if p.wait() != 0: raise RuntimeError("process failed") return stdout def uniq(list): seen = set() for item in list: if item not in seen: yield item seen.add(item) def form_by_call_order(symbol_lists): # Simply strategy, just return symbols in order of occurrence, even across # multiple runs. return uniq(s for symbols in symbol_lists for s in symbols) def form_by_call_order_fair(symbol_lists): # More complicated strategy that tries to respect the call order across all # of the test cases, instead of giving a huge preference to the first test # case. # First, uniq all the lists. uniq_lists = [list(uniq(symbols)) for symbols in symbol_lists] # Compute the successors for each list. succs = {} for symbols in uniq_lists: for a,b in zip(symbols[:-1], symbols[1:]): succs[a] = items = succs.get(a, []) if b not in items: items.append(b) # Emit all the symbols, but make sure to always emit all successors from any # call list whenever we see a symbol. # # There isn't much science here, but this sometimes works better than the # more naive strategy. Then again, sometimes it doesn't so more research is # probably needed. return uniq(s for symbols in symbol_lists for node in symbols for s in ([node] + succs.get(node,[]))) def form_by_frequency(symbol_lists): # Form the order file by just putting the most commonly occurring symbols # first. This assumes the data files didn't use the oneshot dtrace method. counts = {} for symbols in symbol_lists: for a in symbols: counts[a] = counts.get(a,0) + 1 by_count = counts.items() by_count.sort(key = lambda (_,n): -n) return [s for s,n in by_count] def form_by_random(symbol_lists): # Randomize the symbols. merged_symbols = uniq(s for symbols in symbol_lists for s in symbols) random.shuffle(merged_symbols) return merged_symbols def form_by_alphabetical(symbol_lists): # Alphabetize the symbols. merged_symbols = list(set(s for symbols in symbol_lists for s in symbols)) merged_symbols.sort() return merged_symbols ### methods = dict((name[len("form_by_"):],value) for name,value in locals().items() if name.startswith("form_by_")) def parse_dtrace_symbol_file(path, all_symbols, all_symbols_set, missing_symbols, opts): def fix_mangling(symbol): if symbol[0] != '_' and symbol != 'start': symbol = '_' + symbol return symbol def get_symbols_with_prefix(symbol): start_index = bisect.bisect_left(all_symbols, symbol) for s in all_symbols[start_index:]: if not s.startswith(symbol): break yield s # Extract the list of symbols from the given file, which is assumed to be # the output of a dtrace run logging either probefunc or ustack(1) and # nothing else. The dtrace -xdemangle option needs to be used. # # This is particular to OS X at the moment, because of the '_' handling. with open(path) as f: current_timestamp = None for ln in f: # Drop leading and trailing whitespace. ln = ln.strip() if not ln: continue # If this is a timestamp specifier, extract it. if ln.startswith("TS: "): _,data = ln.split(': ', 1) if not data.isdigit(): print >>sys.stderr, ( "warning: unrecognized timestamp line %r, ignoring" % ( ln,)) continue current_timestamp = int(data) continue # If there is a '`' in the line, assume it is a ustack(1) entry in # the form of <modulename>`<modulefunc>, where <modulefunc> is never # truncated (but does need the mangling patched). if '`' in ln: yield (current_timestamp, fix_mangling(ln.split('`',1)[1])) continue # Otherwise, assume this is a probefunc printout. DTrace on OS X # seems to have a bug where it prints the mangled version of symbols # which aren't C++ mangled. We just add a '_' to anything but start # which doesn't already have a '_'. symbol = fix_mangling(ln) # If we don't know all the symbols, or the symbol is one of them, # just return it. if not all_symbols_set or symbol in all_symbols_set: yield (current_timestamp, symbol) continue # Otherwise, we have a symbol name which isn't present in the # binary. We assume it is truncated, and try to extend it. # Get all the symbols with this prefix. possible_symbols = list(get_symbols_with_prefix(symbol)) if not possible_symbols: return # If we found too many possible symbols, ignore this as a prefix. if len(possible_symbols) > 100: print >>sys.stderr, ( "warning: ignoring symbol %r (no match and too many " "possible suffixes") % (symbol,) return # Report that we resolved a missing symbol. if opts.show_missing_symbols and symbol not in missing_symbols: print >>sys.stderr, ( "warning: resolved missing symbol %r" % (symbol,)) missing_symbols.add(symbol) # Otherwise, treat all the possible matches as having occurred. This # is an over-approximation, but it should be ok in practice. for s in possible_symbols: yield (current_timestmap, s) def main(): parser = OptionParser("%prog [options] <dtrace data files>*]") parser.add_option("", "--binary", dest="binary_path", type='str', help=("Path to the binary being ordered " "(for getting all symbols)"), action="store", default=None, metavar="PATH") parser.add_option("", "--output", dest="output_path", help="path to output order file to write", default=None, metavar="PATH") parser.add_option("", "--show-missing-symbols", dest="show_missing_symbols", help=("show symbols which are 'fixed up' to a valid name " "(requires --binary)"), action="store_true", default=None) parser.add_option("", "--output-unordered-symbols", dest="output_unordered_symbols_path", help=("write a list of the unordered symbols to PATH " "(requires --binary)"), default=None, metavar="PATH") parser.add_option("", "--method", dest="method", help="order file generation method to use", choices=methods.keys(), default='call_order') (opts, args) = parser.parse_args() if opts.output_path is None: parser.error("--output is required") if len(args) < 1: parser.error("invalid number of arguments") # If the user gave us a binary, get all the symbols in the binary by # snarfing 'nm' output. if opts.binary_path is not None: output = check_output(['nm', '-P', opts.binary_path]) lines = output.split("\n") all_symbols = [ln.split(' ',1)[0] for ln in lines if ln.strip()] commands.note("found %d symbols in binary" % (len(all_symbols),)) all_symbols.sort() else: all_symbols = [] all_symbols_set = set(all_symbols) # Compute the list of input files. input_files = [] for path in args: # If this is a directory, read any files ending with .log. if os.path.isdir(path): for name in os.listdir(path): if name.endswith('.log'): input_files.append(os.path.join(path, name)) else: input_files.append(path) # Load all of the input files. commands.note("loading from %d data files" % (len(input_files),)) missing_symbols = set() timestamped_symbol_lists = [ list(parse_dtrace_symbol_file(path, all_symbols, all_symbols_set, missing_symbols, opts)) for path in input_files] # Reorder each symbol list. symbol_lists = [] for timestamped_symbols_list in timestamped_symbol_lists: timestamped_symbols_list.sort() symbol_lists.append([symbol for _,symbol in timestamped_symbols_list]) # Execute the desire order file generation method. method = methods.get(opts.method) result = list(method(symbol_lists)) # Report to the user on what percentage of symbols are present in the order # file. num_ordered_symbols = len(result) if all_symbols: print >>sys.stderr, "note: order file contains %d/%d symbols (%.2f%%)" %( num_ordered_symbols, len(all_symbols), 100.*num_ordered_symbols/len(all_symbols)) if opts.output_unordered_symbols_path: ordered_symbols_set = set(result) with open(opts.output_unordered_symbols_path, 'w') as f: f.write("\n".join(s for s in all_symbols if s not in ordered_symbols_set)) # Write the order file. with open(opts.output_path, 'w') as f: f.write("\n".join(result)) f.write("\n") if __name__ == '__main__': main()