#!/usr/bin/env python
import bisect
import random
import os
import subprocess
import sys
import pprint
from optparse import OptionParser, OptionGroup
import commands
def check_output(*popen_args, **popen_kwargs):
p = subprocess.Popen(stdout=subprocess.PIPE, *popen_args, **popen_kwargs)
stdout,stderr = p.communicate()
if p.wait() != 0:
raise RuntimeError("process failed")
return stdout
def uniq(list):
seen = set()
for item in list:
if item not in seen:
yield item
seen.add(item)
def form_by_call_order(symbol_lists):
# Simply strategy, just return symbols in order of occurrence, even across
# multiple runs.
return uniq(s
for symbols in symbol_lists
for s in symbols)
def form_by_call_order_fair(symbol_lists):
# More complicated strategy that tries to respect the call order across all
# of the test cases, instead of giving a huge preference to the first test
# case.
# First, uniq all the lists.
uniq_lists = [list(uniq(symbols))
for symbols in symbol_lists]
# Compute the successors for each list.
succs = {}
for symbols in uniq_lists:
for a,b in zip(symbols[:-1], symbols[1:]):
succs[a] = items = succs.get(a, [])
if b not in items:
items.append(b)
# Emit all the symbols, but make sure to always emit all successors from any
# call list whenever we see a symbol.
#
# There isn't much science here, but this sometimes works better than the
# more naive strategy. Then again, sometimes it doesn't so more research is
# probably needed.
return uniq(s
for symbols in symbol_lists
for node in symbols
for s in ([node] + succs.get(node,[])))
def form_by_frequency(symbol_lists):
# Form the order file by just putting the most commonly occurring symbols
# first. This assumes the data files didn't use the oneshot dtrace method.
counts = {}
for symbols in symbol_lists:
for a in symbols:
counts[a] = counts.get(a,0) + 1
by_count = counts.items()
by_count.sort(key = lambda (_,n): -n)
return [s for s,n in by_count]
def form_by_random(symbol_lists):
# Randomize the symbols.
merged_symbols = uniq(s for symbols in symbol_lists
for s in symbols)
random.shuffle(merged_symbols)
return merged_symbols
def form_by_alphabetical(symbol_lists):
# Alphabetize the symbols.
merged_symbols = list(set(s for symbols in symbol_lists
for s in symbols))
merged_symbols.sort()
return merged_symbols
###
methods = dict((name[len("form_by_"):],value)
for name,value in locals().items()
if name.startswith("form_by_"))
def parse_dtrace_symbol_file(path, all_symbols, all_symbols_set,
missing_symbols, opts):
def fix_mangling(symbol):
if symbol[0] != '_' and symbol != 'start':
symbol = '_' + symbol
return symbol
def get_symbols_with_prefix(symbol):
start_index = bisect.bisect_left(all_symbols, symbol)
for s in all_symbols[start_index:]:
if not s.startswith(symbol):
break
yield s
# Extract the list of symbols from the given file, which is assumed to be
# the output of a dtrace run logging either probefunc or ustack(1) and
# nothing else. The dtrace -xdemangle option needs to be used.
#
# This is particular to OS X at the moment, because of the '_' handling.
with open(path) as f:
current_timestamp = None
for ln in f:
# Drop leading and trailing whitespace.
ln = ln.strip()
if not ln:
continue
# If this is a timestamp specifier, extract it.
if ln.startswith("TS: "):
_,data = ln.split(': ', 1)
if not data.isdigit():
print >>sys.stderr, (
"warning: unrecognized timestamp line %r, ignoring" % (
ln,))
continue
current_timestamp = int(data)
continue
# If there is a '`' in the line, assume it is a ustack(1) entry in
# the form of `, where is never
# truncated (but does need the mangling patched).
if '`' in ln:
yield (current_timestamp, fix_mangling(ln.split('`',1)[1]))
continue
# Otherwise, assume this is a probefunc printout. DTrace on OS X
# seems to have a bug where it prints the mangled version of symbols
# which aren't C++ mangled. We just add a '_' to anything but start
# which doesn't already have a '_'.
symbol = fix_mangling(ln)
# If we don't know all the symbols, or the symbol is one of them,
# just return it.
if not all_symbols_set or symbol in all_symbols_set:
yield (current_timestamp, symbol)
continue
# Otherwise, we have a symbol name which isn't present in the
# binary. We assume it is truncated, and try to extend it.
# Get all the symbols with this prefix.
possible_symbols = list(get_symbols_with_prefix(symbol))
if not possible_symbols:
return
# If we found too many possible symbols, ignore this as a prefix.
if len(possible_symbols) > 100:
print >>sys.stderr, (
"warning: ignoring symbol %r (no match and too many "
"possible suffixes") % (symbol,)
return
# Report that we resolved a missing symbol.
if opts.show_missing_symbols and symbol not in missing_symbols:
print >>sys.stderr, (
"warning: resolved missing symbol %r" % (symbol,))
missing_symbols.add(symbol)
# Otherwise, treat all the possible matches as having occurred. This
# is an over-approximation, but it should be ok in practice.
for s in possible_symbols:
yield (current_timestmap, s)
def main():
parser = OptionParser("%prog [options] *]")
parser.add_option("", "--binary", dest="binary_path", type='str',
help=("Path to the binary being ordered "
"(for getting all symbols)"),
action="store", default=None, metavar="PATH")
parser.add_option("", "--output", dest="output_path",
help="path to output order file to write",
default=None, metavar="PATH")
parser.add_option("", "--show-missing-symbols", dest="show_missing_symbols",
help=("show symbols which are 'fixed up' to a valid name "
"(requires --binary)"),
action="store_true", default=None)
parser.add_option("", "--output-unordered-symbols",
dest="output_unordered_symbols_path",
help=("write a list of the unordered symbols to PATH "
"(requires --binary)"),
default=None, metavar="PATH")
parser.add_option("", "--method", dest="method",
help="order file generation method to use",
choices=methods.keys(), default='call_order')
(opts, args) = parser.parse_args()
if opts.output_path is None:
parser.error("--output is required")
if len(args) < 1:
parser.error("invalid number of arguments")
# If the user gave us a binary, get all the symbols in the binary by
# snarfing 'nm' output.
if opts.binary_path is not None:
output = check_output(['nm', '-P', opts.binary_path])
lines = output.split("\n")
all_symbols = [ln.split(' ',1)[0]
for ln in lines
if ln.strip()]
commands.note("found %d symbols in binary" % (len(all_symbols),))
all_symbols.sort()
else:
all_symbols = []
all_symbols_set = set(all_symbols)
# Compute the list of input files.
input_files = []
for path in args:
# If this is a directory, read any files ending with .log.
if os.path.isdir(path):
for name in os.listdir(path):
if name.endswith('.log'):
input_files.append(os.path.join(path, name))
else:
input_files.append(path)
# Load all of the input files.
commands.note("loading from %d data files" % (len(input_files),))
missing_symbols = set()
timestamped_symbol_lists = [
list(parse_dtrace_symbol_file(path, all_symbols, all_symbols_set,
missing_symbols, opts))
for path in input_files]
# Reorder each symbol list.
symbol_lists = []
for timestamped_symbols_list in timestamped_symbol_lists:
timestamped_symbols_list.sort()
symbol_lists.append([symbol for _,symbol in timestamped_symbols_list])
# Execute the desire order file generation method.
method = methods.get(opts.method)
result = list(method(symbol_lists))
# Report to the user on what percentage of symbols are present in the order
# file.
num_ordered_symbols = len(result)
if all_symbols:
print >>sys.stderr, "note: order file contains %d/%d symbols (%.2f%%)" %(
num_ordered_symbols, len(all_symbols),
100.*num_ordered_symbols/len(all_symbols))
if opts.output_unordered_symbols_path:
ordered_symbols_set = set(result)
with open(opts.output_unordered_symbols_path, 'w') as f:
f.write("\n".join(s for s in all_symbols
if s not in ordered_symbols_set))
# Write the order file.
with open(opts.output_path, 'w') as f:
f.write("\n".join(result))
f.write("\n")
if __name__ == '__main__':
main()