Created
July 31, 2012 13:35
-
-
Save bnagy/3217079 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find the closest template to match a crashfile using the Normalised | |
# Compression Distance (NCD), defined as: | |
# NCD(x,y)=( C(xy) - min{C(x), C(y)} ) / max{C(x), C(y)} | |
# | |
# Based on my testing, LZMA works best, which is implemented in xz for *nix, | |
# bzip2 (Burrows-Wheeler) works well enough but zip doesn't differentiate well | |
# and will be flat out wrong sometimes. | |
# | |
# Author: Ben Nagy Copyright: Copyright (c) Ben Nagy, 2006-2012. | |
# License: The MIT License | |
# (See http://www.opensource.org/licenses/mit-license.php for details.) | |
require 'rubygems' | |
require 'parallel' | |
class MatchNCD | |
attr_reader :results | |
def initialize( crash_fnames, template_regex, max_size_delta=10000, max_cpus=false ) | |
`which which` rescue fail "#{__FILE__}: Only unix supported, kthxbye" | |
if not `which xz`.empty? | |
@compressor="xz -9" | |
elsif not `which bzip2`.empty? | |
@compressor="bzip2" | |
elsif not `which zip`.empty? | |
warn "#{self.to_s}: Warning - could only find zip, but zip sucks." | |
@compressor="zip" | |
else | |
fail "#{__FILE__}: Can't find a useable compressor" | |
end | |
@cpus=( max_cpus || Parallel.processor_count ) | |
begin | |
fork {} | |
@parallel_type=:in_processes | |
rescue NotImplementedError | |
@parallel_type=:in_threads | |
end | |
raise ArgumentError, "Can't find any crashfiles" if crash_fnames.empty? | |
template_fnames=Dir.glob( File.expand_path( template_regex ) ) | |
raise ArgumentError, "Can't find any templates using #{template_regex}" if template_fnames.empty? | |
# We only need to consider templates that are within the max size delta of | |
# at least one crashfile | |
interesting_templates=template_fnames.select {|template| | |
crash_fnames.any? {|crash| (File.size( crash ) - File.size( template )).abs <= max_size_delta} | |
} | |
# Build a cache of all C(y) | |
comp_template_sizes=Parallel.map( interesting_templates, @parallel_type=>@cpus ) {|template_fname| | |
comp_template_size=Integer(`cat #{template_fname} | #{@compressor} | wc -c`) | |
[template_fname, comp_template_size] | |
} | |
comp_template_sizes=Hash[*(comp_template_sizes.flatten)] | |
@results=crash_fnames.map {|crash_fname| | |
# cache C(x) | |
comp_crash_size=Integer(`cat #{crash_fname} | #{@compressor} | wc -c`) | |
results=Parallel.map( template_fnames, @parallel_type=>@cpus ) {|template_fname| | |
next if max_size_delta and (File.size( template_fname ) - File.size( crash_fname )).abs > max_size_delta | |
comp_template_size=comp_template_sizes[template_fname] | |
# Calculate C(xy) | |
comp_both_size=Integer(`cat #{template_fname} #{crash_fname} | #{@compressor} | wc -c`) | |
ncd=(comp_both_size.to_f - [comp_crash_size, comp_template_size].min) / [comp_crash_size, comp_template_size].max | |
[template_fname, ncd] | |
} | |
# lower is better | |
[crash_fname, results.compact.sort_by {|ary| ary[1]}] | |
} | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment