Skip to content

Instantly share code, notes, and snippets.

@Raven24
Last active August 29, 2015 13:56
Show Gist options
  • Save Raven24/8826008 to your computer and use it in GitHub Desktop.
Save Raven24/8826008 to your computer and use it in GitHub Desktop.
dir comparator [wip]
require 'digest'
require 'forwardable'
module CompStat
SAMPLE_SIZE = 92 #128
FIELD_SPLIT = "\t\t"
# Fingerprint of a single file: its size, its mtime (epoch seconds) and the
# SHA1 digest of a small byte sample taken at a recorded offset/length.
# Instances freeze themselves at the end of +initialize+, so every attribute
# has to be assigned inside the block passed to +new+.
class File
  attr_accessor :size, :mtime, :spl_start, :spl_length, :sample_digest

  class << self
    # Builds an entry from the fields of one savefile line.
    #
    # @param data [Array<String>] remaining fields after the path was removed:
    #   size, mtime and a "start length digest" sample field. The array is
    #   consumed (shifted) by this call.
    # @return [File]
    def from_line(data)
      File.new do |f|
        f.size = data.shift.to_i
        f.mtime = data.shift.to_i
        start, length, digest = data.shift.split(' ')
        f.spl_start = start.to_i
        f.spl_length = length.to_i
        f.sample_digest = digest
      end
    end

    # Builds an entry by stat-ing and sampling a file on disk.
    #
    # @param name [String] path of the file to fingerprint
    # @param spl_start [Integer, nil] sample offset; chosen at random when nil
    # @param spl_length [Integer, nil] sample length; derived from the file
    #   size and SAMPLE_SIZE when nil
    # @return [File, nil] nil when the file cannot be stat-ed or read
    def from_filename(name, spl_start = nil, spl_length = nil)
      stat = ::File.stat(name)
      File.new do |f|
        f.size = stat.size
        f.mtime = stat.mtime.to_i
        spl_size = [stat.size, SAMPLE_SIZE].min
        # +1 makes the last possible window start eligible and keeps the
        # argument >= 1, so rand always returns an Integer.
        f.spl_start = spl_start || rand(stat.size - spl_size + 1)
        # Never ask for a negative length when the offset lies past EOF.
        f.spl_length = spl_length || [[stat.size - f.spl_start, SAMPLE_SIZE].min, 0].max
        # File.read returns nil when the offset is at/after EOF; digest an
        # empty sample instead of losing the whole entry to the rescue below.
        sample_bytes = ::File.read(name, f.spl_length, f.spl_start).to_s
        f.sample_digest = Digest::SHA1.hexdigest(sample_bytes)
      end
    rescue StandardError
      # Deliberate best-effort: unreadable files are skipped by the caller.
      nil
    end
  end

  # Yields the new instance for attribute assignment, then freezes it.
  def initialize
    yield self if block_given?
    freeze
  end

  # Strict comparison used by the diff machinery: returns true when all
  # recorded properties match and raises otherwise.
  #
  # @param other [File]
  # @return [true]
  # @raise [ArgumentError] when +other+ is not a fingerprint entry
  # @raise [MoreRecent, LessRecent, DifferentSize, DifferentSample,
  #   DifferentDigest] naming the first property that differs
  def ==(other)
    raise ArgumentError, 'can only compare with another File entry' unless other.is_a?(File)
    raise MoreRecent if other.mtime > mtime
    raise LessRecent if other.mtime < mtime
    raise DifferentSize if size != other.size
    raise DifferentSample if spl_start != other.spl_start || spl_length != other.spl_length
    raise DifferentDigest if sample_digest != other.sample_digest

    true
  end

  # Serializes the entry as savefile fields (without the path).
  #
  # @return [String] size, mtime and sample joined by FIELD_SPLIT
  def to_line
    [size, mtime, sample].join(FIELD_SPLIT)
  end

  private

  # The sample field as written to the savefile: "start length digest".
  def sample
    "#{spl_start} #{spl_length} #{sample_digest}"
  end
end
# One directory node of a fingerprint tree. Holds three hashes keyed by
# basename: child directories, files, and "exclusive" files (kept apart
# from +files+, so +compare_to+ does not visit them).
class Dir
  attr_reader :subdirs, :files, :exclusive

  def initialize
    @subdirs = {}
    @files = {}
    @exclusive = {}
  end

  # Compares every file of this node (recursively) with its counterpart in
  # +other_tree+; each comparison runs inside DiffHandler.catcher.
  #
  # @param other_tree [#subdirs, #files] root of the tree to compare against
  # @param path [String] path of this node relative to the tree root
  # @raise [DirNotInTree] when +other_tree+ has no directory at +path+
  def compare_to(other_tree, path = '.')
    counterpart = Tree.find_dir(other_tree, ::File.join(path, '.'), false)
    other_files = counterpart.files

    files.each_pair do |fname, file|
      DiffHandler.catcher(path, fname) do
        raise FileNotInTree if other_files.nil? || other_files[fname].nil?

        file == other_files[fname]
      end
    end

    subdirs.each_pair do |dname, dir|
      DiffHandler.catcher(path, dname) { dir.compare_to(other_tree, ::File.join(path, dname)) }
    end
  end

  # @return [Hash] regular and exclusive files merged into one hash
  def all_files
    files.merge(exclusive)
  end

  # Serializes this node and its children as savefile lines.
  #
  # @param path [String] path prefix for the entries of this node
  # @return [String] one newline-terminated line per file
  def to_lines(path = '.')
    own_lines = all_files.inject("") do |acc, (fname, file)|
      acc + ::File.join(path, [fname, file.to_line].join(FIELD_SPLIT)) + "\n"
    end

    @subdirs.inject(own_lines) do |acc, (dname, dir)|
      acc + dir.to_lines(::File.join(path, dname))
    end
  end

  # Renders an indented listing: files three per row, then each
  # subdirectory followed by its own listing indented two more columns.
  #
  # @param indent [Integer] number of leading spaces for this level
  # @return [String]
  def to_s(indent = 0)
    pad = " " * indent

    listing = all_files.keys.each_slice(3).inject("") do |acc, names|
      acc + pad + "|-- #{names.join(", ")}\n"
    end

    @subdirs.inject(listing) do |acc, (name, node)|
      acc + pad + "|-+ [#{name}]\n" + node.to_s(indent + 2)
    end
  end
end
# A fingerprint tree rooted at a Dir node. The class-level builders create
# trees from the filesystem or from a savefile; instances delegate most
# behaviour to their root node.
class Tree
  extend Forwardable

  class << self
    # Scans +dir+ recursively and fingerprints every regular file.
    #
    # @param dir [String] directory to scan
    # @return [Tree]
    def from_fs(dir)
      from_fs_and_other(dir, nil)
    end

    # Like +from_fs+, but samples each file at the offset/length recorded
    # for the same relative path in +other_tree+. Files with no counterpart
    # there are stored as "exclusive".
    #
    # @param dir [String] directory to scan
    # @param other_tree [Tree, nil] reference tree
    # @return [Tree]
    def from_fs_and_other(dir, other_tree)
      @basedir = ::File.realpath(dir)
      @tree = Dir.new
      init_counter
      stat_fs(@basedir, other_tree)
      Tree.new @tree
    end

    # Rebuilds a tree from a savefile, one entry per line.
    #
    # @param file [String] path of the savefile
    # @return [Tree]
    def from_savefile(file)
      @basedir = ''
      @tree = Dir.new
      init_counter
      ::File.open(file) do |f|
        f.each_line do |line|
          # A blank line carries no fields and would crash the parser.
          next if line.strip.empty?

          data = line.split(FIELD_SPLIT)
          insert_tree_leaf(data.shift) { File.from_line(data) }
        end
      end
      Tree.new @tree
    end

    # Walks from +tree+ to the directory node that contains +path+. The
    # first path component (e.g. "." or the empty string before a leading
    # slash) and the last one (the file name) are ignored.
    #
    # @param tree [#subdirs] node to start from
    # @param path [String] slash-separated path
    # @param create [Boolean] create missing nodes instead of raising
    # @return [Object] the directory node (+tree+ itself when the path has
    #   no directory components)
    # @raise [DirNotInTree] when a node is missing and +create+ is false
    def find_dir(tree, path, create = true)
      # drop(1) yields [] for an empty split, where [1..-1] would yield nil.
      components = path.split('/').drop(1)
      components[0...-1].inject(tree) do |dir, subdir|
        raise DirNotInTree if !create && dir.subdirs[subdir].nil?

        dir.subdirs[subdir] ||= Dir.new
      end
    end

    # Looks up the file entry stored for +path+.
    #
    # @raise [DirNotInTree] when the containing directory is unknown
    # @raise [FileNotInTree] when the directory has no such file
    def find_file(tree, path)
      dir = find_dir(tree, path, false)
      f = dir.files[::File.basename(path)]
      raise FileNotInTree if f.nil?

      f
    end

    private

    # Resets the progress counter and announces the scan on stderr.
    def init_counter
      @counter = 0
      $stderr.puts "reading tree..."
    end

    # Recursively fingerprints the regular files below +dir+, skipping
    # symlinks and entries that no longer exist.
    def stat_fs(dir = nil, other_tree = nil)
      raise ArgumentError, 'directory required' if dir.nil?

      ::Dir.foreach(dir) do |item|
        next if item == '.' || item == '..'

        path = ::File.join(dir, item)
        next if ::File.symlink?(path)
        # File.exists? was removed in Ruby 3.2; exist? works everywhere.
        next unless ::File.exist?(path)

        if ::File.directory?(path)
          stat_fs(path, other_tree)
          next
        end

        name = path[@basedir.length..-1]
        other_file = counterpart_for(other_tree, name)
        exclusive = !other_tree.nil? && other_file.nil?
        insert_tree_leaf(name, exclusive) do
          if other_file
            File.from_filename(path, other_file.spl_start, other_file.spl_length)
          else
            File.from_filename(path)
          end
        end
      end
    end

    # Entry for +name+ in +other_tree+, or nil when there is no reference
    # tree or it lacks the file or any of its parent directories.
    def counterpart_for(other_tree, name)
      return nil if other_tree.nil?

      find_file(other_tree, name)
    rescue FileNotInTree, DirNotInTree
      nil
    end

    # Stores the leaf produced by the block under +file+; a nil leaf is
    # ignored. Prints a progress line to stderr every 5000 stored files.
    def insert_tree_leaf(file, exclusive = false)
      dir = find_dir(@tree, file)
      leaf = yield
      return if leaf.nil?

      bucket = exclusive ? dir.exclusive : dir.files
      bucket[::File.basename(file)] = leaf
      @counter += 1
      $stderr.puts("#{@counter}".rjust(8) + " files") if (@counter % 5000).zero?
    end
  end

  def_delegators :@root, :subdirs, :files, :compare_to, :to_s, :to_lines

  # @param root [Dir] root node of the tree
  def initialize(root)
    @root = root
  end

  # Compares the stored entry for +name+ with +other_file+, or with a fresh
  # fingerprint of the same relative path below +basedir+.
  #
  # @param name [String] path of the file inside this tree
  # @param other_file [File, nil] entry to compare against
  # @param basedir [String, nil] directory to sample when +other_file+ is nil
  # @return [true] when the entries match; inequality is raised by File#==
  # @raise [ArgumentError] when neither +other_file+ nor +basedir+ is given
  def compare_file(name, other_file = nil, basedir = nil)
    file = Tree.find_file(@root, name)
    if other_file.nil?
      raise ArgumentError, 'other_file or basedir required' if basedir.nil? || basedir.empty?

      bd = ::File.realpath(basedir)
      other_file = File.from_filename(::File.join(bd, name), file.spl_start, file.spl_length)
    end
    file == other_file
  end
end
# Output helpers that print a tree to the global $output_handle.
class Writer
  # Prints the indented listing produced by the tree's +to_s+.
  def self.fs_tree(tree)
    listing = tree.to_s
    $output_handle.puts(listing)
  end

  # Prints the savefile representation produced by the tree's +to_lines+.
  def self.stat_savefile(tree)
    lines = tree.to_lines
    $output_handle.puts(lines)
  end
end
# Runs a comparison block and reports its outcome on $output_handle.
class DiffHandler
  class << self
    # Yields and writes one line describing the result for the entry
    # +file+ inside +path+: "EQUAL" when the block returns a truthy value,
    # otherwise a message for the Inequality that was raised. Other
    # exceptions propagate to the caller.
    #
    # The label is kept in a local variable: comparison blocks may call
    # +catcher+ again (directory recursion), which would overwrite shared
    # state and attribute the outer result to the wrong path.
    #
    # @param path [String] directory part of the label
    # @param file [String] entry name inside +path+
    def catcher(path, file)
      name = ::File.join(path, file)
      begin
        report(name, "EQUAL") if yield
      rescue MoreRecent
        report(name, "destination is more recent")
      rescue LessRecent
        report(name, "destination is older")
      rescue DifferentSize
        report(name, "filesize different")
      rescue DifferentSample
        report(name, "samples were taken from different places")
      rescue DifferentDigest
        report(name, "samples don't match")
      rescue FileNotInTree
        report(name, "destination file doesn't exist")
      rescue DirNotInTree
        report(name, "destination dir doesn't exist")
      rescue Inequality
        # Fallback for an Inequality without a dedicated message above.
        $output_handle.puts "#{name} is different"
      end
    end

    private

    # Writes "<name> -- <msg>" to the current output handle.
    def report(name, msg)
      $output_handle.puts "#{name} -- #{msg}"
    end
  end
end
# Base class for every "these two entries differ" signal.
Inequality = Class.new(StandardError)

# One subclass per kind of difference, so that rescuers can tell them apart.
MoreRecent = Class.new(Inequality)
LessRecent = Class.new(Inequality)
DifferentSize = Class.new(Inequality)
DifferentSample = Class.new(Inequality)
DifferentDigest = Class.new(Inequality)
FileNotInTree = Class.new(Inequality)
DirNotInTree = Class.new(Inequality)
end
# Command-line front end. The mode is passed to the constructor; the
# remaining arguments are taken from ARGV, in order, by +exec!+.
class App
  # @param mode [String] one of 'tree', 'stat', 'read_tree', 'tree_two', 'test'
  def initialize(mode = 'stat')
    @mode = mode
    $output_handle = $stdout
  end

  # Runs the selected mode. Each mode shifts its own arguments off ARGV; the
  # last one is an optional output file (standard output when omitted).
  def exec!
    case @mode
    when 'tree'
      tree = CompStat::Tree.from_fs ARGV.shift
      handle_output ARGV.shift
      CompStat::Writer.fs_tree tree
    when 'stat'
      tree = CompStat::Tree.from_fs ARGV.shift
      handle_output ARGV.shift
      CompStat::Writer.stat_savefile tree
    when 'read_tree'
      tree = CompStat::Tree.from_savefile ARGV.shift
      handle_output ARGV.shift
      CompStat::Writer.fs_tree tree
    when 'tree_two'
      t1 = CompStat::Tree.from_fs ARGV.shift
      t2 = CompStat::Tree.from_fs_and_other ARGV.shift, t1
      handle_output ARGV.shift
      t1.compare_to(t2)
    when 'test'
      tree = CompStat::Tree.from_fs ARGV.shift
      path = ARGV.shift
      name = ARGV.shift
      f = CompStat::Tree.find_file(tree, name)
      other = CompStat::File.from_filename path, f.spl_start, f.spl_length
      handle_output ARGV.shift
      $output_handle.puts("EQUAL") if tree.compare_file name, other
    else
      puts 'unknown mode'
    end
  ensure
    # Close only a handle opened by handle_output; closing $stdout itself
    # would make any later write to standard output raise IOError.
    $output_handle.close unless $output_handle.equal?($stdout)
  end

  private

  # Redirects output to +file+ when a non-empty file name is given.
  def handle_output(file)
    $output_handle = File.open(file, 'w') unless file.nil? || file.empty?
  end
end
# Script entry point: the first command-line argument selects the mode and
# the remaining ones are consumed from ARGV by App#exec!.
App.new(ARGV.shift).exec!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment