Created
November 28, 2024 22:39
-
-
Save romiras/9e63629dc6bea79586551539bd75b7e3 to your computer and use it in GitHub Desktop.
Naïve data deduplication in Ruby, variant 1 (2018)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Usage:
#   for f in $(find . -type f); do ./chunker.rb -k "$f" < "$f"; done
require 'optparse'
require 'digest/sha1'
require 'stringio'
require 'data_mapper'
# Maximum number of bytes read from standard input per chunk.
CHUNK_SIZE = 1 << 13 # 8192
# Index record describing one chunk of an ingested input stream.
#
# One row is created per chunk read from standard input, so the rows sharing
# a file_key, ordered by seq, list the chunk hashes of that input in order.
class DataChunk
  include DataMapper::Resource

  # Auto-incrementing primary key.
  property :id, Serial
  # Position of the chunk within its input; the main loop starts at 1.
  property :seq, Integer
  # Hex SHA-1 digest of the chunk content (40 characters).
  property :hash_sum, String, length: 40
  # Value passed with -k / --storage_key for the input this chunk came from.
  property :file_key, String, length: 255
end
# Content-addressed blob store on the local filesystem.
#
# A blob stored under key K lives at <path>/blobs/<K[0,2]>/<K[2..-1]>.
# A key that already has a file is never rewritten, which is what provides
# deduplication.
class Storage
  BLOB_DIR = 'blobs'.freeze

  # path    - directory in which the BLOB_DIR directory is created if missing.
  # options - accepted for backward compatibility; currently unused.
  def initialize(path, options = {})
    @path = File.join(path, BLOB_DIR)
    # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
    Dir.mkdir(@path, 0o700) unless Dir.exist?(@path)
  end

  # Splits storage_key into the fan-out directory and the file name.
  #
  # Returns [sub_path, filename], where sub_path is the blob directory joined
  # with the first two characters of the key and filename is the remainder.
  def get_path_for(storage_key)
    sub_path = File.join(@path, storage_key[0, 2])
    filename = storage_key[2..-1]
    [sub_path, filename]
  end

  # Returns true when a file named filename exists inside sub_path.
  #
  # Side effect (kept for compatibility with existing callers): creates
  # sub_path when it does not exist yet, so a subsequent write can succeed.
  def has_object?(sub_path, filename)
    Dir.mkdir(sub_path) unless Dir.exist?(sub_path)
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    File.exist?(File.join(sub_path, filename))
  end

  # Persists up to copy_length bytes read from io under storage_key.
  #
  # Does nothing when copy_length is 0 or when an object with this key is
  # already stored. Returns the number of bytes copied, or nil when nothing
  # was written.
  def put_object(io, copy_length, storage_key)
    return if copy_length == 0

    sub_path, filename = get_path_for(storage_key)
    return if has_object?(sub_path, filename)

    File.open(File.join(sub_path, filename), 'wb') do |output_stream|
      IO.copy_stream(io, output_stream, copy_length)
    end
  end
end
# Command-line options collected by the parser below; stays empty when no
# flags are given.
options = {}
OptionParser.new do |opts|
  # The banner and separators form the usage text shown for '--help' or '-h'.
  opts.banner = "Usage: chunker.rb [options]"
  opts.separator "Deduplicates and stores in storage data from standard input or file"
  opts.separator "Options:"

  # -k takes a mandatory argument (it is not a boolean switch); the value is
  # later recorded as the file_key of every chunk read from standard input.
  opts.on("-k", "--storage_key KEY", "Key for storage") do |key|
    options[:storage_key] = key
  end

  # opts.on("-f", "--file FILE", "File to process") do |f|
  #   options[:file] = f # f becomes the filename given after -f or --file
  # end
end.parse!
# files = ARGV.join(', ') # storing, because ARGF.read* clears the ARGV | |
# ARGV now contains no options, only file | |
# if options[:lines] | |
# number = "#{ARGF.readlines.count} lines" | |
# else | |
# number = "#{ARGF.each_byte.count} bytes" | |
# end | |
# DataMapper::Logger.new($stdout, :debug) | |
# Open the SQLite chunk index (index.db in the current working directory) and
# let DataMapper create or upgrade the table backing DataChunk.
DataMapper.setup(:default, "sqlite://#{Dir.pwd}/index.db")
DataMapper.finalize
DataMapper.auto_upgrade!

storage = Storage.new(Dir.pwd)

# Treat standard input as raw bytes so binary files are chunked unmodified on
# platforms that distinguish text and binary streams.
$stdin.binmode

# Split standard input into chunks of at most CHUNK_SIZE bytes. Each chunk is
# stored under its SHA-1 hex digest (Storage skips keys it already holds) and
# recorded in the index with its 1-based position and the -k storage key.
seq = 0
while (chunk = $stdin.read(CHUNK_SIZE))
  # Hash each chunk independently; computed once and reused below.
  hash_sum = Digest::SHA1.hexdigest(chunk)
  storage.put_object(StringIO.new(chunk), chunk.bytesize, hash_sum)

  seq += 1
  DataChunk.create(
    seq: seq,
    hash_sum: hash_sum,
    file_key: options[:storage_key]
  )
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment