Created
August 30, 2011 05:56
-
-
Save pellegrino/1180280 to your computer and use it in GitHub Desktop.
Find duplicates in a given directory (by @bdunagan)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/ruby | |
| # From the post: http://www.bdunagan.com/2010/08/24/dedupe-files-with-50-lines-of-ruby/ | |
| # Originally at: | |
| # https://github.com/bdunagan/codebucket/blob/master/dedupe.files.rb | |
| # dedupe.files.rb | |
| require 'rubygems' | |
| require 'sqlite3' # Look at 'http://github.com/luislavena/sqlite3-ruby' then do 'sudo gem install sqlite3-ruby' | |
| require 'digest/sha1' | |
| require 'pathname' | |
# Pass in the directory or assume the current one.
arg = ARGV[0] || "."
# Fail with a readable message rather than an Errno backtrace when the
# argument is missing on disk or is not a directory.
abort "Not a directory: #{arg}" unless File.directory?(arg)
# Resolve to an absolute path with symlinks expanded; it is printed here and
# used below as the prefix of every stored file path.
root_path = Pathname.new(arg).realpath.to_s
puts "Examining #{root_path}"
# Create (or reopen) a SQLite3 database file named dedupe.files.db; the path
# is relative, so it lands in the process's current working directory.
db = SQLite3::Database.new("dedupe.files.db")
# The database file persists between runs. Drop any table left by an earlier
# run so "create table" does not fail and old rows are not mixed with new ones.
db.execute("drop table if exists files")
# One row per scanned file: hex digest of its content plus its full path.
db.execute("create table files(digest varchar(40),path varchar(1024))")
# Recursively generate hash digests of all files under root_path and store
# one (digest, path) row per file in the "files" table.
Dir.chdir(root_path)
current_file = 0
# '**/*' rather than '**/*.*' so that files without an extension are scanned too.
# The listing is materialised before any row is written.
candidates = Dir.glob('**/*')
# A single transaction for the whole scan avoids one commit per inserted row.
db.transaction do
  candidates.each do |file|
    path = File.join(root_path, file)
    # File.file? follows symlinks, so this skips directories, dangling links
    # and special files (FIFOs, sockets, devices).
    next unless File.file?(path)
    # Create a hash digest for the current file (read in binary mode).
    begin
      digest = Digest::SHA1.file(path).hexdigest
    rescue SystemCallError => e
      # An unreadable or vanished file should not abort the entire scan.
      warn "Skipping #{path}: #{e.message}"
      next
    end
    # Store the hash digest and full path in the database. Bound parameters
    # keep paths containing quote characters from breaking the statement.
    db.execute("insert into files values(?, ?)", [digest, path])
    # Print out every 100th file as a progress indicator.
    puts "[#{digest}] #{path} (#{current_file})" if current_file % 100 == 0
    current_file += 1
  end
end
# Loop through the digests shared by more than one file, largest groups first.
# HAVING filters out unique files inside SQLite instead of in Ruby.
db.execute("select digest,count(1) as count from files group by digest having count(1) > 1 order by count desc").each do |row|
  puts "Duplicates found:"
  digest = row[0]
  # List the duplicate files; the digest is passed as a bound parameter.
  db.execute("select path from files where digest = ?", [digest]).each do |dup_row|
    puts "[#{digest}] #{dup_row[0]}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment