-
-
Save Frontear/e1895bc47e940da1716d258cab668b1f to your computer and use it in GitHub Desktop.
| #include <map> | |
| #include <set> | |
| #include <string> | |
| #include <vector> | |
| #include <fstream> | |
| #include <iostream> | |
| #include <filesystem> | |
| using namespace std; | |
| /** | |
| * @brief Searches directories for files, and maps them to their size. It does this by using an std::map | |
| * @param paths A vector of paths that will be deep searched for duplicates | |
| * @return An std::map<long, std::vector<std::string>>, where long is the size, and std::vector<std::string> is whatever files had the same size | |
| */ | |
| map<long, vector<string>> map_file_size(const set<string> &paths) { | |
| map<long, vector<string>> files; | |
| for (const auto &path : paths) { | |
| for (const auto &entry : filesystem::recursive_directory_iterator(path)) { | |
| const auto &syspath = entry.path(); | |
| if (!filesystem::is_regular_file(syspath)) { | |
| continue; | |
| } | |
| const auto key = filesystem::file_size(syspath); | |
| const auto value = files.emplace(key, vector<string>()).first; | |
| value->second.push_back(syspath.string()); | |
| } | |
| } | |
| return files; | |
| } | |
| /** | |
| * @brief Searches a vector of files mapped to specific sizes, and performs byte by byte comparisons | |
| * @param files The files to compare and return duplicates in a map | |
| * @return An std::map<long, std::vector<std::string>>, where long is the size, and std::vector<std::string> is whatever files had the same contents | |
| */ | |
| map<long, vector<string>> map_size_dupe(const map<long, vector<string>> &files) { | |
| map<long, vector<string>> dupes; | |
| for (const auto&[k, v] : files) { | |
| if (v.size() == 1) { | |
| continue; | |
| } | |
| for (const auto &p1 : v) { | |
| ifstream f1(p1, ios::binary | ios::ate); | |
| f1.seekg(0, ios::beg); | |
| auto dupe = false; // if even one duplicate exists | |
| for (const auto &p2 : v) { | |
| if (p1 == p2) { // don't compare the same file | |
| continue; | |
| } | |
| ifstream f2(p2, ios::binary | ios::ate); | |
| f2.seekg(0, ios::beg); | |
| if (!equal(istreambuf_iterator<char>(f1.rdbuf()), | |
| istreambuf_iterator<char>(), | |
| istreambuf_iterator<char>(f2.rdbuf()))) { | |
| continue; | |
| } | |
| const auto key = k; | |
| const auto value = dupes.emplace(key, vector<string>()).first; | |
| if (!dupe) { | |
| dupe = true; | |
| value->second.push_back(p1); // only push once | |
| } | |
| value->second.push_back(p2); | |
| } | |
| break; // we've done all comparisons, no need to repeat them | |
| } | |
| } | |
| return dupes; | |
| } | |
| /** | |
| * @brief Prints out duplicates files which have the exact same size | |
| * @param hashes An std::map of files which have the exact same size | |
| */ | |
| void print_duplicates(const map<long, vector<string>> &hashes) { | |
| for (const auto&[k, v] : hashes) { | |
| if (v.size() == 1) { | |
| continue; | |
| } | |
| cout << "Found duplicates:" << endl; | |
| for (const auto &path: v) { | |
| cout << "\t- " << path << endl; | |
| } | |
| cout << endl; | |
| } | |
| } | |
| /** | |
| * @brief The main function of the application. This will control the processing of the program | |
| * @param argc Argument count. This will be the amount of arguments passed in | |
| * @param argv Argument vector. This contains a pointer to C-string values | |
| * @return EXIT_SUCCESS | |
| */ | |
| int main(int argc, const char *argv[]) { | |
| set<string> args; | |
| while (argc-- > 1) { // > 1 will ignore the first element, which is simply the path to this binary | |
| const auto arg = argv[argc]; | |
| if (!filesystem::is_directory(arg)) { | |
| cerr << quoted(arg) << " is not a directory (skipping)" << endl; | |
| continue; | |
| } | |
| args.insert(argv[argc]); | |
| } | |
| auto files = map_file_size(args); | |
| auto hashes = map_size_dupe(files); | |
| print_duplicates(hashes); | |
| return EXIT_SUCCESS; | |
| } |
| #!/usr/bin/python3 | |
| import os, hashlib | |
| from sys import argv | |
| from collections import defaultdict | |
| def lappend(dictionary, keyfunc, value): # list append | |
| dictionary[keyfunc(value)].append(value) | |
| def ffilter(files): # file filter | |
| return [ x for x in files.values() if len(x) > 1 ] # discards files which only occur once | |
| def fhash(filename, algorithm = hashlib.sha1): # file hash | |
| filehash = algorithm() | |
| with open(filename, "rb") as f: | |
| filehash.update(f.read()) | |
| return filehash.digest() | |
| def main(paths): | |
| files = defaultdict(list) # keep references to the files with the same sizes | |
| for p in paths: # path | |
| for d, dn, fn in os.walk(p): # directory, directory name, file name | |
| for f in fn: # file | |
| lappend(files, os.path.getsize, os.path.realpath(os.path.join(d, f))) # realpath follows (possible) symlinks | |
| hashes = defaultdict(list) # contains the hashes for our files | |
| for fs in ffilter(files): # files | |
| for fn in fs: # filename | |
| lappend(hashes, fhash, fn) | |
| for fs in ffilter(hashes): # files | |
| print("\nFound duplicates:") | |
| for fn in fs: # filename | |
| print("\t- %s" % fn) | |
| if __name__ == "__main__": | |
| if len(argv) < 2: | |
| print("Usage: %s <directory> [directories...]" % argv[0]) | |
| else: | |
| main(set(argv[1:])) # prevent duplicate paths |
I don't really want to add any excess functionality. I feel that adding options for, say deleting duplicates or moving them to a special 'excess' directory goes outside the scope of what I want from this program. It would become a maintenance burden, at least for this script.
Note however, that I'd like to make this a functional possibility. I instead plan to create a gui application on GitHub that can use pyqt or such. It's a good opportunity to explore gui development in python. Nonetheless, that's a future expedition.
The C++ equivalent of the duplicate search is around the same speed as the python version. Nonetheless, I've developed one that is nearly 2x faster than both the python and c++ versions. It can be found at Frontear/Archikos. It has been developed in Rust.
A blazing fast program that will recursively go through directories, find any duplicate files, and then print them.
It works by first finding files that have equal file sizes, then finding the hash of said files, and simply printing out the files that have the same hash. The technical details can be explored in the actual script, however it's fairly basic and very straightforward to understand.
For context, here are some benchmarks of the program: