Frontear · February 15, 2020 19:16 · Frontear · Dec 12, 2019 · Frontear · Jan 17, 2020
diff --git a/duplicates.cpp b/duplicates.cpp
 #include <algorithm>
 #include <cstdlib>
 #include <filesystem>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <iterator>
 #include <map>
 #include <set>
 #include <string>
 #include <vector>

 using namespace std;

 /**
 * @brief Walks every directory in paths recursively and groups the regular files found by their size
 *
 * Directories or files that cannot be inspected (permission denied, removed while scanning, ...)
 * are skipped with a message on stderr instead of aborting the whole scan with an exception.
 *
 * @param paths A set of directory paths that will be deep searched for files
 * @return An std::map<long, std::vector<std::string>>, where long is the size in bytes, and std::vector<std::string> is whatever files had that size
 */
 map<long, vector<string>> map_file_size(const set<string> &paths) {
    map<long, vector<string>> files;
    const filesystem::recursive_directory_iterator end;

    for (const auto &path : paths) {
        error_code walk_error;
        filesystem::recursive_directory_iterator it(
                path, filesystem::directory_options::skip_permission_denied, walk_error);

        // increment(error_code&) reports traversal failures through walk_error instead of throwing
        for (; !walk_error && it != end; it.increment(walk_error)) {
            const auto &syspath = it->path();

            error_code entry_error;
            if (!filesystem::is_regular_file(syspath, entry_error) || entry_error) {
                continue;
            }

            const auto size = filesystem::file_size(syspath, entry_error);
            if (entry_error) {
                continue; // e.g. the file disappeared between the two queries
            }

            // the map key type is fixed to long by the interface, so make the narrowing explicit
            files[static_cast<long>(size)].push_back(syspath.string());
        }

        if (walk_error) {
            cerr << "Skipping " << path << ": " << walk_error.message() << endl;
        }
    }

    return files;
 }

 /**
 * @brief Compares the contents of two files block by block
 * @param lhs Path of the first file
 * @param rhs Path of the second file
 * @return true only when both files could be opened and hold exactly the same bytes
 */
 static bool same_contents(const string &lhs, const string &rhs) {
    ifstream a(lhs, ios::binary);
    ifstream b(rhs, ios::binary);

    if (!a.is_open() || !b.is_open()) {
        return false; // an unreadable file cannot be proven identical to anything
    }

    char block_a[4096];
    char block_b[4096];

    for (;;) {
        a.read(block_a, sizeof block_a);
        b.read(block_b, sizeof block_b);

        const auto got_a = a.gcount();
        const auto got_b = b.gcount();

        if (got_a != got_b || !equal(block_a, block_a + got_a, block_b)) {
            return false;
        }

        if (got_a == 0) {
            return true; // both streams are exhausted without a mismatch
        }
    }
 }

 /**
 * @brief Searches a map of files grouped by size, and performs byte by byte comparisons inside every group
 *
 * Every file is opened afresh for each comparison, and every file of a size group is compared
 * against the groups found so far, so duplicates are detected regardless of their position.
 *
 * @param files The files to compare, keyed by their size
 * @return An std::map<long, std::vector<std::string>>, where long is the size, and std::vector<std::string> is
 *         every file of that size that has at least one byte-identical twin. Because the key is the size,
 *         several distinct sets of identical files of one size end up in the same vector; files with
 *         identical contents are stored next to each other.
 */
 map<long, vector<string>> map_size_dupe(const map<long, vector<string>> &files) {
    map<long, vector<string>> dupes;

    for (const auto &[size, candidates] : files) {
        if (candidates.size() < 2) {
            continue; // a lone file cannot have a duplicate
        }

        vector<vector<string>> groups; // each inner vector holds files with identical contents
        set<string> seen;              // the same path listed twice is not a duplicate of itself

        for (const auto &candidate : candidates) {
            if (!seen.insert(candidate).second) {
                continue;
            }

            auto placed = false;

            for (auto &group : groups) {
                if (same_contents(group.front(), candidate)) {
                    group.push_back(candidate);
                    placed = true;
                    break;
                }
            }

            if (!placed) {
                groups.push_back(vector<string>{candidate});
            }
        }

        for (const auto &group : groups) {
            if (group.size() < 2) {
                continue;
            }

            auto &bucket = dupes[size];
            bucket.insert(bucket.end(), group.begin(), group.end());
        }
    }

    return dupes;
 }

 /**
 * @brief Writes every group of duplicate files to standard output
 * @param hashes An std::map whose values are the groups of paths to report; groups holding exactly one path are not printed
 */
 void print_duplicates(const map<long, vector<string>> &hashes) {
    for (auto group = hashes.cbegin(); group != hashes.cend(); ++group) {
        const vector<string> &members = group->second;

        if (members.size() != 1) {
            cout << "Found duplicates:" << endl;

            for (auto member = members.cbegin(); member != members.cend(); ++member) {
                cout << "\t- " << *member << endl;
            }

            cout << endl; // blank line separates consecutive groups
        }
    }
 }

 /**
 * @brief The main function of the application. This will control the processing of the program
 *
 * Prints a usage line when no directory is given. Arguments that are not directories are
 * reported on stderr and skipped; the remaining ones are de-duplicated and scanned.
 *
 * @param argc Argument count. This will be the amount of arguments passed in
 * @param argv Argument vector. This contains a pointer to C-string values
 * @return EXIT_SUCCESS, or EXIT_FAILURE when the scan is aborted by a filesystem error
 */

 int main(int argc, const char *argv[]) {
    if (argc < 2) { // mirror the Python implementation, which prints its usage in this case
        cout << "Usage: " << (argc > 0 ? argv[0] : "duplicates") << " <directory> [directories...]" << endl;
        return EXIT_SUCCESS;
    }

    set<string> args; // a set prevents scanning the same path twice
    for (auto i = 1; i < argc; ++i) { // index 0 is simply the path to this binary
        const string arg = argv[i];

        error_code ec; // non-throwing overload: a path that cannot be queried is treated as "not a directory"
        if (!filesystem::is_directory(arg, ec)) {
            cerr << quoted(arg) << " is not a directory (skipping)" << endl;
            continue;
        }

        args.insert(arg);
    }

    try {
        const auto files = map_file_size(args);
        const auto hashes = map_size_dupe(files);

        print_duplicates(hashes);
    } catch (const filesystem::filesystem_error &error) {
        cerr << error.what() << endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
 }
diff --git a/duplicates.py b/duplicates.py
 #!/usr/bin/python3

 import os, hashlib
 from sys import argv
 from collections import defaultdict

 def lappend(dictionary, keyfunc, value): # list append
    """Append ``value`` to the list that ``dictionary`` holds under the key ``keyfunc(value)``."""
    bucket_key = keyfunc(value)
    dictionary[bucket_key].append(value)

 def ffilter(files): # file filter
    """Return the values of ``files`` that hold more than one entry."""
    kept = []
    for group in files.values():
        if len(group) > 1: # groups with a single entry are discarded
            kept.append(group)
    return kept

 def fhash(filename, algorithm = hashlib.sha1): # file hash
    """Return the binary digest of the contents of ``filename``.

    The file is fed to the hash in fixed-size chunks so that large files are
    never loaded into memory as a whole.

    ``algorithm`` is a zero-argument callable returning a hashlib-style object
    (``update``/``digest``); it defaults to ``hashlib.sha1``.
    """
    filehash = algorithm()
    with open(filename, "rb") as f:
        # iter() with a sentinel keeps calling f.read until it returns b"" (end of file)
        for chunk in iter(lambda: f.read(1 << 20), b""):
            filehash.update(chunk)

    return filehash.digest()

 def main(paths):
    """Print every group of files below ``paths`` that have identical contents.

    Files are first grouped by size; only groups with more than one member are
    hashed, and files sharing a hash are printed as duplicates. Each real file
    is considered once, non-regular files are ignored, and files that cannot
    be read are reported on stderr and skipped.
    """
    import sys # local import: the module header only imports argv from sys

    files = defaultdict(list) # keep references to the files with the same sizes
    seen = set() # real paths already recorded, so a file reached twice is not its own duplicate
    for p in paths: # path
        for d, _, names in os.walk(p): # directory, (unused) sub-directories, file names
            for name in names:
                real = os.path.realpath(os.path.join(d, name)) # realpath follows (possible) symlinks
                if real in seen:
                    continue
                seen.add(real)

                if not os.path.isfile(real): # broken symlinks, sockets, FIFOs, ...
                    continue

                try:
                    lappend(files, os.path.getsize, real)
                except OSError as err:
                    print("Skipping %s: %s" % (real, err), file=sys.stderr)

    hashes = defaultdict(list) # contains the hashes for our files
    for fs in ffilter(files): # files
        for fn in fs: # filename
            try:
                lappend(hashes, fhash, fn)
            except OSError as err:
                print("Skipping %s: %s" % (fn, err), file=sys.stderr)

    for fs in ffilter(hashes): # files
        print("\nFound duplicates:")
        for fn in fs: # filename
            print("\t- %s" % fn)

 if __name__ == "__main__":
    # Script entry point: scan the given directories, or explain how to call the script.
    if len(argv) >= 2:
        main(set(argv[1:])) # prevent duplicate paths
    else:
        print("Usage: %s <directory> [directories...]" % argv[0])
	#include <map>
	#include <set>
	#include <string>
	#include <vector>
	#include <fstream>
	#include <iostream>
	#include <filesystem>

	using namespace std;

	/**
	* @brief Walks every directory in paths recursively and groups the regular files found by their size
	*
	* Directories or files that cannot be inspected (permission denied, removed while scanning, ...)
	* are skipped with a message on stderr instead of aborting the whole scan with an exception.
	*
	* @param paths A set of directory paths that will be deep searched for files
	* @return An std::map<long, std::vector<std::string>>, where long is the size in bytes, and std::vector<std::string> is whatever files had that size
	*/
	map<long, vector<string>> map_file_size(const set<string> &paths) {
	    map<long, vector<string>> files;
	    const filesystem::recursive_directory_iterator end;

	    for (const auto &path : paths) {
	        error_code walk_error;
	        filesystem::recursive_directory_iterator it(
	                path, filesystem::directory_options::skip_permission_denied, walk_error);

	        // increment(error_code&) reports traversal failures through walk_error instead of throwing
	        for (; !walk_error && it != end; it.increment(walk_error)) {
	            const auto &syspath = it->path();

	            error_code entry_error;
	            if (!filesystem::is_regular_file(syspath, entry_error) || entry_error) {
	                continue;
	            }

	            const auto size = filesystem::file_size(syspath, entry_error);
	            if (entry_error) {
	                continue; // e.g. the file disappeared between the two queries
	            }

	            // the map key type is fixed to long by the interface, so make the narrowing explicit
	            files[static_cast<long>(size)].push_back(syspath.string());
	        }

	        if (walk_error) {
	            cerr << "Skipping " << path << ": " << walk_error.message() << endl;
	        }
	    }

	    return files;
	}

	/**
	* @brief Compares the contents of two files block by block
	* @param lhs Path of the first file
	* @param rhs Path of the second file
	* @return true only when both files could be opened and hold exactly the same bytes
	*/
	static bool same_contents(const string &lhs, const string &rhs) {
	    ifstream a(lhs, ios::binary);
	    ifstream b(rhs, ios::binary);

	    if (!a.is_open() || !b.is_open()) {
	        return false; // an unreadable file cannot be proven identical to anything
	    }

	    char block_a[4096];
	    char block_b[4096];

	    for (;;) {
	        a.read(block_a, sizeof block_a);
	        b.read(block_b, sizeof block_b);

	        const auto got_a = a.gcount();
	        const auto got_b = b.gcount();

	        if (got_a != got_b || !equal(block_a, block_a + got_a, block_b)) {
	            return false;
	        }

	        if (got_a == 0) {
	            return true; // both streams are exhausted without a mismatch
	        }
	    }
	}

	/**
	* @brief Searches a map of files grouped by size, and performs byte by byte comparisons inside every group
	*
	* Every file is opened afresh for each comparison, and every file of a size group is compared
	* against the groups found so far, so duplicates are detected regardless of their position.
	*
	* @param files The files to compare, keyed by their size
	* @return An std::map<long, std::vector<std::string>>, where long is the size, and std::vector<std::string> is
	*         every file of that size that has at least one byte-identical twin. Because the key is the size,
	*         several distinct sets of identical files of one size end up in the same vector; files with
	*         identical contents are stored next to each other.
	*/
	map<long, vector<string>> map_size_dupe(const map<long, vector<string>> &files) {
	    map<long, vector<string>> dupes;

	    for (const auto &[size, candidates] : files) {
	        if (candidates.size() < 2) {
	            continue; // a lone file cannot have a duplicate
	        }

	        vector<vector<string>> groups; // each inner vector holds files with identical contents
	        set<string> seen;              // the same path listed twice is not a duplicate of itself

	        for (const auto &candidate : candidates) {
	            if (!seen.insert(candidate).second) {
	                continue;
	            }

	            auto placed = false;

	            for (auto &group : groups) {
	                if (same_contents(group.front(), candidate)) {
	                    group.push_back(candidate);
	                    placed = true;
	                    break;
	                }
	            }

	            if (!placed) {
	                groups.push_back(vector<string>{candidate});
	            }
	        }

	        for (const auto &group : groups) {
	            if (group.size() < 2) {
	                continue;
	            }

	            auto &bucket = dupes[size];
	            bucket.insert(bucket.end(), group.begin(), group.end());
	        }
	    }

	    return dupes;
	}

	/**
	* @brief Writes every group of duplicate files to standard output
	* @param hashes An std::map whose values are the groups of paths to report; groups holding exactly one path are not printed
	*/
	void print_duplicates(const map<long, vector<string>> &hashes) {
	    for (auto group = hashes.cbegin(); group != hashes.cend(); ++group) {
	        const vector<string> &members = group->second;

	        if (members.size() != 1) {
	            cout << "Found duplicates:" << endl;

	            for (auto member = members.cbegin(); member != members.cend(); ++member) {
	                cout << "\t- " << *member << endl;
	            }

	            cout << endl; // blank line separates consecutive groups
	        }
	    }
	}

	/**
	* @brief The main function of the application. This will control the processing of the program
	*
	* Prints a usage line when no directory is given. Arguments that are not directories are
	* reported on stderr and skipped; the remaining ones are de-duplicated and scanned.
	*
	* @param argc Argument count. This will be the amount of arguments passed in
	* @param argv Argument vector. This contains a pointer to C-string values
	* @return EXIT_SUCCESS, or EXIT_FAILURE when the scan is aborted by a filesystem error
	*/

	int main(int argc, const char *argv[]) {
	    if (argc < 2) { // mirror the Python implementation, which prints its usage in this case
	        cout << "Usage: " << (argc > 0 ? argv[0] : "duplicates") << " <directory> [directories...]" << endl;
	        return EXIT_SUCCESS;
	    }

	    set<string> args; // a set prevents scanning the same path twice
	    for (auto i = 1; i < argc; ++i) { // index 0 is simply the path to this binary
	        const string arg = argv[i];

	        error_code ec; // non-throwing overload: a path that cannot be queried is treated as "not a directory"
	        if (!filesystem::is_directory(arg, ec)) {
	            cerr << quoted(arg) << " is not a directory (skipping)" << endl;
	            continue;
	        }

	        args.insert(arg);
	    }

	    try {
	        const auto files = map_file_size(args);
	        const auto hashes = map_size_dupe(files);

	        print_duplicates(hashes);
	    } catch (const filesystem::filesystem_error &error) {
	        cerr << error.what() << endl;
	        return EXIT_FAILURE;
	    }

	    return EXIT_SUCCESS;
	}
	#!/usr/bin/python3

	import os, hashlib
	from sys import argv
	from collections import defaultdict

	def lappend(dictionary, keyfunc, value): # list append
	    """Append ``value`` to the list that ``dictionary`` holds under the key ``keyfunc(value)``."""
	    dictionary[keyfunc(value)].append(value)

	def ffilter(files): # file filter
	    """Return the values of ``files`` that hold more than one entry."""
	    return [ x for x in files.values() if len(x) > 1 ] # discards files which only occur once

	def fhash(filename, algorithm = hashlib.sha1): # file hash
	    """Return the binary digest of the contents of ``filename``.

	    The file is fed to the hash in fixed-size chunks so that large files are
	    never loaded into memory as a whole.

	    ``algorithm`` is a zero-argument callable returning a hashlib-style object
	    (``update``/``digest``); it defaults to ``hashlib.sha1``.
	    """
	    filehash = algorithm()
	    with open(filename, "rb") as f:
	        # iter() with a sentinel keeps calling f.read until it returns b"" (end of file)
	        for chunk in iter(lambda: f.read(1 << 20), b""):
	            filehash.update(chunk)

	    return filehash.digest()

	def main(paths):
	    """Print every group of files below ``paths`` that have identical contents.

	    Files are first grouped by size; only groups with more than one member are
	    hashed, and files sharing a hash are printed as duplicates. Each real file
	    is considered once, non-regular files are ignored, and files that cannot
	    be read are reported on stderr and skipped.
	    """
	    import sys # local import: the module header only imports argv from sys

	    files = defaultdict(list) # keep references to the files with the same sizes
	    seen = set() # real paths already recorded, so a file reached twice is not its own duplicate
	    for p in paths: # path
	        for d, _, names in os.walk(p): # directory, (unused) sub-directories, file names
	            for name in names:
	                real = os.path.realpath(os.path.join(d, name)) # realpath follows (possible) symlinks
	                if real in seen:
	                    continue
	                seen.add(real)

	                if not os.path.isfile(real): # broken symlinks, sockets, FIFOs, ...
	                    continue

	                try:
	                    lappend(files, os.path.getsize, real)
	                except OSError as err:
	                    print("Skipping %s: %s" % (real, err), file=sys.stderr)

	    hashes = defaultdict(list) # contains the hashes for our files
	    for fs in ffilter(files): # files
	        for fn in fs: # filename
	            try:
	                lappend(hashes, fhash, fn)
	            except OSError as err:
	                print("Skipping %s: %s" % (fn, err), file=sys.stderr)

	    for fs in ffilter(hashes): # files
	        print("\nFound duplicates:")
	        for fn in fs: # filename
	            print("\t- %s" % fn)

	if __name__ == "__main__":
	    # Script entry point: scan the given directories, or explain how to call the script.
	    if len(argv) < 2:
	        print("Usage: %s <directory> [directories...]" % argv[0])
	    else:
	        main(set(argv[1:])) # prevent duplicate paths