#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Find likely duplicate files by hashing the first bytes of large files.

Walks a directory tree, builds a cheap signature (SHA-256 of the first
max_bytes bytes, plus the exact file size) for every file larger than
min_filesize, and writes the groups that share a signature to a CSV.
"""
import argparse
import glob
import hashlib
import os
from collections import defaultdict
from pathlib import Path

import pandas as pd
from rich.progress import (
    Progress,
    SpinnerColumn,
    MofNCompleteColumn,
)

parser = argparse.ArgumentParser(description="Find duplicated files")
parser.add_argument(
    "--indir",
    metavar="filepath",
    help="directory to scan recursively",
    required=True,
)
parser.add_argument(
    "--outpath",
    metavar="filepath",
    help="path of the output CSV",
    required=True,
)
parser.add_argument(
    "--max_bytes",
    metavar="int",
    type=int,
    help="number of leading bytes to hash per file (default: 1000)",
    default=1000,
)
parser.add_argument(
    "--min_filesize",
    metavar="int",
    type=float,
    help="only consider files larger than this many bytes (default: 1e9)",
    default=1e9,
)

def hashy(path_to_file, max_bytes=1000, min_filesize=1e9):
    """Return a cheap duplicate signature for a large file, else None.

    The signature joins the SHA-256 of the first max_bytes bytes with the
    exact file size, so it is a fast screen, not proof of identical content.
    """
    f_size = Path(path_to_file).stat().st_size
    if f_size <= min_filesize:
        return None
    with open(path_to_file, "rb") as f:
        return f"{hashlib.sha256(f.read(max_bytes)).hexdigest()}_{f_size}"

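# Illustrative call (the path and digest below are hypothetical; real output
# depends on the file's contents and size):
#   hashy("/data/backup.iso", max_bytes=1000, min_filesize=1e9)
#   -> "9f2c..._2147483648" for a ~2 GB file, or None at or below the threshold.
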
def mm(starting_dir, pg, task, **kwargs):
    """Yield (path, signature) for every hashable regular file under starting_dir."""
    for filename in glob.iglob(f"{starting_dir}/**", recursive=True):
        if os.path.isfile(filename) and not os.path.islink(filename):
            try:
                signature = hashy(filename, **kwargs)
                if signature:
                    yield (filename, signature)
            except OSError:
                # Skip files that disappear or cannot be read mid-scan.
                pass
        pg.update(task, advance=1)

def main():
    args = parser.parse_args()
    progress_columns = (
        SpinnerColumn(spinner_name="runner"),
        MofNCompleteColumn(),
    )
    signatures = defaultdict(list)
    with Progress(*progress_columns) as pg:
        task = pg.add_task("Progress...", total=None)
        for path, signature in mm(
            args.indir,
            pg,
            task,
            max_bytes=args.max_bytes,
            min_filesize=args.min_filesize,
        ):
            signatures[signature].append(path)
    # Keep only signatures shared by at least two files.
    to_drop = [k for k, v in signatures.items() if len(v) < 2]
    for k in to_drop:
        del signatures[k]
    # One row per (signature, path): melt the ragged lists into long format.
    df = (
        pd.DataFrame.from_dict(signatures, orient="index")
        .rename_axis("not_full_hash")
        .reset_index()
        .melt(id_vars=["not_full_hash"], value_name="path")
        .drop("variable", axis=1)
        .dropna()
    )
    # The signature embeds the exact byte size after the trailing "_".
    df[["not_full_hash", "size"]] = df.not_full_hash.str.split("_", expand=True)
    df = df.astype({"size": "int"})
    # Convert bytes to gigabytes (the original 1000 / 10000 / 100 chain == 1e9).
    df["size"] = (df["size"] / 1e9).round(4)
    df.set_index(["size", "not_full_hash"], inplace=True)
    df.sort_index(inplace=True, ascending=False)
    df.to_csv(args.outpath)


if __name__ == "__main__":
    main()
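For reference, a typical invocation looks like this (the script filename and paths are hypothetical; the thresholds shown are the defaults):

    python find_duplicates.py --indir /data/archive --outpath duplicates.csv --max_bytes 1000 --min_filesize 1000000000

The script depends on rich and pandas (pip install rich pandas). The resulting CSV lists candidate duplicates grouped by shared signature, indexed by size in GB in descending order.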