Skip to content

Instantly share code, notes, and snippets.

@chasemc
Created April 3, 2023 18:10
Show Gist options
  • Save chasemc/0f326627bcf65b3aedd31b07638b801d to your computer and use it in GitHub Desktop.
Save chasemc/0f326627bcf65b3aedd31b07638b801d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import hashlib
from pathlib import Path
import argparse
from collections import defaultdict
import glob, os
from rich.progress import (
Progress,
SpinnerColumn,
MofNCompleteColumn,
)
import pandas as pd
# Command-line interface.
#
# The two numeric options declare an explicit ``type``: without one, argparse
# returns command-line values as strings (only the defaults would be numbers),
# and a string cannot be used as a read length or compared against a file size.
parser = argparse.ArgumentParser(description="Find duplicated files")
parser.add_argument(
    "--indir",
    metavar="filepath",
    help="input_dir",
    required=True,
)
# Destination of the CSV report written by main().
parser.add_argument(
    "--outpath",
    metavar="filepath",
    help="output_path",
    required=True,
)
# Number of leading bytes of each file that are fed to the hash.
parser.add_argument(
    "--max_bytes",
    metavar="int",
    help="max_bytes",
    type=int,
    required=False,
    default=1000,
)
# Files whose size in bytes is not strictly greater than this are ignored.
# Parsed as float so the value matches the 1e9 default and scientific notation
# (e.g. ``--min_filesize 5e8``) is accepted alongside plain integers.
parser.add_argument(
    "--min_filesize",
    metavar="int",
    help="min_filesize",
    type=float,
    required=False,
    default=1e9,
)
def hashy(path_to_file, max_bytes=1000, min_filesize=1e9):
    """Return a cheap duplicate-detection key for a file, or ``False``.

    The key is ``"<sha256 hex of the first max_bytes bytes>_<size in bytes>"``.
    It is a partial hash: two files that share a key have the same size and
    the same leading bytes, but are not guaranteed to be identical.

    Args:
        path_to_file: Path of the file to fingerprint.
        max_bytes: Number of leading bytes to read and hash.
        min_filesize: Size threshold in bytes; only files strictly larger
            than this are fingerprinted.

    Returns:
        The key string, or ``False`` when the file is not larger than
        ``min_filesize``.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    # Stat exactly once so the size that passed the threshold is the same
    # size embedded in the key, and do it before opening so files that are
    # too small are never opened at all.
    f_size = Path(path_to_file).stat().st_size
    if f_size <= min_filesize:
        return False
    with open(path_to_file, "rb") as f:
        digest = hashlib.sha256(f.read(max_bytes)).hexdigest()
    return f"{digest}_{f_size}"
def mm(starting_dir, pg, task, **kwargs):
    """Walk ``starting_dir`` recursively and yield ``(path, key)`` pairs.

    Every regular, non-symlink file found by the recursive glob is passed to
    ``hashy``; files for which ``hashy`` returns a truthy key are yielded.

    Args:
        starting_dir: Directory to scan.
        pg: Progress object; ``pg.update(task, advance=1)`` is called once
            per regular file examined.
        task: Task identifier forwarded to ``pg.update``.
        **kwargs: Forwarded unchanged to ``hashy``.

    Yields:
        ``(filename, key)`` tuples for files that produced a key.
    """
    # Escape the directory so characters such as "[" or "*" in its name are
    # matched literally instead of being interpreted as glob syntax.
    pattern = f"{glob.escape(str(starting_dir))}/**"
    for filename in glob.iglob(pattern, recursive=True):
        if not os.path.isfile(filename) or os.path.islink(filename):
            continue
        try:
            key = hashy(filename, **kwargs)
        except OSError:
            # Best effort: a file that vanished or cannot be read is skipped.
            # Only OSError is caught so that programming errors,
            # KeyboardInterrupt and generator shutdown are not swallowed.
            key = False
        pg.update(task, advance=1)
        # The yield sits outside the try block so exceptions raised into the
        # generator at this point are never intercepted by the handler above.
        if key:
            yield (filename, key)
def main():
    """Scan ``--indir`` for likely duplicate files and write a CSV report.

    Files are grouped by the key produced by ``mm`` (partial hash plus size).
    Groups with fewer than two files are discarded. The CSV written to
    ``--outpath`` has the columns ``size`` (bytes divided by 1e9, rounded to
    four decimals), ``not_full_hash`` and ``path``, sorted by size and hash in
    descending order. When no duplicates are found, a header-only CSV is
    written.
    """
    args = parser.parse_args()
    progress_columns = (
        SpinnerColumn(spinner_name="runner"),
        MofNCompleteColumn(),
    )
    groups = defaultdict(list)
    with Progress(*progress_columns) as pg:
        task = pg.add_task("Progress...", total=None)
        for path, key in mm(
            args.indir,
            pg,
            task,
            # Coerce explicitly: argparse yields strings for options that do
            # not declare a type, and these values must be numeric.
            max_bytes=int(args.max_bytes),
            min_filesize=float(args.min_filesize),
        ):
            groups[key].append(path)

    # One row per file that shares its key with at least one other file.
    records = []
    for key, paths in groups.items():
        if len(paths) < 2:
            continue
        # Keys have the form "<hexdigest>_<size>"; the digest has no "_".
        digest, _, size = key.rpartition("_")
        records.extend((int(size), digest, path) for path in paths)

    df = pd.DataFrame(records, columns=["size", "not_full_hash", "path"])
    # Explicit 64-bit integers so multi-gigabyte sizes fit regardless of the
    # platform's default int width; this also gives the column a numeric
    # dtype when there are no rows.
    df = df.astype({"size": "int64"})
    df["size"] = (df["size"] / 1e9).round(4)
    df.set_index(["size", "not_full_hash"], inplace=True)
    df.sort_index(inplace=True, ascending=False)
    df.to_csv(args.outpath)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment