Last active
April 11, 2023 16:13
-
-
Save Shubhang/c3172f24513eb36772d70763219f35a8 to your computer and use it in GitHub Desktop.
Group by Similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import shutil
from difflib import SequenceMatcher
def similarity(a, b):
    """Return the similarity ratio of sequences *a* and *b*.

    The value is ``difflib.SequenceMatcher(None, a, b).ratio()``: a float in
    the range [0.0, 1.0], where 1.0 means the two sequences are identical.
    """
    return SequenceMatcher(None, a, b).ratio()
def main(base_dir='.', similarity_threshold=0.6, prefix_length=10):
    """Group files under *base_dir* into folders by file-name similarity.

    Every file found by walking *base_dir* is compared, by the first
    *prefix_length* characters of its name, against the prefix that founded
    each existing group. The file joins the first group whose prefix scores
    at least *similarity_threshold* with ``similarity``; otherwise it founds
    a new group. Each group is then moved into
    ``<base_dir>/<group prefix>_group``.

    Args:
        base_dir: Directory to search (recursively) and to create the group
            folders in.
        similarity_threshold: Minimum ``similarity`` ratio for a file to
            join an existing group.
        prefix_length: Number of leading file-name characters compared.
    """
    # Maps each group's founding prefix to the full paths of its members.
    # Full paths are stored at discovery time because the directory a file
    # was found in is no longer known once the walk has moved on.
    file_groups = {}

    for root, _, files in os.walk(base_dir):
        for file in files:
            file_prefix = file[:prefix_length]
            file_path = os.path.join(root, file)
            for group_prefix, group_paths in file_groups.items():
                if similarity(file_prefix, group_prefix) >= similarity_threshold:
                    group_paths.append(file_path)
                    break
            else:
                # No existing group was similar enough: start a new one.
                file_groups[file_prefix] = [file_path]

    # Move the files into their respective folders based on similarity.
    # This runs only after the walk has finished, so the folders created
    # here are never visited by the traversal above.
    for group_prefix, group_paths in file_groups.items():
        new_folder_path = os.path.join(base_dir, group_prefix + "_group")
        os.makedirs(new_folder_path, exist_ok=True)
        for file_path in group_paths:
            new_file_path = os.path.join(
                new_folder_path, os.path.basename(file_path)
            )
            # A file already sitting in its group folder (e.g. on a second
            # run) has nothing to move.
            if os.path.abspath(file_path) == os.path.abspath(new_file_path):
                continue
            # NOTE(review): files with the same name from different
            # directories map to the same destination; whether an existing
            # destination is overwritten depends on shutil.move / os.rename
            # semantics on the platform -- confirm the desired policy.
            shutil.move(file_path, new_file_path)
# Run the grouping only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment