Skip to content

Instantly share code, notes, and snippets.

@tarekbadrsh
Last active July 26, 2024 07:09
Show Gist options
  • Save tarekbadrsh/e6aa6cde277cf1af9031e245b0b82d58 to your computer and use it in GitHub Desktop.
Save tarekbadrsh/e6aa6cde277cf1af9031e245b0b82d58 to your computer and use it in GitHub Desktop.
Multi-File Commenter and Combiner
import os
import fnmatch
def get_comment_prefix(filename):
    """Return the (opening, closing) comment markers for a file name.

    The markers are chosen from the file extension. The lookup is
    case-insensitive, so 'MAIN.PY' and 'main.py' get the same markers.
    Files with an unknown or missing extension fall back to ('#', '#').

    Args:
        filename: A file name or path; only its extension is inspected.

    Returns:
        A 2-tuple of strings: the marker placed before the annotation text
        and the marker placed after it.
    """
    extension_to_comment = {
        '.asm': (';', ';'),
        '.awk': ('#', '#'),
        '.c': ('//', '//'),
        '.clj': (';;', ';;'),
        '.cpp': ('//', '//'),
        '.css': ('/*', '*/'),
        '.cs': ('//', '//'),
        '.dart': ('//', '//'),
        '.dockerfile': ('#', '#'),
        '.ex': ('#', '#'),
        '.erl': ('%', '%'),
        '.fs': ('//', '//'),
        '.f90': ('!', '!'),
        '.go': ('//', '//'),
        '.groovy': ('//', '//'),
        '.hs': ('--', '--'),
        '.html': ('<!--', '-->'),
        '.java': ('//', '//'),
        '.js': ('//', '//'),
        '.jl': ('#', '#'),
        '.kt': ('//', '//'),
        '.latex': ('%', '%'),
        '.lisp': (';', ';'),
        '.lua': ('--', '--'),
        '.mk': ('#', '#'),
        '.md': ('<!--', '-->'),
        '.m': ('%', '%'),
        '.mm': ('//', '//'),
        # OCaml has no '//' line comment; its comments are (* ... *).
        '.ml': ('(*', '*)'),
        '.pas': ('//', '//'),
        '.pl': ('#', '#'),
        '.php': ('//', '//'),
        '.plain': ('#', '#'),
        '.ps1': ('#', '#'),
        '.py': ('##', '##'),
        '.r': ('#', '#'),
        '.rb': ('#', '#'),
        '.rs': ('//', '//'),
        '.scala': ('//', '//'),
        '.scm': (';', ';'),
        '.sed': ('#', '#'),
        '.sh': ('#', '#'),
        '.st': ('"', '"'),
        '.sql': ('--', '--'),
        '.swift': ('//', '//'),
        '.ts': ('//', '//'),
        '.tsx': ('//', '//'),
        '.vb': ("'", "'"),
        '.xml': ('<!--', '-->'),
        '.yaml': ('#', '#'),
        '.yml': ('#', '#'),
    }
    _, ext = os.path.splitext(filename)
    # Table keys are lower-case; normalise so upper-case extensions match.
    return extension_to_comment.get(ext.lower(), ('#', '#'))
def read_gitignore(directory):
    """Collect ignore patterns for ``directory``.

    The result always starts with '.git', '.gitignore' and '.dockerignore'.
    If ``directory`` contains a regular file named '.gitignore', every
    non-blank line that is not a comment is appended, with surrounding
    whitespace removed.

    Args:
        directory: Path of the directory whose '.gitignore' should be read.

    Returns:
        A list of pattern strings.
    """
    result = [".git", ".gitignore", ".dockerignore"]
    gitignore_path = os.path.join(directory, '.gitignore')
    # isfile (not exists) so a directory named '.gitignore' is not opened.
    if os.path.isfile(gitignore_path):
        # Explicit UTF-8 keeps the result independent of the platform's
        # locale encoding; undecodable bytes are replaced instead of
        # aborting the whole run.
        with open(gitignore_path, 'r', encoding='utf-8',
                  errors='replace') as gitignore_file:
            for raw_line in gitignore_file:
                pattern = raw_line.strip()
                # Test the stripped text so indented comments are skipped too.
                if pattern and not pattern.startswith('#'):
                    result.append(pattern)
    return result
def should_ignore(path, ignore_patterns):
    """Return True if ``path`` matches any of ``ignore_patterns``.

    Matching uses ``fnmatch`` globbing, in which '*' also matches path
    separators. A pattern matches when it matches:

    * the whole path as given,
    * the final path component, or
    * the trailing part of the path (so 'src/*.log' matches
      '/project/src/app.log').

    Leading and trailing '/' on a pattern are ignored for the last two
    checks, so .gitignore-style entries such as 'build/' or '/build' match
    an entry named 'build'. This is an approximation of .gitignore rules;
    negation ('!pattern') is not supported.

    Args:
        path: File or directory path to test.
        ignore_patterns: Iterable of glob pattern strings.

    Returns:
        True if at least one pattern matches, otherwise False.
    """
    name = os.path.basename(path)
    for raw_pattern in ignore_patterns:
        if fnmatch.fnmatch(path, raw_pattern):
            return True
        # A base name never contains '/', so slash-decorated patterns can
        # only match once the decoration is removed.
        pattern = raw_pattern.strip('/')
        if not pattern:
            continue
        if fnmatch.fnmatch(name, pattern) or fnmatch.fnmatch(path, '*/' + pattern):
            return True
    return False
def read_and_save_files(directory, output_file, ignore_patterns):
    """Concatenate every non-ignored file under ``directory`` into one file.

    For each file a header line is written first, wrapping the file path in
    ``---FILE_PATH---`` markers and the comment markers returned by
    ``get_comment_prefix``; the file's text and a newline follow.

    Directories matching ``ignore_patterns`` are not descended into. The
    output file itself is never read back in. Files that cannot be opened
    or decoded as UTF-8 are reported on stdout and skipped.

    Args:
        directory: Root directory to walk.
        output_file: Path of the combined file to (over)write.
        ignore_patterns: Glob patterns passed to ``should_ignore``.
    """
    output_abspath = os.path.abspath(output_file)
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for root, dirs, files in os.walk(directory):
            # Prune in place so os.walk does not descend into ignored dirs.
            dirs[:] = [d for d in dirs if not should_ignore(
                os.path.join(root, d), ignore_patterns)]
            for filename in files:
                full_path = os.path.join(root, filename)
                # The output usually lives inside `directory`; reading it
                # while it is being written would copy it into itself.
                if os.path.abspath(full_path) == output_abspath:
                    continue
                if should_ignore(full_path, ignore_patterns):
                    continue
                # Read before writing anything, so an unreadable file does
                # not leave a header with no content behind it.
                try:
                    with open(full_path, 'r', encoding='utf-8') as infile:
                        content = infile.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Error reading file {full_path}: {e}")
                    continue
                comment_prefix = get_comment_prefix(filename)
                outfile.write(
                    f"{comment_prefix[0]}---FILE_PATH---{full_path}---FILE_PATH---{comment_prefix[1]}\n")
                outfile.write(content)
                outfile.write("\n")
if __name__ == "__main__":
    # Normalise the input: strip stray whitespace, expand '~', and make the
    # path absolute. Without this, a trailing slash or '.' gives an empty or
    # hidden output file name, because basename() would return '' or '.'.
    entered_path = input("Enter the directory path: ").strip()
    if not entered_path:
        raise SystemExit("No directory path entered.")
    directory = os.path.abspath(os.path.expanduser(entered_path))
    if not os.path.isdir(directory):
        raise SystemExit(f"Not a directory: {directory}")
    output_file = os.path.join(
        directory, f"{os.path.basename(directory)}_output.txt")
    # Read patterns from .gitignore if it exists
    ignore_patterns = read_gitignore(directory)
    read_and_save_files(directory, output_file, ignore_patterns)
    print(f"Output saved to {output_file}")
@tarekbadrsh
Copy link
Author

# Multi-File Commenter and Combiner

This Python script is designed to read multiple files from a specified directory, determine the appropriate comment prefix for each file based on its extension, and then combine the contents of these files into a single output file. Each file's content is prefixed with a comment indicating its original path, making it easy to trace back to the source file.

## Features

1. **Comment Prefix Detection**: The script automatically detects the comment syntax for various file types, ensuring that the combined output maintains the correct formatting for each file's language.
2. **File Combination**: All files from the specified directory (including subdirectories) are read and their contents are written to a single output file.
3. **Path Annotation**: Each file's content in the output is preceded by a comment line that contains the original file path, aiding in identification and traceability.

## Usage

1. Run the script.
2. Enter the directory path when prompted.
3. The script will generate an output file named `<directory_name>_output.txt` within the specified directory, containing the combined contents of all files with appropriate comments.

## Supported File Types

The script supports a wide range of file types including but not limited to:
- `.asm`, `.awk`, `.c`, `.clj`, `.cpp`, `.css`, `.cs`, `.dart`, `.dockerfile`, `.ex`, `.erl`, `.fs`, `.f90`, `.go`, `.groovy`, `.hs`, `.html`, `.java`, `.js`, `.jl`, `.kt`, `.latex`, `.lisp`, `.lua`, `.mk`, `.md`, `.m`, `.mm`, `.ml`, `.pas`, `.pl`, `.php`, `.plain`, `.ps1`, `.py`, `.r`, `.rb`, `.rs`, `.scala`, `.scm`, `.sed`, `.sh`, `.st`, `.sql`, `.swift`, `.ts`, `.tsx`, `.vb`, `.xml`, `.yaml`

## Example

```bash
Enter the directory path: /path/to/your/directory
```

The output file `/path/to/your/directory/directory_name_output.txt` will contain:

```
##---FILE_PATH---/path/to/your/directory/file1.py---FILE_PATH---##
# File content of file1.py

//---FILE_PATH---/path/to/your/directory/subdir/file2.cpp---FILE_PATH---//
// File content of file2.cpp
```

This script is useful for code review, backup, or documentation purposes where combining multiple files into one with clear annotations is beneficial.


This description provides a clear overview of the script's functionality, its usage, and the types of files it supports, making it suitable for sharing on platforms like GitHub Gist.

@EmadAnwer
Copy link

You can ignore directories or files by editing the `ignore_dirs` and `ignore_files` lists:

def read_and_save_files(directory, output_file):
    """Concatenate the files under ``directory`` into ``output_file``.

    Directories whose name is listed in ``ignore_dirs`` are not descended
    into, and files whose name is listed in ``ignore_files`` are skipped.
    Names are compared exactly, so 'venv' does not also exclude a directory
    such as 'convenv' and '.git' does not exclude '.gitattributes'.

    Each included file is written as a header line (the path wrapped in
    ``---FILE_PATH---`` markers and the comment markers returned by
    ``get_comment_prefix``), followed by the file's text and a newline.
    Files that cannot be opened or decoded as UTF-8 are reported on stdout
    and skipped. The output file itself is never read back in.

    Args:
        directory: Root directory to walk.
        output_file: Path of the combined file to (over)write.
    """
    # Edit these two lists to control what is left out of the output.
    ignore_dirs = ["__pycache__", "venv", ".git", ".vscode", ".idea"]
    ignore_files = [
        "collect_all_files.py",
        "README.md",
        "LICENSE",
        ".gitignore",
        ".dockerignore",
    ]

    output_abspath = os.path.abspath(output_file)
    with open(output_file, "w", encoding="utf-8") as outfile:
        for root, dirs, files in os.walk(directory):
            # Prune in place so os.walk never enters ignored directories.
            dirs[:] = [d for d in dirs if d not in ignore_dirs]
            for filename in files:
                if filename in ignore_files:
                    continue
                full_path = os.path.join(root, filename)
                # Compare full paths so only the real output file is
                # skipped, not same-named files in other directories.
                if os.path.abspath(full_path) == output_abspath:
                    continue
                # Read first, so a failed read leaves no orphan header.
                try:
                    with open(full_path, "r", encoding="utf-8") as infile:
                        content = infile.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Error reading file {full_path}: {e}")
                    continue
                comment_prefix = get_comment_prefix(filename)
                outfile.write(
                    f"{comment_prefix[0]}---FILE_PATH---{full_path}---FILE_PATH---{comment_prefix[1]}\n"
                )
                outfile.write(content)
                outfile.write("\n")

@tarekbadrsh
Copy link
Author

It would be better to let users specify what to include or exclude through the contents of .gitignore. Additionally, we should add the .git directory to the ignore list explicitly, since .gitignore itself does not list it.

@EmadAnwer
Copy link

EmadAnwer commented Jul 20, 2024

It would be better if we allow users to specify what to include or exclude by using the content of .gitignore. Additionally, we should consider adding the .git directory, since it is not ignored by .gitignore.

I was thinking about it, but it will be tricky to implement if you need to support every .gitignore pattern feature. I will try to implement an initial version of it.

@EmadAnwer
Copy link

@tarekbadrsh Check this

import os
import fnmatch


def get_comment_prefix(filename):
    """Return the (opening, closing) comment markers for a file name.

    The markers are chosen from the file extension. The lookup is
    case-insensitive, so "MAIN.PY" and "main.py" get the same markers.
    Files with an unknown or missing extension fall back to ("#", "#").

    Args:
        filename: A file name or path; only its extension is inspected.

    Returns:
        A 2-tuple of strings: the marker placed before the annotation text
        and the marker placed after it.
    """
    extension_to_comment = {
        ".asm": (";", ";"),
        ".awk": ("#", "#"),
        ".c": ("//", "//"),
        ".clj": (";;", ";;"),
        ".cpp": ("//", "//"),
        ".css": ("/*", "*/"),
        ".cs": ("//", "//"),
        ".dart": ("//", "//"),
        ".dockerfile": ("#", "#"),
        ".ex": ("#", "#"),
        ".erl": ("%", "%"),
        ".fs": ("//", "//"),
        ".f90": ("!", "!"),
        ".go": ("//", "//"),
        ".groovy": ("//", "//"),
        ".hs": ("--", "--"),
        ".html": ("<!--", "-->"),
        ".java": ("//", "//"),
        ".js": ("//", "//"),
        ".jl": ("#", "#"),
        ".kt": ("//", "//"),
        ".latex": ("%", "%"),
        ".lisp": (";", ";"),
        ".lua": ("--", "--"),
        ".mk": ("#", "#"),
        ".md": ("<!--", "-->"),
        ".m": ("%", "%"),
        ".mm": ("//", "//"),
        # OCaml has no "//" line comment; its comments are (* ... *).
        ".ml": ("(*", "*)"),
        ".pas": ("//", "//"),
        ".pl": ("#", "#"),
        ".php": ("//", "//"),
        ".plain": ("#", "#"),
        ".ps1": ("#", "#"),
        ".py": ("##", "##"),
        ".r": ("#", "#"),
        ".rb": ("#", "#"),
        ".rs": ("//", "//"),
        ".scala": ("//", "//"),
        ".scm": (";", ";"),
        ".sed": ("#", "#"),
        ".sh": ("#", "#"),
        ".st": ('"', '"'),
        ".sql": ("--", "--"),
        ".swift": ("//", "//"),
        ".ts": ("//", "//"),
        ".tsx": ("//", "//"),
        ".vb": ("'", "'"),
        ".xml": ("<!--", "-->"),
        ".yaml": ("#", "#"),
        ".yml": ("#", "#"),
    }

    _, ext = os.path.splitext(filename)
    # Table keys are lower-case; normalise so upper-case extensions match.
    return extension_to_comment.get(ext.lower(), ("#", "#"))


def dot_ignore_loader(directory):
    """Load ignore patterns from the ignore files found in ``directory``.

    Reads '.gitignore' and then '.dockerignore' when they exist as regular
    files. Blank lines and lines starting with '#' (after stripping
    surrounding whitespace) are skipped; every other line is returned with
    surrounding whitespace removed.

    Args:
        directory: Directory that may contain the ignore files.

    Returns:
        A list of pattern strings, in file order; empty if neither file
        exists.
    """
    # The earlier misspelled ".gitingore" entry was dropped: no tool writes
    # a file with that name, so it never contributed any patterns.
    ignore_files = [".gitignore", ".dockerignore"]
    ignore_lines = []
    for ignore_file in ignore_files:
        ignore_file_path = os.path.join(directory, ignore_file)
        # isfile (not exists) so a directory with that name is not opened.
        if not os.path.isfile(ignore_file_path):
            continue
        # Explicit UTF-8 keeps the result independent of the platform's
        # locale encoding; undecodable bytes are replaced, not fatal.
        with open(ignore_file_path, "r", encoding="utf-8", errors="replace") as infile:
            for line in infile:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                ignore_lines.append(line)
    return ignore_lines


def is_ignored(path, patterns):
    """Return True if ``path`` is covered by any of ``patterns``.

    Matching uses ``fnmatch`` globbing, in which "*" also matches path
    separators. A pattern covers the path when it matches:

    * the whole path as given,
    * any single component of the path (so "venv" or "node_modules/"
      covers every file below such a directory), or
    * a leading directory prefix (so "docs/build" covers
      "docs/build/index.html").

    Leading and trailing "/" on a pattern are ignored for the last two
    checks. This is an approximation of .gitignore rules; negation
    ("!pattern") is not supported.

    Args:
        path: Path to test, normally relative to the project root.
        patterns: Iterable of glob pattern strings.

    Returns:
        True if at least one pattern covers the path, otherwise False.
    """
    components = [part for part in os.path.normpath(path).split(os.sep) if part]
    for raw_pattern in patterns:
        if fnmatch.fnmatch(path, raw_pattern):
            return True
        # Path components never contain "/", so slash-decorated patterns
        # such as "build/" can only match once the decoration is removed.
        pattern = raw_pattern.strip("/")
        if not pattern:
            continue
        if any(fnmatch.fnmatch(part, pattern) for part in components):
            return True
        if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path, pattern + "/*"):
            return True
    return False


def read_and_save_files(directory, output_file):
    """Concatenate every non-ignored file under ``directory`` into one file.

    Ignore patterns come from ``dot_ignore_loader(directory)`` plus the
    ``additional_ignore_patterns`` listed below. They are tested with
    ``is_ignored`` against paths relative to ``directory``. Ignored
    directories are not descended into.

    Each included file is written as a header line (the path wrapped in
    ``---FILE_PATH---`` markers and the comment markers returned by
    ``get_comment_prefix``), followed by the file's text and a newline.
    Files that cannot be opened or decoded as UTF-8 are reported on stdout
    and skipped. The output file itself is never read back in.

    Args:
        directory: Root directory to walk.
        output_file: Path of the combined file to (over)write.
    """
    ignore_patterns = dot_ignore_loader(directory)

    # Add additional ignore patterns here, written like .gitignore patterns.
    additional_ignore_patterns = [
        ".git*",
        ".vscode*",
        ".gitignore",
    ]
    ignore_patterns.extend(additional_ignore_patterns)

    output_abspath = os.path.abspath(output_file)
    with open(output_file, "w", encoding="utf-8") as outfile:
        for root, dirs, files in os.walk(directory):
            # Prune in place so os.walk never enters ignored directories
            # (for example .git or a virtual environment).
            dirs[:] = [
                d
                for d in dirs
                if not is_ignored(
                    os.path.relpath(os.path.join(root, d), directory),
                    ignore_patterns,
                )
            ]
            for filename in files:
                full_path = os.path.join(root, filename)
                rel_path = os.path.relpath(full_path, directory)
                if is_ignored(rel_path, ignore_patterns):
                    continue
                # Compare full paths so only the real output file is
                # skipped, not same-named files in other directories.
                if os.path.abspath(full_path) == output_abspath:
                    continue
                # Read first, so a failed read leaves no orphan header.
                try:
                    with open(full_path, "r", encoding="utf-8") as infile:
                        content = infile.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Error reading file {full_path}: {e}")
                    continue
                comment_prefix = get_comment_prefix(filename)
                outfile.write(
                    f"{comment_prefix[0]}---FILE_PATH---{full_path}---FILE_PATH---{comment_prefix[1]}\n"
                )
                outfile.write(content)
                outfile.write("\n")


if __name__ == "__main__":
    # Normalise the input: strip stray whitespace, expand "~", and make the
    # path absolute. Without this, a trailing slash or "." gives an empty or
    # hidden output file name, because basename() would return "" or ".".
    entered_path = input("Enter the directory path: ").strip()
    if not entered_path:
        raise SystemExit("No directory path entered.")
    directory = os.path.abspath(os.path.expanduser(entered_path))
    if not os.path.isdir(directory):
        raise SystemExit(f"Not a directory: {directory}")
    output_file = os.path.join(directory, f"{os.path.basename(directory)}_output.txt")
    read_and_save_files(directory, output_file)

@tarekbadrsh
Copy link
Author

Thanks, @EmadAnwer, for your suggestions on using .gitignore in the script.
I have made some updates to your code and hope they are helpful for everyone.

Cheers 🍻

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment