-
-
Save evantancy/c1a1defc7ff4030595fb67ae8820585a to your computer and use it in GitHub Desktop.
LangChain Directory Loader that respects .gitignore files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Loading logic for loading documents from a git directory respecting .gitignore files.""" | |
import logging | |
import fnmatch | |
from pathlib import Path | |
from typing import List, Type, Union | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
from langchain.document_loaders.html_bs import BSHTMLLoader | |
from langchain.document_loaders.text import TextLoader | |
from langchain.document_loaders.unstructured import UnstructuredFileLoader | |
# Loader classes accepted by GitDirectoryLoader for turning a single file
# into a list of documents.
FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]

# Name the logger after the module's dotted import name instead of its file
# path: logger names are dot-separated hierarchies, and a filesystem path
# (with separators and a ".py" suffix) produces a meaningless hierarchy that
# cannot be targeted by normal logging configuration.
logger = logging.getLogger(__name__)
def _load_gitignore_patterns(dir_path: Path) -> List[str]:
    """Return the ignore patterns listed in ``dir_path/.gitignore``.

    Blank lines and comment lines (first non-blank character is ``#``) are
    dropped; surrounding whitespace is removed from every remaining pattern.

    Args:
        dir_path: Directory that may contain a ``.gitignore`` file.

    Returns:
        The patterns in file order, or an empty list when ``dir_path`` has no
        ``.gitignore`` file.
    """
    gitignore_path = dir_path / ".gitignore"
    if not gitignore_path.is_file():
        return []

    # Decode explicitly instead of relying on the platform locale.
    # "utf-8-sig" also drops a leading byte-order mark, which would otherwise
    # become part of the first pattern; undecodable bytes are replaced so a
    # stray byte cannot abort the whole load.
    text = gitignore_path.read_text(encoding="utf-8-sig", errors="replace")

    patterns = []
    for raw_line in text.splitlines():
        pattern = raw_line.strip()
        # Test the stripped text so an indented "# comment" is recognised as a
        # comment rather than being returned as a pattern.
        if pattern and not pattern.startswith("#"):
            patterns.append(pattern)
    return patterns
class GitDirectoryLoader(BaseLoader):
    """Load documents from a directory tree while honouring .gitignore files.

    Every regular file whose name is not matched by an applicable ignore
    pattern is handed to ``loader_cls`` and the resulting documents are
    collected. Patterns from a directory's ``.gitignore`` apply to that
    directory and, when recursing, to everything below it.

    Matching is done with ``fnmatch`` against the entry *name* only. Patterns
    ending in ``/`` apply to directories only. Negation (``!pattern``) and
    patterns containing a path (``docs/build``) are not interpreted.
    """

    def __init__(
        self,
        path: str,
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        recursive: bool = False,
    ):
        """Configure the loader.

        Args:
            path: Root directory to load from.
            silent_errors: When true, a failure to load a file is logged as a
                warning and the file is skipped; otherwise the exception
                propagates.
            load_hidden: When true, sub-directories whose name starts with
                ``.`` are descended into as well. Hidden *files* are loaded
                regardless of this flag.
            loader_cls: Loader class instantiated with each file path.
            recursive: When true, sub-directories are traversed; otherwise
                only files directly inside ``path`` are loaded.
        """
        self.path = path
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.silent_errors = silent_errors
        self.recursive = recursive

    @staticmethod
    def _is_ignored(name: str, patterns: List[str], is_dir: bool = False) -> bool:
        """Return True when ``name`` matches any of the ignore ``patterns``."""
        for pattern in patterns:
            if pattern.endswith("/"):
                # A trailing slash marks a directory-only pattern; it can
                # never match a file name.
                if not is_dir:
                    continue
                pattern = pattern.rstrip("/")
            if fnmatch.fnmatch(name, pattern):
                return True
        return False

    def _load(self, p: Path, gitignore_patterns: List[str]) -> List[Document]:
        """Load the documents found in ``p``.

        Args:
            p: Directory to scan.
            gitignore_patterns: Ignore patterns inherited from the parent
                directories plus those of ``p`` itself.

        Returns:
            The documents produced for every non-ignored file in ``p`` and,
            when ``recursive`` is set, in its non-ignored sub-directories.

        Raises:
            Exception: Whatever ``loader_cls`` raises, unless
                ``silent_errors`` is set.
        """
        docs: List[Document] = []
        for entry in p.iterdir():
            # Git's own metadata is never part of the working tree content,
            # so skip it even when hidden directories are requested.
            if entry.name == ".git":
                continue

            if entry.is_file():
                if self._is_ignored(entry.name, gitignore_patterns):
                    continue
                try:
                    docs.extend(self.loader_cls(str(entry)).load())
                except Exception as error:
                    if not self.silent_errors:
                        # Bare raise keeps the original traceback intact.
                        raise
                    logger.warning(error)
            elif entry.is_dir() and self.recursive:
                # Ignore hidden directories unless the load_hidden flag is set.
                if entry.name.startswith(".") and not self.load_hidden:
                    continue
                # Prune ignored directories (e.g. "node_modules/") instead of
                # walking into them and loading their contents.
                if self._is_ignored(entry.name, gitignore_patterns, is_dir=True):
                    continue
                subdir_patterns = gitignore_patterns + _load_gitignore_patterns(entry)
                docs.extend(self._load(entry, subdir_patterns))
        return docs

    def load(self) -> List[Document]:
        """Load all documents under ``self.path``, applying .gitignore rules."""
        root = Path(self.path)
        return self._load(root, _load_gitignore_patterns(root))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment