Created
January 6, 2024 19:31
-
-
Save sabatale/754019aa244a5f02ef53b321512c3083 to your computer and use it in GitHub Desktop.
Generic JSON Loader for Langchain Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""See https://github.com/langchain-ai/langchain/issues/4396 for original author.""" | |
"""Loader that loads data from JSON.""" | |
import json | |
from pathlib import Path | |
from typing import Callable, Dict, List, Optional, Union | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
class JSONLoader(BaseLoader):
    """Load a JSON file and turn every scalar leaf into its own ``Document``.

    The parsed JSON is flattened recursively. Each leaf value becomes one
    line of text of the form ``"<dotted.key.path>: <value>"``; elements of a
    list share the key path of the list that contains them. A leaf that has
    no key path at all (for example an element of a top-level JSON array of
    scalars) is rendered as just ``"<value>"``.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        content_key: Optional[str] = None,
    ):
        """Store the resolved path of the JSON file to load.

        Args:
            file_path: Location of the JSON file; resolved to an absolute
                path immediately.
            content_key: Stored as ``self._content_key``. No method of this
                class reads it, so it currently has no effect on loading.
        """
        self.file_path = Path(file_path).resolve()
        self._content_key = content_key

    def create_documents(self, processed_data) -> List[Document]:
        """Wrap each entry of ``processed_data`` in a ``Document``.

        Args:
            processed_data: Iterable of strings (as produced by
                ``process_json``). Each entry may also be an iterable of
                string fragments, which are concatenated.

        Returns:
            One ``Document`` per entry, each with empty metadata.
        """
        # "".join(...) leaves a plain string unchanged and concatenates an
        # iterable of fragments, so both input shapes keep working.
        return [
            Document(page_content="".join(item), metadata={})
            for item in processed_data
        ]

    def process_item(self, item, prefix: str = "") -> List[str]:
        """Recursively flatten ``item`` into ``"path: value"`` strings.

        Args:
            item: A parsed JSON value (dict, list or scalar).
            prefix: Dotted key path accumulated so far; empty at the root.

        Returns:
            A flat list with one string per scalar leaf found under ``item``.
        """
        if isinstance(item, dict):
            lines = []
            for key, value in item.items():
                # Extend the dotted path; at the root the key stands alone.
                child_prefix = f"{prefix}.{key}" if prefix else key
                lines.extend(self.process_item(value, child_prefix))
            return lines

        if isinstance(item, list):
            lines = []
            for value in item:
                # List elements inherit the path of the list itself.
                lines.extend(self.process_item(value, prefix))
            return lines

        if prefix == "":
            # No key path to report: emit the bare value rather than a
            # string that starts with a dangling ": " separator.
            return [str(item)]
        return [f"{prefix}: {item}"]

    def process_json(self, data) -> List[str]:
        """Flatten a parsed JSON document into a list of leaf strings.

        Args:
            data: The value returned by ``json.load``.

        Returns:
            The flattened leaf strings when ``data`` is a list or a dict;
            an empty list for any other top-level value.
        """
        if isinstance(data, (list, dict)):
            # process_item walks lists and dicts itself, starting from an
            # empty prefix, so no separate top-level loop is needed.
            return self.process_item(data)
        return []

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file.

        Returns:
            One ``Document`` per scalar leaf in the file. If the file is not
            valid JSON, an error message is printed and an empty list is
            returned.
        """
        with open(self.file_path, mode="r", encoding="utf-8") as json_file:
            try:
                # Only the parse can raise JSONDecodeError, so only the
                # parse is guarded.
                data = json.load(json_file)
            except json.JSONDecodeError:
                print("Error: Invalid JSON format in the file.")
                return []
        return self.create_documents(self.process_json(data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment