Last active
December 30, 2024 05:46
-
-
Save hitripod/87d25310c0a2469c68603e07d6c35621 to your computer and use it in GitHub Desktop.
This script fetches all pages from a GitBook URL and saves their content to a single text file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from langchain_community.document_loaders import GitbookLoader | |
def save_gitbook_to_txt(url: str, output_file: str) -> bool:
    """Fetch every page of a GitBook and write the text to one file.

    The pages are loaded with ``GitbookLoader(url, load_all_paths=True)``;
    their ``page_content`` values are joined with a blank line between them
    and written to ``output_file`` as UTF-8, overwriting any existing file.

    This function is best-effort: any exception raised while loading or
    writing is caught and reported on stderr rather than propagated.

    Args:
        url: Root URL of the GitBook to fetch.
        output_file: Path of the text file to create or overwrite.

    Returns:
        True if the content was written, False if an error occurred.
    """
    try:
        # Initialize the loader with all paths enabled
        loader = GitbookLoader(url, load_all_paths=True)

        # Load all pages from the GitBook
        all_pages_data = loader.load()

        # Combine all page content into a single string
        combined_content = "\n\n".join(doc.page_content for doc in all_pages_data)

        # Save the combined content to a file; the file is only opened once
        # loading has succeeded, so a failed fetch leaves no empty output file.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(combined_content)
    except Exception as e:
        # Diagnostics go to stderr so they do not mix with normal output.
        print(f"An error occurred: {e}", file=sys.stderr)
        return False

    print(f"Successfully saved {len(all_pages_data)} pages to {output_file}.")
    return True
if __name__ == "__main__":
    # Command-line entry point: takes one GitBook URL and saves its content
    # to "gitbook_content.txt" in the current working directory.

    # Presumably read by the loader library to set the HTTP User-Agent.
    # NOTE(review): this runs after the langchain_community import; if the
    # library reads USER_AGENT at import time, this is too late — confirm.
    os.environ['USER_AGENT'] = 'myagent'

    # Exactly one positional argument (the GitBook URL) is required.
    if len(sys.argv) != 2:
        # Usage errors belong on stderr, not in the normal output stream.
        print("Usage: python fetch_gitbook.py <GitBook URL>", file=sys.stderr)
        sys.exit(1)

    gitbook_url = sys.argv[1]
    output_file = os.path.join(os.getcwd(), "gitbook_content.txt")
    save_gitbook_to_txt(gitbook_url, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment