Skip to content

Instantly share code, notes, and snippets.

@odewahn
Last active December 12, 2024 20:30
Show Gist options
  • Save odewahn/83265d5641ce9f469295fde3bce769d9 to your computer and use it in GitHub Desktop.
Save odewahn/83265d5641ce9f469295fde3bce769d9 to your computer and use it in GitHub Desktop.
Grab content from ORM api
import aiohttp
import sys
import asyncio
import os
# Before you start, get a content token from https://learning.oreilly.com/account/api-tokens/
# You'll need the account manager role on your user membership. If you're not an account manager,
# you'll need to request one from the U&A team or from someone who can create tokens.
# Then create an environment variable called ORM_AUTH_TOKEN with the token as the value.
async def async_fetch_url(session, url, format="json"):
    """Fetch *url* with the API token header and return the decoded body.

    Args:
        session: An open ``aiohttp.ClientSession`` (or any object whose
            ``get(url, headers=...)`` returns an async context manager
            yielding a response).
        url: The URL to request.
        format: ``"html"`` returns the body as text; any other value
            (default ``"json"``) returns the parsed JSON document.

    Returns:
        The response text for ``format="html"``, otherwise the result of
        the response's ``json()`` call.

    Raises:
        RuntimeError: If the ``ORM_AUTH_TOKEN`` environment variable is
            unset or empty.
        Exception: Whatever the response's ``raise_for_status()`` raises
            for an error status (``aiohttp.ClientResponseError`` with a
            real aiohttp response).
    """
    token = os.getenv("ORM_AUTH_TOKEN")
    if not token:
        # Without this guard the request would be sent with the literal
        # header "Token None" and fail with an opaque server-side error.
        raise RuntimeError(
            "ORM_AUTH_TOKEN environment variable is not set; "
            "create a content token and export it before running."
        )
    headers = {"Authorization": f"Token {token}"}
    async with session.get(url, headers=headers) as r:
        # Surface HTTP errors here rather than handing an error payload to
        # callers that immediately index into the result.
        r.raise_for_status()
        if format == "html":
            return await r.text()
        return await r.json()
# Grab the metadata for a book, then get all metadata for each chapter, then get the content of each chapter
async def fetch_book(work):
    """Download a book's metadata, its chapters' metadata, and chapter bodies.

    Three request phases share one client session: the book record for
    *work*, then every URL listed under the record's "chapters" key, then
    each chapter's "content" URL fetched as HTML. The second and third
    phases are each issued concurrently with ``asyncio.gather``.

    Returns:
        A tuple ``(metadata, chapters_metadata, chapters_content)``. The two
        chapter lists are in the same order, so entry *i* of the content
        list belongs to entry *i* of the chapter metadata list.
    """
    book_url = f"https://learning.oreilly.com/api/v1/book/{work}/"
    async with aiohttp.ClientSession() as http:
        metadata = await async_fetch_url(http, book_url)

        # Phase 2: one metadata request per chapter URL in the book record.
        metadata_requests = [
            async_fetch_url(http, chapter_url)
            for chapter_url in metadata["chapters"]
        ]
        chapters_metadata = await asyncio.gather(*metadata_requests)

        # Phase 3: the HTML body of each chapter, in chapter-metadata order.
        content_requests = [
            async_fetch_url(http, info["content"], "html")
            for info in chapters_metadata
        ]
        chapters_content = await asyncio.gather(*content_requests)

        return metadata, chapters_metadata, chapters_content
if __name__ == "__main__":
    # The work identifier is the single required command-line argument;
    # exit with a usage message instead of an IndexError traceback when
    # it is missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} WORK_ID")
    _metadata, _chapters_metadata, chapters_content = asyncio.run(
        fetch_book(sys.argv[1])
    )
    # Print each chapter's HTML body to stdout in chapter order.
    for chapter in chapters_content:
        print(chapter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment