Created
November 24, 2022 09:37
-
-
Save hay/de8f14e787fedfe504417e4b3e8a2613 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from bs4 import BeautifulSoup | |
from dataknead import Knead | |
from pathlib import Path | |
import requests | |
import sys | |
import urllib.request | |
# API token passed to Builder in main(); the value below is a placeholder
# that must be replaced with a real key before running the script.
API_KEY = "your-api-key-here" | |
class Builder:
    """Fetch issues from the API and split them into static files.

    Typical use is ``get_issues()`` followed by ``parse_issues()``.

    Attributes are documented inline below.
    """

    # Base URL of the remote API
    API_ENDPOINT = "https://www.getrevue.co/api"
    # Seconds to wait for the API before giving up instead of hanging forever
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key):
        """Create a builder.

        Args:
            api_key: Token sent in the ``Authorization`` header of API calls.
        """
        # Honor the argument rather than silently falling back to a global.
        self.api_key = api_key
        # Filled by get_issues(); None until then.
        self.issues = None

    def get_issues(self):
        """Download all issues and store them on ``self.issues``.

        The raw response is also written to ``data/issues.json``.

        Raises:
            Exception: If the response status is not 200, the JSON payload is
                empty, or the first entry has no ``html`` key.
        """
        url = f"{self.API_ENDPOINT}/v2/issues"
        req = requests.get(
            url,
            headers={"Authorization": f"Token {self.api_key}"},
            timeout=self.REQUEST_TIMEOUT,
        )

        if req.status_code != 200:
            raise Exception(f"Invalid status code: {req.status_code}")

        data = req.json()

        # Check if data is valid
        if not data:
            raise Exception("No data in JSON")

        # Also check for the html property; a non-list payload cannot be
        # indexed by position, so report it the same way.
        if not isinstance(data, list) or "html" not in data[0]:
            raise Exception("Valid JSON, but no html attribute")

        # Make sure the output directory exists before writing into it.
        Path("data").mkdir(parents=True, exist_ok=True)
        Knead(data).write("data/issues.json")

        self.issues = data

    def parse_issues(self):
        """Create an index and split out the html to separate files.

        For every issue the HTML is written to ``issues/<id>.html`` with all
        ``<img>`` tags replaced by a placeholder image. The ``html`` value of
        each issue is then set to ``None`` (this mutates ``self.issues``) and
        the remaining data is written to ``data/index.json``.

        Raises:
            Exception: If no issues have been loaded yet.
        """
        if self.issues is None:
            raise Exception("No issues loaded, call get_issues() first")

        # Make sure both output directories exist before writing into them.
        Path("issues").mkdir(parents=True, exist_ok=True)
        Path("data").mkdir(parents=True, exist_ok=True)

        for issue in self.issues:
            issue_id = issue["id"]
            path = f"issues/{issue_id}.html"
            print(f"Parsing {issue_id}")

            # Replace all images with the 'image-missing' image
            soup = BeautifulSoup(issue["html"], "lxml")
            for img in soup.select("img"):
                img["src"] = "img/image-missing.svg"
                img["class"] = "image-missing"
                img["alt"] = "Deze afbeelding is niet beschikbaar"

            html = soup.prettify()

            # Write issue html; explicit UTF-8 so non-ASCII text does not
            # depend on the platform's default encoding.
            with open(path, "w", encoding="utf-8") as f:
                f.write(html)

            print(f"Wrote {path}")

            # The html now lives in its own file, keep the index small.
            issue["html"] = None

        Knead(self.issues).write("data/index.json", indent=4)
def main():
    """Run the full export: download every issue, then split them to files."""
    exporter = Builder(API_KEY)

    # Step 1: fetch the issue list from the API
    print("Getting all issues")
    exporter.get_issues()

    # Step 2: write per-issue HTML files and the index
    print("Now parsing")
    exporter.parse_issues()
# Only run the export when executed as a script, not when imported.
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment