Created October 14, 2020 18:49
HN Upvotes Backup and JSONify
Two scripts: a bash loop that downloads your Hacker News upvoted-submissions pages as HTML, and a Python parser that extracts each story into a single data.json file.
#!/usr/bin/env bash
set -euo pipefail

# Session cookie copied from a logged-in browser session (value elided)
COOKIE="user=verdverm&..."
USERNAME="verdverm"
BASEURL="https://news.ycombinator.com/upvoted"
PAGE=0

mkdir -p html

# Fetch upvoted pages one at a time until interrupted (Ctrl-C);
# see the sketch after this script for an optional stop condition.
while true; do
  URL="${BASEURL}?id=${USERNAME}&p=${PAGE}"
  NUM=$(printf "%06d" "$PAGE")
  echo "fetching page $NUM -- $(date)"
  curl -s -b "${COOKIE}" "$URL" > "html/page-${NUM}.html"

  # loop update
  PAGE=$((PAGE+1))
  sleep 1
done
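The loop above runs until you interrupt it. If you want it to stop on its own, one option is to break once a fetched page contains no story rows; the Python parser below selects stories by the class "athing", so the same marker can double as an emptiness test. A minimal sketch, to be placed inside the while loop after the curl call (the assumption that pages past the end of the list contain no "athing" rows is untested):

  # optional stop condition: assume pages past the end of the upvoted
  # list contain no story rows ("athing" is the class the parser keys on)
  if ! grep -q 'class="athing"' "html/page-${NUM}.html"; then
    echo "page $NUM looks empty -- stopping"
    break
  fi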
#!/usr/bin/env python3
import json
from os import walk

from bs4 import BeautifulSoup

AllItems = []

def parseFile(fn):
    with open(fn, 'r') as content_file:
        content = content_file.read()
    soup = BeautifulSoup(content, 'html.parser')
    # The upvoted-story rows live in a table with class "itemlist"
    upvotes = soup.find(class_="itemlist")
    parseItems(upvotes)

def parseItems(upvotes):
    # Each story row carries the class "athing"
    items = upvotes.find_all(class_="athing")
    for item in items:
        parseItem(item)

def parseItem(item):
    itemID = item.get("id")
    story = item.find(class_="storylink")
    href = story.get("href")
    title = story.contents[0]
    try:
        site = item.find(class_="sitestr").contents[0]
    except AttributeError:
        # Self posts (Ask HN, Show HN text posts, etc.) have no site label
        site = "user"
    saveItem(itemID, href, title, site)

def saveItem(itemID, href, title, site):
    item = {
        "id": itemID,
        "href": href,
        "title": title,
        "site": site,
    }
    AllItems.append(item)

# Read data
for (dirpath, dirnames, filenames) in walk("html"):
    for fn in filenames:
        print("parsing: " + fn)
        parseFile("html/" + fn)

# Write data
with open('data.json', 'w') as outfile:
    json.dump(AllItems, outfile)
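For completeness, a possible end-to-end run, assuming the two files are saved as fetch.sh and parse.py (hypothetical names; the gist doesn't name them) and jq is installed for inspecting the output:

  ./fetch.sh            # Ctrl-C once pages come back empty
  python3 parse.py      # parses html/ and writes data.json
  jq length data.json   # how many upvoted stories were captured
  jq '.[0]' data.json   # peek at the first record

Each record has the shape {"id": ..., "href": ..., "title": ..., "site": ...}, matching the dict built in saveItem.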