Created
September 12, 2024 20:44
-
-
Save vitorio/f8c98083da6428cb15cc05fce70bca22 to your computer and use it in GitHub Desktop.
Scrape the Pinboard UI's "bmarks" JSON into a pickled Python dict (req. python3, requests, bs4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import pickle
import random
import sys
import time

import bs4
import requests
# Scrape the Pinboard web UI's per-page "bmarks" JavaScript objects into a
# pickled Python dict keyed by bookmark id.
#
# Flow: log in once and cache the session cookies on disk, verify the cached
# session still belongs to PINBOARD_USER, then walk the "earlier" pagination
# links, parsing the bmarks JSON embedded in each page's <script> tags.

PINBOARD_URL = 'https://pinboard.in'
PINBOARD_USER = 'YOUR_PINBOARD_USERNAME'
PINBOARD_PASS = 'YOUR_PINBOARD_PASSWORD'

# Files written to the current working directory.
COOKIE_FILE = 'pinboard-cookies.pickle'
BMARKS_FILE = 'pinboard-bmarks.pickle'


def login_and_save_cookies():
    """Log in to Pinboard and pickle the session cookies to COOKIE_FILE.

    The login POST is sent without following redirects so that the
    ``Location`` header can be inspected to tell success from failure.

    Returns:
        True if the login redirected to the user's page and the cookies were
        saved, False otherwise (a diagnostic is printed in that case).
    """
    session = requests.Session()
    response = session.post(
        PINBOARD_URL + '/auth/',
        data={'username': PINBOARD_USER, 'password': PINBOARD_PASS},
        allow_redirects=False,
    )
    if response.status_code != 302:
        print(response.status_code, '???')
        return False

    rloc = response.headers['Location']
    print(rloc)
    if rloc == '/u:' + PINBOARD_USER + '/':
        print('Login successful, redirecting to known good user URL, saving cookies')
        print(session.cookies.keys())
        with open(COOKIE_FILE, 'wb') as f:
            pickle.dump(session.cookies, f)
        return True

    if 'bad+login' in rloc:
        print('Failed login, being redirected to homepage')
    else:
        print('???')
    return False


def fetch_soup(session, url):
    """GET *url* with *session* and return the parsed HTML.

    Raises:
        RuntimeError: if the response status is not 200.
    """
    response = session.get(url)
    print(response.status_code)
    if response.status_code != 200:
        raise RuntimeError(
            'Unexpected HTTP status {} for {}'.format(response.status_code, url)
        )
    return bs4.BeautifulSoup(response.content, 'html.parser')


def parse_bmarks_script(script_text):
    """Extract the bookmark objects assigned in a ``bmarks`` script body.

    The text is expected to look like
    ``var bmarks={};bmarks[ID] = {...};bmarks[ID] = {...};``.

    Args:
        script_text: the text content of the ``<script>`` element.

    Returns:
        A list of decoded JSON objects, in the order they appear.

    Raises:
        ValueError: if an assignment's right-hand side is not valid JSON.
    """
    records = []
    for fragment in script_text.split(';bmarks['):
        # Split on the FIRST '] = ' only, so the same character sequence
        # occurring inside a bookmark's own strings cannot truncate the JSON.
        _, separator, payload = fragment.partition('] = ')
        if not separator:
            # No assignment here, e.g. the leading 'var bmarks={}' preamble.
            continue
        # Drop surrounding whitespace before removing the statement's ';'.
        payload = payload.strip().rstrip(';')
        try:
            records.append(json.loads(payload))
        except ValueError as err:  # json.JSONDecodeError subclasses ValueError
            raise ValueError('Could not parse bmarks entry: ' + payload) from err
    return records


def main():
    """Scrape every page of PINBOARD_USER's bookmarks into BMARKS_FILE.

    Returns:
        The dict of bookmark records keyed by each record's ``id``.

    Raises:
        SystemExit: if no cookie file exists and logging in fails.
        RuntimeError: if a page fetch fails or the session is not logged in
            as PINBOARD_USER.
    """
    if not os.path.exists(COOKIE_FILE) and not login_and_save_cookies():
        # Without cookies there is nothing to load below; stop cleanly.
        sys.exit('Login failed; no cookie file was written')

    session = requests.Session()
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cookie file that this script itself wrote.
    with open(COOKIE_FILE, 'rb') as f:
        session.cookies.update(pickle.load(f))

    soup = fetch_soup(session, PINBOARD_URL + '/u:' + PINBOARD_USER + '/')

    # Explicit check rather than assert, which is stripped under ``python -O``.
    small_username = soup.find('div', {'class': 'small_username'})
    if small_username is None or small_username.text.strip() != PINBOARD_USER:
        raise RuntimeError('Page does not show the expected logged-in user')

    bookmark_count = soup.find('span', {'class': 'bookmark_count'})
    if bookmark_count is not None:
        print(bookmark_count.text, 'bookmarks')

    bmarks = {}
    while True:
        print(len(bmarks), 'bmarks parsed so far')
        top_earlier = soup.find('a', {'id': 'top_earlier'})
        if top_earlier is None:
            print('This is the last page')
        else:
            print('Next earlier URL', PINBOARD_URL + top_earlier['href'])

        print('Parsing current page')
        for script in soup.find_all('script'):
            if 'var bmarks={};' not in script.text:
                continue
            records = parse_bmarks_script(script.text)
            print(len(records), 'in bmarks JSON')
            for record in records:
                if record['id'] in bmarks:
                    print('Duplicate bmark in JSON')
                else:
                    bmarks[record['id']] = record

        # Save after every page so an interrupted run keeps what it has.
        with open(BMARKS_FILE, 'wb') as f:
            pickle.dump(bmarks, f)

        if top_earlier is None:
            break

        # Random pause between page requests.
        time.sleep(random.uniform(1, 5))
        soup = fetch_soup(session, PINBOARD_URL + top_earlier['href'])

    print(len(bmarks), 'final tally')
    return bmarks


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment