Skip to content

Instantly share code, notes, and snippets.

@vitorio
Created September 12, 2024 20:44
Show Gist options
  • Save vitorio/f8c98083da6428cb15cc05fce70bca22 to your computer and use it in GitHub Desktop.
Save vitorio/f8c98083da6428cb15cc05fce70bca22 to your computer and use it in GitHub Desktop.
Scrape the Pinboard UI's "bmarks" JSON into a pickled Python dict (requires Python 3, requests, and bs4)
import requests
import bs4
import pickle
import os
import json
import time
import random
# Base URL of the Pinboard web UI; every request path below is appended to it.
PINBOARD_URL = 'https://pinboard.in'
# Account credentials used for the login POST and to build the /u:<user>/ URL.
# Set the PINBOARD_USER / PINBOARD_PASS environment variables to keep them out
# of this file; when unset, the placeholder defaults below are used and must
# be edited by hand.
PINBOARD_USER = os.environ.get('PINBOARD_USER', 'YOUR_PINBOARD_USERNAME')
PINBOARD_PASS = os.environ.get('PINBOARD_PASS', 'YOUR_PINBOARD_PASSWORD')
# Log in only when no cookie jar has been saved by a previous run.
# On success the session cookies are pickled to 'pinboard-cookies.pickle';
# on any failure the script stops here instead of continuing without cookies.
if not os.path.exists('pinboard-cookies.pickle'):
    session = requests.Session()
    # Redirects are not followed so the 302 target can be inspected: the
    # Location header is what distinguishes a good login from a bad one.
    response = session.post(
        PINBOARD_URL + '/auth/',
        data={'username': PINBOARD_USER, 'password': PINBOARD_PASS},
        allow_redirects=False,
    )
    if response.status_code != 302:
        print(response.status_code, '???')
        raise SystemExit(
            'Login failed: expected HTTP 302 from /auth/, got {}'.format(
                response.status_code))

    # .get() avoids a KeyError if the redirect carries no Location header.
    rloc = response.headers.get('Location', '')
    print(rloc)
    if rloc == '/u:' + PINBOARD_USER + '/':
        print('Login successful, redirecting to known good user URL, saving cookies')
        print(session.cookies.keys())
        with open('pinboard-cookies.pickle', 'wb') as f:
            pickle.dump(session.cookies, f)
    elif 'bad+login' in rloc:
        print('Failed login, being redirected to homepage')
        raise SystemExit('Login failed: Pinboard rejected the credentials')
    else:
        print('???')
        raise SystemExit(
            'Login failed: unexpected redirect target {!r}'.format(rloc))
# Build a fresh session from the saved cookie jar, fetch the user's first
# bookmarks page, and verify that the page really belongs to PINBOARD_USER
# before any scraping starts.
session = requests.Session()
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a cookie file that this script wrote itself.
with open('pinboard-cookies.pickle', 'rb') as f:
    session.cookies.update(pickle.load(f))

response = session.get(PINBOARD_URL + '/u:' + PINBOARD_USER + '/')
print(response.status_code)
if response.status_code != 200:
    raise RuntimeError(
        'Unexpected HTTP status {} fetching the bookmarks page'.format(
            response.status_code))

soup = bs4.BeautifulSoup(response.content, 'html.parser')

# Explicit check instead of `assert`, which is stripped under `python -O`.
# find() returns None when the element is absent, so guard before `.text`.
small_username = soup.find('div', {'class': 'small_username'})
if small_username is None or small_username.text.strip() != PINBOARD_USER:
    # Presumably the saved cookies are stale or invalid when this happens;
    # deleting 'pinboard-cookies.pickle' forces a new login on the next run.
    raise RuntimeError(
        'Fetched page does not show {!r} as the logged-in user'.format(
            PINBOARD_USER))

# The count is informational only, so a missing element is not fatal.
bookmark_count = soup.find('span', {'class': 'bookmark_count'})
if bookmark_count is not None:
    print(bookmark_count.text, 'bookmarks')
else:
    print('bookmark count not found on page')
# Walk the bookmark pages from newest to oldest by following the
# 'top_earlier' link, collecting every `bmarks[<id>] = {...}` JSON object
# found in the page's <script> tags into the `bmarks` dict (keyed by the
# object's 'id'), and pickling the dict to 'pinboard-bmarks.pickle'.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the statement syntax.
NO_MORE_EARLIER = False
bmarks = {}
while not NO_MORE_EARLIER:
    print(len(bmarks), 'bmarks parsed so far')

    # A page without the 'top_earlier' link is treated as the last one.
    top_earlier = soup.find('a', {'id': 'top_earlier'})
    if top_earlier is None:
        NO_MORE_EARLIER = True
        print('This is the last page')
    else:
        print('Next earlier URL', PINBOARD_URL + top_earlier['href'])

    print('Parsing current page')
    for a in soup.find_all('script'):
        if 'var bmarks={};' not in a.text:
            continue
        # Each chunk is expected to look like `<id>] = {json}`.
        rough_bmarks = a.text.split(';bmarks[')
        print(len(rough_bmarks), 'in bmarks JSON')
        for b in rough_bmarks:
            # maxsplit=1 keeps the JSON intact when a bookmark's own text
            # happens to contain '] = '.
            pieces = b.split('] = ', 1)
            try:
                c = json.loads(pieces[1].strip(';'))
            except (IndexError, ValueError) as err:
                # Show the raw chunk; it may not contain '] = ' at all, so
                # it must not be indexed again here.
                print(b)
                raise RuntimeError('Could not parse a bmarks entry') from err
            if c['id'] not in bmarks:
                bmarks[c['id']] = c
            else:
                print('Duplicate bmark in JSON')

    # Rewritten after every page, so the file always holds what has been
    # collected so far.
    with open('pinboard-bmarks.pickle', 'wb') as f:
        pickle.dump(bmarks, f)

    if top_earlier is None:
        print(len(bmarks), 'final tally')
    else:
        # Random 1-5 second pause between page requests.
        time.sleep(random.uniform(1, 5))
        response = session.get(PINBOARD_URL + top_earlier['href'])
        print(response.status_code)
        if response.status_code != 200:
            # The original `raise()` raised a TypeError, hiding the real
            # cause; report the HTTP status instead.
            raise RuntimeError(
                'Unexpected HTTP status {} fetching {}'.format(
                    response.status_code, top_earlier['href']))
        soup = bs4.BeautifulSoup(response.content, 'html.parser')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment