Python script for scraping Del.icio.us and exporting all bookmarks
import re
import json
import argparse

import mechanize
from lxml import html
def run(username, password):
    browser = mechanize.Browser()
    browser.set_handle_robots(False)  # del.icio.us robots.txt would otherwise block mechanize
    login(browser, username, password)
    page = 1
    all_bookmarks = []
    # Walk the paginated bookmark listing until a page comes back empty.
    while True:
        bookmarks = list(get_bookmarks(browser, username, page))
        if not bookmarks:
            break
        all_bookmarks += bookmarks
        page += 1
    if all_bookmarks:
        with open('bookmarks.json', 'w') as fp:
            json.dump(all_bookmarks, fp, indent=4)
def get_bookmarks(browser, username, page): | |
print "browsing page %d" % page | |
url = 'https://del.icio.us/%s?&page=%s' % (username, page) | |
browser.open(url) | |
content = browser.response().read() | |
doc = html.fromstring(content) | |
for article_block in doc.xpath('//div[@class="articleThumbBlock "]'): | |
title = _get_title(article_block) | |
link = _get_link(article_block) | |
domain = _get_domain(article_block) | |
saved_by = _get_saved_by(article_block) | |
date_saved = _get_date_saved(article_block) | |
tags = _get_tags(article_block) | |
yield {'title': title, 'link': link, 'domain': domain, 'saved_by': saved_by, 'date_saved': date_saved, | |
'tags': tags} | |
def login(browser, username, password):
    print("logging in")
    login_url = 'https://del.icio.us/login'
    browser.open(login_url)
    browser.select_form(id="user-login-form")
    browser.form['username'] = username
    browser.form['password'] = password
    browser.submit()
def _get_title(article_block):
    return article_block.find('div[@class="articleTitlePan"]/h3/a').text


def _get_link(article_block):
    return article_block.find('div[@class="articleInfoPan"]/p[1]').find('a').attrib['href']


def _get_domain(article_block):
    return article_block.find('div[@class="articleInfoPan"]/p[2]').find('a').attrib['href']


def _get_date_saved(article_block):
    # The date is the text that trails the "saved by" link; strip the leading " on ".
    text = article_block.find('div[@class="articleInfoPan"]/p[3]').find('a').tail
    return re.sub(r'^\s(on)\s', '', text)


def _get_saved_by(article_block):
    return 'https://del.icio.us' + article_block.find('div[@class="articleInfoPan"]/p[3]').find('a').attrib['href']


def _get_tags(article_block):
    return [tag.text for tag in article_block.iterfind('div[@class="thumbTBriefTxt"]/ul/li/a')]
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--username', required=True, help="Delicious.com username")
    arg_parser.add_argument('--password', required=True, help="Delicious.com password")
    parsed_args = arg_parser.parse_args()
    run(parsed_args.username, parsed_args.password)
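
The script needs mechanize and lxml installed (e.g. via pip). As a quick sanity check of the export, the sketch below reads the generated bookmarks.json back in and prints the first few entries; it assumes the script above has already run successfully and written bookmarks.json to the current working directory.

import json

# Load the export produced by the scraper above (assumes bookmarks.json exists
# in the current working directory).
with open('bookmarks.json') as fp:
    bookmarks = json.load(fp)

print("exported %d bookmarks" % len(bookmarks))
for bookmark in bookmarks[:5]:
    # Each entry carries the fields yielded by get_bookmarks():
    # title, link, domain, saved_by, date_saved, tags.
    tags = ', '.join(t for t in bookmark['tags'] if t)
    print("%s -> %s (tags: %s)" % (bookmark['title'], bookmark['link'], tags))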