Created
December 17, 2010 16:56
-
-
Save fish2000/745264 to your computer and use it in GitHub Desktop.
Exporting from delicious.com (née del.icio.us) gives you crap HTML; this script reorganizes it into a JSON structure suitable for the high standards we have for data structures here at the end of 2010.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Get the export from https://secure.delicious.com/settings/bookmarks/export
# using the default options.
# The file will be named with a datestamp, like 'delicious-YYYYMMDD.htm'.
import os, re, site, sys | |
from BeautifulSoup import BeautifulSoup | |
try: | |
import json | |
except ImportError: | |
import simplejson as json | |
# Fallback export location used when no path is given on the command line.
DEFAULT_EXPORT = '/Users/fish/Downloads/delicious-20101216.htm'


def resolve_input_path(argv):
    """Return the export file to read, given a ``sys.argv``-style list.

    Everything after the program name is joined with single spaces, so a
    path containing spaces can be passed unquoted.  When no arguments are
    given, ``DEFAULT_EXPORT`` is returned.
    """
    joined = ' '.join(argv[1:])
    if joined:
        return joined
    return DEFAULT_EXPORT


def output_path_for(export_path):
    """Return the path of the JSON file written for *export_path*.

    The final extension (``.htm``, ``.html``, ...) is swapped for ``.json``.
    Using ``os.path.splitext`` rather than a ``htm$`` substitution means an
    ``.html`` input no longer maps onto itself.
    """
    root, _ext = os.path.splitext(export_path)
    return root + '.json'


def extract_bookmarks(soup):
    """Build a ``{href: record}`` dict from a parsed bookmarks export.

    *soup* must provide ``findAll('a')`` returning objects whose ``attrs``
    is a sequence of ``(name, value)`` pairs.

    Each record holds the keys ``href``, ``add_date``, ``private`` and
    ``tags``; ``tags`` is a list of the non-empty comma-separated tag names.
    Anchors without an ``href`` are skipped.  When the same ``href`` occurs
    more than once, the last occurrence wins.
    """
    bookmarks = dict()
    for anchor in soup.findAll('a'):
        atts = dict(anchor.attrs)
        href = atts.get('href')
        if not href:
            # Nothing to key the record on.
            continue
        # Do not pass the value through str(): a missing attribute would
        # become the literal tag 'None', and non-ASCII tags can fail to
        # encode on Python 2.
        raw_tags = atts.get('tags') or ''
        bookmarks[href] = dict(
            href=href,
            add_date=atts.get('add_date'),
            private=atts.get('private'),
            tags=[tag for tag in raw_tags.split(',') if tag],
        )
    return bookmarks


def main(argv=None):
    """Convert a Delicious HTML export into a JSON file beside it.

    *argv* defaults to ``sys.argv``.  If the target JSON file already
    exists, nothing is read or written and a notice is printed instead.
    """
    if argv is None:
        argv = sys.argv
    bkmarks = resolve_input_path(argv)
    out = output_path_for(bkmarks)
    print("Reading Delicious links from crappy HTML in %s..." % bkmarks)

    if os.path.exists(out):
        print("File already exists")
        return

    # The export is only read, so open it read-only.
    with open(bkmarks, 'r') as bkf:
        icio = BeautifulSoup(bkf.read())
    delicious = extract_bookmarks(icio)

    with open(out, 'w') as jsonf:
        json.dump(delicious, jsonf, skipkeys=True, indent=4)
    print("Delicious links dumped to a reasonable JSON structure in %s" % out)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment