
@vrypan
Last active December 24, 2015 16:47
improve twitter exported archive
#! /usr/bin/env python
"""
This script will parse an unzipped Twitter archive export,
look for media links and download them locally, and replace
the links in the export so they point to the local media copies.
It will also do the same for user avatars.
For more info visit: https://blog.vrypan.net/2015/12/24/how-to-archive-your-tweets/
"""
import urllib2
from urlparse import urlparse
import json
import os
import sys
import httplib
import sqlite3
import codecs

# Expanded URLs are cached in a small SQLite database in the home directory,
# so re-running the script does not repeat the HTTP lookups.
db_conn = sqlite3.connect(
    os.path.join(os.path.expanduser('~'), 'expanded_urls.db'),
    isolation_level="IMMEDIATE"
)
db_conn.row_factory = sqlite3.Row
db_cur = db_conn.cursor()
db_cur.execute("""CREATE TABLE IF NOT EXISTS REDIRECTS (
    src TEXT unique not null,
    dst TEXT not null);
""")
def expand_url(url, depth=0):
    # Follow HTTP redirects with HEAD requests, at most 10 levels deep.
    global db_cur
    print "Expanding %s..." % url
    if depth > 10:
        return unicode(url)
    parsed = urlparse(url)
    try:
        # Use an HTTPS connection when the URL scheme calls for it.
        if parsed.scheme == 'https':
            h = httplib.HTTPSConnection(parsed.netloc)
        else:
            h = httplib.HTTPConnection(parsed.netloc)
        h.request('HEAD', url)
        response = h.getresponse()
        print response.status, response.reason
        if response.status in range(300, 400) and response.getheader('Location'):
            return expand_url(response.getheader('Location'), depth + 1)
        else:
            return unicode(url)
    except Exception:
        print '** Failed to expand %s' % url
        return unicode(url)
def _update_links_urls(i):
    # Recursively walk the tweet structure and replace every shortened URL
    # in an 'urls' entity with its fully expanded target.
    global db_conn
    global db_cur
    if type(i) is dict:
        for k in i:
            if k == 'urls' and len(i[k]) > 0:
                urls = i[k]
                for u in urls:
                    if 'original_url' in u:
                        # Already expanded on a previous run.
                        continue
                    db_cur.execute("SELECT * FROM REDIRECTS WHERE src=?", (u['url'],))
                    r = db_cur.fetchone()
                    if r:
                        x_url = r['dst']
                    else:
                        x_url = expand_url(u['url'])
                        db_cur.execute("INSERT INTO REDIRECTS VALUES(?,?)", (u['url'], x_url))
                        db_conn.commit()
                    u['original_url'] = u['url']
                    u['url'] = x_url
                    u['expanded_url'] = u['url']
                    parsed_url = urlparse(u['url'])
                    u['display_url'] = parsed_url.netloc + parsed_url.path
                    if parsed_url.query:
                        u['display_url'] += '?' + parsed_url.query
                    if len(u['display_url']) > 27:
                        u['display_url'] = u['display_url'][0:26] + '...'
            else:
                if type(i[k]) is dict or type(i[k]) is tuple or type(i[k]) is list:
                    i[k] = _update_links_urls(i[k])
    if type(i) is list or type(i) is tuple:
        i = [_update_links_urls(j) for j in i]
    return i
def _update_media_urls(i):
    # Recursively download tweet media and avatar images and rewrite their
    # URLs to point at the local copies under ./img.
    if type(i) is dict:
        for k in i:
            if k in ('media_url', 'media_url_https', 'profile_image_url_https'):
                url = i[k]
                local_file = os.path.join('img', urlparse(url).path.split('/')[-1])
                if not os.path.isfile(local_file):
                    try:
                        media_file = urllib2.urlopen(url)
                        # Media files are binary, so write raw bytes.
                        output = open(local_file, 'wb')
                        output.write(media_file.read())
                        output.close()
                        i[k] = local_file
                    except Exception:
                        print '** Failed to download %s' % url
                else:
                    i[k] = local_file
            else:
                if type(i[k]) is dict or type(i[k]) is tuple or type(i[k]) is list:
                    i[k] = _update_media_urls(i[k])
    if type(i) is list or type(i) is tuple:
        i = [_update_media_urls(j) for j in i]
    return i
if len(sys.argv) == 1:
    print './improve_twitter_archive.py <path to unzipped Twitter export>'
else:
    path = sys.argv[1]
    # Downloaded media is stored in ./img (relative to the current directory).
    if not os.path.isdir('img'):
        os.makedirs('img')
    for data_file in os.listdir(os.path.join(path, 'data', 'js', 'tweets')):
        print '\nParsing %s.' % data_file
        data_raw = codecs.open(
            os.path.join(path, 'data', 'js', 'tweets', data_file),
            mode='r', encoding='utf-8').read().splitlines(True)
        if data_raw:
            # The first line of each monthly file is a JavaScript assignment;
            # the rest of the file is plain JSON.
            data_js = json.loads('\n'.join(data_raw[1:]))
            data_js_updated = json.dumps(
                _update_media_urls(_update_links_urls(data_js)),
                indent=4, separators=(',', ': '))
            fp = codecs.open(
                os.path.join(path, 'data', 'js', 'tweets', data_file),
                encoding='utf-8', mode='w')
            fp.write(data_raw[0])
            fp.write(data_js_updated)
            fp.close()
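
A quick way to sanity-check the result is to re-read the rewritten tweet files and count how many URLs and media links were actually rewritten. The sketch below is not part of the gist; it assumes the same file layout (data/js/tweets), the original_url marker and the img/ prefix that the script above writes, and that URL and media entries live under each tweet's entities key.

#! /usr/bin/env python
# check_archive.py (hypothetical helper, not part of the original gist)
import codecs
import json
import os
import sys

def archive_stats(path):
    expanded, local_media = 0, 0
    tweets_dir = os.path.join(path, 'data', 'js', 'tweets')
    for name in os.listdir(tweets_dir):
        raw = codecs.open(os.path.join(tweets_dir, name),
                          mode='r', encoding='utf-8').read().splitlines(True)
        # Skip the JavaScript assignment on the first line, as the main script does.
        tweets = json.loads('\n'.join(raw[1:]))
        for t in tweets:
            entities = t.get('entities', {})
            for u in entities.get('urls', []):
                if 'original_url' in u:
                    expanded += 1
            for m in entities.get('media', []):
                if m.get('media_url', '').startswith('img/'):
                    local_media += 1
    print 'Expanded URLs: %d' % expanded
    print 'Media links pointing to local files: %d' % local_media

if __name__ == '__main__':
    archive_stats(sys.argv[1])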
@Fil
Fil commented Dec 24, 2015

In http://blog.vrypan.net/2015/12/24/how-to-archive-your-tweets/index.html you write python ./download_twitter_media.py, but here the script is called improve_twitter_archive.py; also, you need to give it the path to your archive.
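
For reference, assuming the gist is saved as improve_twitter_archive.py and the export was unzipped into a directory such as ./twitter-archive (a hypothetical path; use wherever your unzipped export lives), the invocation the comment describes would look like:

python ./improve_twitter_archive.py ./twitter-archive

The argument is the folder that contains data/js/tweets.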
