Improve an exported Twitter archive
#! /usr/bin/env python
"""
This script will parse an unzipped Twitter archive export,
look for media links and download them locally, and replace
the links in the export to point to the local media copies.
It will also do the same for user avatars.
For more info visit: https://blog.vrypan.net/2015/12/24/how-to-archive-your-tweets/
"""
import urllib2
from urlparse import urlparse
import json
import os
import sys
import httplib
import sqlite3
import codecs
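# Resolved redirects are cached in a small SQLite database in the user's
# home directory, so interrupted or repeated runs don't re-resolve the
# same short links.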
db_conn = sqlite3.connect(
    os.path.join(os.path.expanduser('~'), 'expanded_urls.db'),
    isolation_level="IMMEDIATE"
)
db_conn.row_factory = sqlite3.Row
db_cur = db_conn.cursor()
db_cur.execute("""CREATE TABLE IF NOT EXISTS REDIRECTS (
    src TEXT unique not null,
    dst TEXT not null);
""")
def expand_url(url, depth=0):
    global db_cur
    print "Expanding %s..." % url
    if depth > 10:
        return unicode(url)
    parsed = urlparse(url)
    try:
        # use an HTTPS connection for https:// links so the HEAD request
        # reaches the right port
        if parsed.scheme == 'https':
            h = httplib.HTTPSConnection(parsed.netloc)
        else:
            h = httplib.HTTPConnection(parsed.netloc)
        h.request('HEAD', url)
        response = h.getresponse()
        print response.status, response.reason
        if response.status in range(300, 400) and response.getheader('Location'):
            return expand_url(response.getheader('Location'), depth + 1)
        else:
            return unicode(url)
    except Exception:
        print '** Failed to expand %s' % url
        return unicode(url)
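# Recursively walk the parsed tweet JSON. Wherever a 'urls' entity list is
# found, resolve each short link (from the cache, or with a live lookup),
# keep the original under 'original_url', and rewrite 'url',
# 'expanded_url' and 'display_url' to point at the resolved target.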
def _update_links_urls(i):
    global db_conn
    global db_cur
    if type(i) is dict:
        for k in i:
            if k == 'urls' and len(i[k]) > 0:
                urls = i[k]
                for u in urls:
                    if 'original_url' in u:
                        # already processed on a previous run
                        continue
                    db_cur.execute("SELECT * FROM REDIRECTS WHERE src=?", (u['url'],))
                    r = db_cur.fetchone()
                    if r:
                        x_url = r['dst']
                    else:
                        x_url = expand_url(u['url'])
                        db_cur.execute("INSERT INTO REDIRECTS VALUES(?,?)", (u['url'], x_url))
                        db_conn.commit()
                    u['original_url'] = u['url']
                    u['url'] = x_url
                    u['expanded_url'] = u['url']
                    parsed_url = urlparse(u['url'])
                    u['display_url'] = parsed_url.netloc + parsed_url.path
                    if parsed_url.query:
                        u['display_url'] += '?' + parsed_url.query
                    if len(u['display_url']) > 27:
                        u['display_url'] = u['display_url'][0:26] + '...'
            elif type(i[k]) is dict or type(i[k]) is tuple or type(i[k]) is list:
                i[k] = _update_links_urls(i[k])
    if type(i) is list or type(i) is tuple:
        i = [_update_links_urls(j) for j in i]
    return i
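# Recursively walk the parsed tweet JSON, download every media file and
# avatar into a local 'img' directory, and point the JSON at the local copy.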
def _update_media_urls(i):
    if type(i) is dict:
        for k in i:
            if k in ('media_url', 'media_url_https', 'profile_image_url_https'):
                url = i[k]
                local_file = os.path.join('img', urlparse(url).path.split('/')[-1])
                if not os.path.isfile(local_file):
                    try:
                        media_file = urllib2.urlopen(url)
                        # media files are binary: write raw bytes, not utf-8 text
                        output = open(local_file, 'wb')
                        output.write(media_file.read())
                        output.close()
                        i[k] = local_file
                    except Exception:
                        print '** Failed to download %s' % url
                else:
                    i[k] = local_file
            elif type(i[k]) is dict or type(i[k]) is tuple or type(i[k]) is list:
                i[k] = _update_media_urls(i[k])
    if type(i) is list or type(i) is tuple:
        i = [_update_media_urls(j) for j in i]
    return i
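# Each data/js/tweets/*.js file in the export is a JavaScript assignment:
# the first line (e.g. "Grailbird.data.tweets_2015_12 =") is kept aside,
# the remainder is parsed as JSON, and the assignment line is written back
# unchanged so the archive's HTML viewer keeps working.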
if len(sys.argv) == 1:
    print './improve_twitter_archive.py <path to unzipped Twitter export>'
else:
    path = sys.argv[1]
    if not os.path.isdir('img'):
        # downloads go to img/ under the current working directory
        os.makedirs('img')
    for data_file in os.listdir(os.path.join(path, 'data', 'js', 'tweets')):
        print '\nParsing %s.' % data_file
        data_raw = codecs.open(os.path.join(path, 'data', 'js', 'tweets', data_file), mode='r', encoding='utf-8').read().splitlines(True)
        if data_raw:
            data_js = json.loads('\n'.join(data_raw[1:]))
            data_js_updated = json.dumps(
                _update_media_urls(_update_links_urls(data_js)),
                indent=4, separators=(',', ': '))
            fp = codecs.open(os.path.join(path, 'data', 'js', 'tweets', data_file), encoding='utf-8', mode='w')
            fp.write(data_raw[0])
            fp.write(data_js_updated)
            fp.close()
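A minimal invocation sketch, assuming the export was unzipped to ~/twitter-archive (the path is illustrative): the script takes the export directory as its only argument and saves media into an img/ directory under the current working directory.

python improve_twitter_archive.py ~/twitter-archive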
In http://blog.vrypan.net/2015/12/24/how-to-archive-your-tweets/index.html you write "python ./download_twitter_media.py", but here the script is called improve_twitter_archive.py; also, you need to give it the path to your archive.