Archive a Twitter search result using webrecorder.io, download the WARC file, and then extract the tweet identifiers with this script.
#!/usr/bin/env python
"""
If you use webrecorder.io to archive a Twitter search result and then download
the WARC file, you can use this script to read through the WARC file looking
for tweet identifiers to hydrate.

You'll need Python 2 and the [warc] module for the script to work. After that
you should be able to run:

    ./tweet-ids.py file.warc.gz > ids.txt

[warc]: http://warc.readthedocs.io/en/latest/
"""
import re
import sys
import warc
import gzip
import json

from StringIO import StringIO
from httplib import HTTPResponse
if len(sys.argv) != 2:
    sys.stderr.write("usage: tweet-ids.py [warc-file]\n")
    sys.exit(1)

warc_file = sys.argv[1]
class FakeSocket(object):
    """Fake socket class so we can parse an HTTP response with httplib
    using a string read from the WARC record.
    """

    def __init__(self, response_str):
        self._file = StringIO(response_str)

    def makefile(self, *args, **kwargs):
        # HTTPResponse only uses the socket's makefile() to get a
        # file-like object to read from, so hand back our buffer
        return self._file
# read through the WARC records looking for text/javascript HTTP responses
# and parse them for the tweet identifiers

for record in warc.open(warc_file):
    if record.type == "response":
        # parse the HTTP response (begin() reads the status line and headers)
        resp = HTTPResponse(FakeSocket(record.payload.read()))
        resp.begin()

        # only interested in these API calls
        if resp.getheader('content-type') != 'text/javascript; charset=utf-8':
            continue

        # the content is gzip encoded
        fh = gzip.GzipFile(fileobj=StringIO(resp.read()))
        content = fh.read()

        # sometimes there's no content, which is weird
        if not content:
            continue

        # parse the json
        data = json.loads(content)

        # only interested in items_html content that has the tweet ids
        if 'items_html' not in data:
            continue

        # print out all the tweet ids present in the text
        for m in re.finditer(r'data-tweet-id="(\d+)"', data['items_html']):
            print m.group(1)
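
Note that this is a Python 2 script: it relies on httplib, StringIO, and the print statement, and the warc module it uses predates Python 3. A rough sketch of the same loop in Python 3 might use the warcio library instead; warcio is an assumption here, not something the original gist uses, so treat this as a starting point rather than a drop-in replacement:

#!/usr/bin/env python3
# A rough Python 3 sketch of the same loop, assuming the warcio
# library (pip install warcio) in place of warc/httplib.

import re
import sys
import json

from warcio.archiveiterator import ArchiveIterator

with open(sys.argv[1], 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type != 'response':
            continue
        # only interested in the Twitter timeline API responses
        if record.http_headers.get_header('Content-Type') != 'text/javascript; charset=utf-8':
            continue
        # content_stream() takes care of decoding the response body
        content = record.content_stream().read()
        if not content:
            continue
        data = json.loads(content)
        if 'items_html' not in data:
            continue
        for m in re.finditer(r'data-tweet-id="(\d+)"', data['items_html']):
            print(m.group(1))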
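
Once you have ids.txt, the identifiers still need to be hydrated back into full tweets. One possible way to do that (again an assumption, not part of the gist) is with the twarc library; the credential values are placeholders for your own Twitter API keys:

import json

from twarc import Twarc

# placeholders: fill in your own Twitter API credentials
consumer_key = '...'
consumer_secret = '...'
access_token = '...'
access_token_secret = '...'

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# hydrate() takes an iterable of tweet ids and yields the full tweet JSON
with open('ids.txt') as ids:
    for tweet in t.hydrate(ids):
        print(json.dumps(tweet))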