Skip to content

Instantly share code, notes, and snippets.

@kemitche
Created October 15, 2012 18:48
Show Gist options
  • Save kemitche/3894322 to your computer and use it in GitHub Desktop.
Save kemitche/3894322 to your computer and use it in GitHub Desktop.
Transifex Timeline scraper
#!/usr/bin/python
import collections
import os
import sys
import time
from BeautifulSoup import BeautifulSoup
import requests
# Base URL of the Transifex site that is scraped.
TX_SITE = 'https://www.transifex.com'
# Path template of a project page; formatted with a 'project' key.
PROJECT_PATH = 'projects/p/%(project)s'
# Last path segment of a project's timeline page.
TIMELINE = 'timeline'
# User-Agent header sent with every request; it announces that this is a
# scraper and gives a contact address.
# NOTE(review): the address below reads '[email protected]', which
# looks like a placeholder left by e-mail obfuscation -- confirm the real
# contact address before running this against the live site.
USERAGENT = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0; scraping '
'contributors for trophies; email [email protected] with '
'concerns) Gecko/20100101 Firefox/16.0')
# When true, get_timeline_page prints each URL before fetching it.
VERBOSE = True
# One parsed timeline row: the event kind, the language it concerns, the
# user who triggered it and the timestamp text.
Event = collections.namedtuple('Event', 'kind lang user when')
def dump_events(outfile, start_at=1, end_at=None):
    """Write the repr of every scraped timeline event to *outfile*.

    The file is opened in write mode (truncating any existing content) and
    receives one ``repr(event)`` per line.

    Args:
        outfile: Path of the file to write.
        start_at: First timeline page to fetch, passed to iter_timeline.
        end_at: Last timeline page to fetch, passed to iter_timeline;
            ``None`` leaves the stop condition to iter_timeline.
    """
    with open(outfile, 'w') as sink:
        events = iter_timeline(start_at=start_at, end_at=end_at)
        for record in events:
            sink.write(repr(record) + '\n')
def get_cookie():
    """Build the cookie header dict from the TXCOOKIE environment variable.

    Returns:
        A dict with the single key ``'cookie'`` whose value is the content
        of ``TXCOOKIE``.

    Raises:
        KeyError: If ``TXCOOKIE`` is not set in the environment.
    """
    session_cookie = os.environ['TXCOOKIE']
    return {'cookie': session_cookie}
def get_timeline_page(project='reddit', pagenum=1, timeout=30):
    """Fetch one page of a project's timeline and return it parsed.

    Args:
        project: Project slug substituted into PROJECT_PATH.
        pagenum: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the scraper
            indefinitely.

    Returns:
        A BeautifulSoup document built from the response body.

    Raises:
        StandardError: If the response status code is not 200; the
            response object is attached as the second argument.
        KeyError: Propagated from get_cookie() when TXCOOKIE is unset.
    """
    path = '/'.join([TX_SITE, PROJECT_PATH, TIMELINE]) % {'project': project}
    params = {'page': pagenum}
    if VERBOSE:
        # Parenthesised single-argument form prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("Getting: %s?page=%s" % (path, pagenum))
    headers = {'User-Agent': USERAGENT}
    headers.update(get_cookie())
    response = requests.get(path, params=params, headers=headers,
                            timeout=timeout)
    if response.status_code != 200:
        raise StandardError('Something went wrong', response)
    return BeautifulSoup(response.content)
def iter_timeline(start_at=1, end_at=None, sleep=2):
    """Yield Event tuples from consecutive timeline pages.

    Pages are fetched with get_timeline_page, starting at *start_at*.
    Iteration stops when a page has no ``tbody`` element or after page
    *end_at* has been processed.

    Args:
        start_at: Number of the first page to fetch.
        end_at: Number of the last page to fetch, or ``None`` to continue
            until a page without a ``tbody`` is reached.
        sleep: Seconds to pause between page requests.

    Yields:
        Each event produced by iter_table for every fetched page.
    """
    page = start_at
    while True:
        soup = get_timeline_page(pagenum=page)
        table = soup.find('tbody')
        if not table:
            break
        # Track the last event of *this* page only. Relying on the loop
        # variable after the loop raised NameError when the first page had
        # no rows and reported a stale event from an earlier page otherwise.
        latest = None
        for latest in iter_table(table):
            yield latest
        if latest is not None:
            print("Latest item: %r" % (latest,))
        if end_at is not None and page >= end_at:
            break
        page += 1
        time.sleep(sleep)
def iter_table(table):
    """Yield one parsed event per ``tr`` element found in *table*.

    Each row is passed to decompose_row; results that evaluate as false
    are skipped.
    """
    for tr in table.findAll('tr'):
        parsed = decompose_row(tr)
        if not parsed:
            continue
        yield parsed
def decompose_row(row):
    """Convert one timeline table row into an Event tuple.

    The individual fields are extracted by get_type, get_user, get_when
    and get_lang, called in that order.
    """
    # Keyword arguments are evaluated left to right, so the helpers run in
    # the order type, user, when, lang.
    return Event(
        kind=get_type(row),
        user=get_user(row),
        when=get_when(row),
        lang=get_lang(row),
    )
def get_type(row):
    """Return the ``title`` attribute of the span in the row's first cell."""
    first_cell = row.findAll('td')[0]
    marker = first_cell.find('span')
    attributes = _attrs(marker)
    return attributes['title']
def get_user(row):
    """Return the stripped text of the row's second cell (the user).

    Raises:
        AssertionError: If the second cell's ``class`` attribute does not
            contain ``timelineuser``, i.e. the row does not have the
            expected layout.
    """
    td = row.findAll('td')[1]
    # Explicit raise instead of an assert statement: the check validates
    # scraped HTML and must not disappear when Python runs with -O.
    if 'timelineuser' not in _attrs(td).get('class', ''):
        raise AssertionError(
            'expected a "timelineuser" cell, got %r' % (td,))
    return td.text.strip()
def get_when(row):
    """Return the text of the row's third cell (the timestamp), unstripped.

    Raises:
        AssertionError: If the third cell's ``class`` attribute does not
            contain ``timelinewhen``, i.e. the row does not have the
            expected layout.
    """
    td = row.findAll('td')[2]
    # Explicit raise instead of an assert statement: the check validates
    # scraped HTML and must not disappear when Python runs with -O.
    if 'timelinewhen' not in _attrs(td).get('class', ''):
        raise AssertionError(
            'expected a "timelinewhen" cell, got %r' % (td,))
    return td.text
def get_lang(row):
    """Extract the language name from the row's fourth cell.

    Two layouts are recognised:

    * Text starting with ``'A translation for'``: the language is every
      word between that prefix and the first word ``'was'``.
    * Otherwise: the text of the first link whose ``href`` contains
      ``'/language/'``, with a trailing ``' language translation'``
      removed when present.

    Returns:
        The language name, or ``None`` when neither layout matches.

    Raises:
        ValueError: If the text starts with ``'A translation for'`` but
            contains no word ``'was'``.
    """
    prefix = 'A translation for'
    suffix = ' language translation'

    td = row.findAll('td')[3]
    text = td.text.strip()
    if text.startswith(prefix):
        words = text.split()
        start = len(prefix.split())
        end = words.index('was')
        return u' '.join(words[start:end])

    for anchor in td.findAll('a'):
        # Anchors without an href cannot be language links; skip them
        # instead of failing with KeyError.
        if '/language/' not in _attrs(anchor).get('href', ''):
            continue
        lang = anchor.text
        # Only drop the suffix when it is really there; an unconditional
        # slice would cut off part (or all) of a name that lacks it.
        if lang.endswith(suffix):
            lang = lang[:-len(suffix)]
        return lang
    return None
def _attrs(soup):
return dict(soup.attrs)
def main(args):
    """Command-line entry point: ``PROG --page LAST_PAGE OUTFILE``.

    Scrapes timeline pages from page 1 up to and including LAST_PAGE and
    writes the events to OUTFILE via dump_events.

    Args:
        args: The full argument vector, including the program name at
            index 0 (i.e. ``sys.argv``).

    Raises:
        SystemExit: With a usage message when the arguments are missing or
            malformed.
    """
    usage = 'usage: %s --page LAST_PAGE OUTFILE' % (
        args[0] if args else 'scraper')
    # Validate explicitly rather than with assert statements, which are
    # removed under -O and give no hint about the expected invocation.
    # Extra trailing arguments are tolerated, as before.
    if len(args) < 4 or args[1] != '--page' or not args[3]:
        sys.exit(usage)
    try:
        page = int(args[2])
    except ValueError:
        sys.exit(usage)
    outfile = args[3]
    dump_events(outfile, end_at=page)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment