Skip to content

Instantly share code, notes, and snippets.

@0x5742
Created October 17, 2015 13:14
Show Gist options
  • Select an option

  • Save 0x5742/1315a0431e032d1236ef to your computer and use it in GitHub Desktop.

Select an option

Save 0x5742/1315a0431e032d1236ef to your computer and use it in GitHub Desktop.
Scrape Twitter's API documentation and pound it into SQLite CREATE TABLE statements
#!/usr/bin/env python
# Scrape Twitter's API documentation and pound it into SQLite CREATE TABLE statements.
#
# Give this URLs to the documentation pages you're interested in, e.g.:
# $ python twitter-api-doc-to-sql.py https://dev.twitter.com/overview/api/tweets > tweets.sql
#
# It's always a good idea to review the output before making use of it.
import re, sys, urllib.request, lxml.html, posixpath
def wrap(text, width, before):
return re.sub(r'^', before,
re.sub(r'(?=.{%d})(.{1,%d})\s+' % (width, width), r'\1\n', text),
flags=re.MULTILINE).rstrip()
# SQL types corresponding to the value in the data type field.
# None indicates to skip the field.
# Values not defined here will cause an error.
typemap = {
'Boolean': 'BOOLEAN', # not a "real" type but works in sqlite anyway
'String': 'TEXT',
'Int': 'INTEGER',
'Integer': 'INTEGER',
'Int64': 'INTEGER',
'Collection of Contributors': None,
'Array of String': None,
'Coordinates': None,
# Storing individual entities in the database is just kind of overkill
# since they never travel without the thing they're attached to
'Entities': 'JSON',
'Object': None,
'Places': None,
'Type': None,
#'Tweet': None,
'Tweet': ('retweeted_status_id_str', 'TEXT'), # Original tweet this is a retweet of
'Tweets': None,
'User': ('user_id_str', 'TEXT'),
'Users': ('user_id_str', 'TEXT'),
}
# Skip this field if its name is in this set.
# ('id' is skipped because 'id_str' is preferred)
skip_fieldnames = re.compile(r'^(?:id|.*_id|truncated)$')
# Skip this field if the description matches this pattern
skip_keywords = re.compile(r'\b(?:Unused|Deprecated|Perspectival)\.')
# Add these fields to each table.
extra_fields = ['_updated INTEGER']
def generate_sql(url, output):
f = urllib.request.urlopen(url)
tablename = posixpath.basename(url)
data = f.read().decode('utf8', 'replace')
doc = lxml.html.document_fromstring(data)
rows = doc.xpath("""descendant-or-self::table[contains(concat(' ', normalize-space(@class), ' '),
' platform-object ')][1]/descendant::tr""")
last_updated = doc.xpath("""descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '),
' doc-updated ')]""")
output.write('-- Generated from ' + url + '\n')
if last_updated:
output.write('-- ' + last_updated[0].text_content().strip() + '\n')
output.write('CREATE TABLE ' + tablename + ' (\n')
found_id = False
for row in rows[1:]:
field, datatype, desc = [cell.text_content().strip() for cell in row]
if field == 'id_str':
found_id = True
sqltype = typemap[datatype] # might cause an error if not defined.
sqldesc = wrap(desc, 72, ' --- ')
output.write(sqldesc + '\n')
if type(sqltype) is tuple:
field, sqltype = sqltype
if sqltype is None or skip_fieldnames.search(field) or skip_keywords.search(desc):
output.write(' --')
sqltype = sqltype or datatype
else:
output.write(' ')
output.write(field + ' ' + sqltype + ',\n\n')
if not found_id:
sys.stderr.write('warning: table %r has no id_str field' % tablename)
if extra_fields:
for field in extra_fields:
output.write(' ' + field + ',\n')
output.write('\n')
output.write(' PRIMARY KEY (id_str)\n);\n\n')
for arg in sys.argv[1:]:
generate_sql(arg, sys.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment