Created
October 17, 2015 13:14
-
-
Save 0x5742/1315a0431e032d1236ef to your computer and use it in GitHub Desktop.
Scrape Twitter's API documentation and pound it into SQLite CREATE TABLE statements
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # Scrape Twitter's API documentation and pound it into SQLite CREATE TABLE statements. | |
| # | |
| # Give this URLs to the documentation pages you're interested in, e.g.: | |
| # $ python twitter-api-doc-to-sql.py https://dev.twitter.com/overview/api/tweets > tweets.sql | |
| # | |
| # It's always a good idea to review the output before making use of it. | |
| import re, sys, urllib.request, lxml.html, posixpath | |
| def wrap(text, width, before): | |
| return re.sub(r'^', before, | |
| re.sub(r'(?=.{%d})(.{1,%d})\s+' % (width, width), r'\1\n', text), | |
| flags=re.MULTILINE).rstrip() | |
| # SQL types corresponding to the value in the data type field. | |
| # None indicates to skip the field. | |
| # Values not defined here will cause an error. | |
| typemap = { | |
| 'Boolean': 'BOOLEAN', # not a "real" type but works in sqlite anyway | |
| 'String': 'TEXT', | |
| 'Int': 'INTEGER', | |
| 'Integer': 'INTEGER', | |
| 'Int64': 'INTEGER', | |
| 'Collection of Contributors': None, | |
| 'Array of String': None, | |
| 'Coordinates': None, | |
| # Storing individual entities in the database is just kind of overkill | |
| # since they never travel without the thing they're attached to | |
| 'Entities': 'JSON', | |
| 'Object': None, | |
| 'Places': None, | |
| 'Type': None, | |
| #'Tweet': None, | |
| 'Tweet': ('retweeted_status_id_str', 'TEXT'), # Original tweet this is a retweet of | |
| 'Tweets': None, | |
| 'User': ('user_id_str', 'TEXT'), | |
| 'Users': ('user_id_str', 'TEXT'), | |
| } | |
| # Skip this field if its name is in this set. | |
| # ('id' is skipped because 'id_str' is preferred) | |
| skip_fieldnames = re.compile(r'^(?:id|.*_id|truncated)$') | |
| # Skip this field if the description matches this pattern | |
| skip_keywords = re.compile(r'\b(?:Unused|Deprecated|Perspectival)\.') | |
| # Add these fields to each table. | |
| extra_fields = ['_updated INTEGER'] | |
| def generate_sql(url, output): | |
| f = urllib.request.urlopen(url) | |
| tablename = posixpath.basename(url) | |
| data = f.read().decode('utf8', 'replace') | |
| doc = lxml.html.document_fromstring(data) | |
| rows = doc.xpath("""descendant-or-self::table[contains(concat(' ', normalize-space(@class), ' '), | |
| ' platform-object ')][1]/descendant::tr""") | |
| last_updated = doc.xpath("""descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), | |
| ' doc-updated ')]""") | |
| output.write('-- Generated from ' + url + '\n') | |
| if last_updated: | |
| output.write('-- ' + last_updated[0].text_content().strip() + '\n') | |
| output.write('CREATE TABLE ' + tablename + ' (\n') | |
| found_id = False | |
| for row in rows[1:]: | |
| field, datatype, desc = [cell.text_content().strip() for cell in row] | |
| if field == 'id_str': | |
| found_id = True | |
| sqltype = typemap[datatype] # might cause an error if not defined. | |
| sqldesc = wrap(desc, 72, ' --- ') | |
| output.write(sqldesc + '\n') | |
| if type(sqltype) is tuple: | |
| field, sqltype = sqltype | |
| if sqltype is None or skip_fieldnames.search(field) or skip_keywords.search(desc): | |
| output.write(' --') | |
| sqltype = sqltype or datatype | |
| else: | |
| output.write(' ') | |
| output.write(field + ' ' + sqltype + ',\n\n') | |
| if not found_id: | |
| sys.stderr.write('warning: table %r has no id_str field' % tablename) | |
| if extra_fields: | |
| for field in extra_fields: | |
| output.write(' ' + field + ',\n') | |
| output.write('\n') | |
| output.write(' PRIMARY KEY (id_str)\n);\n\n') | |
| for arg in sys.argv[1:]: | |
| generate_sql(arg, sys.stdout) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment