Skip to content

Instantly share code, notes, and snippets.

@abegong
Created November 9, 2013 04:48
Show Gist options
  • Save abegong/7381803 to your computer and use it in GitHub Desktop.
Save abegong/7381803 to your computer and use it in GitHub Desktop.
Parse chrome history and extract recent google searches
import sqlite3
import pandas
import re
#cp ~/Library/Application\ Support/Google/Chrome/Default/History ./History
sql = sqlite3.connect("History")
c = sql.cursor()
#List table creation commands to introspect schemas
print c.execute("select * from sqlite_master").fetchall()
#Most of the action seems to be in the urls and visits tables
print c.execute("select * from urls limit 20").fetchall()
print c.execute("select * from visits limit 20").fetchall()
#Deeper inspection of the urls table
print c.execute("select url, last_visit_time FROM urls LIMIT 20").fetchall()
#Extract all the pages I've visited since yesterday
X = c.execute("select url, DATETIME(last_visit_time/1000000-11644473600,'unixepoch','localtime') FROM urls WHERE DATETIME(last_visit_time/1000000-11644473600,'unixepoch','localtime')>DATETIME('2013-11-07')").fetchall()
D = pandas.DataFrame(X)
D.columns = ['url', 'last_visit_time']
#Limit the search to just google (image?) searches
D2 = D[D.url.apply(lambda x: 'https://www.google.com/search?' in x)]
#Pull out the query term alone
D2['query'] = D2.url.apply(lambda x: re.findall('&q=(.*?)&', x))
#Find all the urls that have a query, and pull them out as a set
queries_since_yesterday = set([l[0] for l in list(D2.query) if len(l) > 0])
#Voila
print queries_since_yesterday
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment