Created
November 9, 2013 04:48
-
-
Save abegong/7381803 to your computer and use it in GitHub Desktop.
Parse chrome history and extract recent google searches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
import pandas | |
import re | |
#cp ~/Library/Application\ Support/Google/Chrome/Default/History ./History | |
sql = sqlite3.connect("History") | |
c = sql.cursor() | |
#List table creation commands to introspect schemas | |
print c.execute("select * from sqlite_master").fetchall() | |
#Most of the action seems to be in the urls and visits tables | |
print c.execute("select * from urls limit 20").fetchall() | |
print c.execute("select * from visits limit 20").fetchall() | |
#Deeper inspection of the urls table | |
print c.execute("select url, last_visit_time FROM urls LIMIT 20").fetchall() | |
#Extract all the pages I've visited since yesterday | |
X = c.execute("select url, DATETIME(last_visit_time/1000000-11644473600,'unixepoch','localtime') FROM urls WHERE DATETIME(last_visit_time/1000000-11644473600,'unixepoch','localtime')>DATETIME('2013-11-07')").fetchall() | |
D = pandas.DataFrame(X) | |
D.columns = ['url', 'last_visit_time'] | |
#Limit the search to just google (image?) searches | |
D2 = D[D.url.apply(lambda x: 'https://www.google.com/search?' in x)] | |
#Pull out the query term alone | |
D2['query'] = D2.url.apply(lambda x: re.findall('&q=(.*?)&', x)) | |
#Find all the urls that have a query, and pull them out as a set | |
queries_since_yesterday = set([l[0] for l in list(D2.query) if len(l) > 0]) | |
#Voila | |
print queries_since_yesterday |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment