Created
January 16, 2017 14:52
-
-
Save d-schmidt/090f7f905bb75fbd5c8e5528ccf792ba to your computer and use it in GitHub Desktop.
Cleaning the Chrome browser history with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Clean entries out of a Chrome 'History' SQLite database.

The script walks the ``urls`` table in batches, counts how many rows belong
to each http(s) domain, deletes the rows whose domain matches
``should_clean`` and finally pretty-prints the per-domain counts.

NOTE(review): the browser presumably has to be closed while this runs,
otherwise the database file may be locked -- confirm before relying on it.
NOTE(review): only rows of the ``urls`` table are deleted; whether other
tables keep references to the deleted ids is not handled here -- verify
against the actual database schema.
"""
import pprint
import re
import sqlite3
from contextlib import closing
from typing import Callable, Dict

# find your 'History' file
HISTORY_PATH = 'c:/Users/username/AppData/Local/Google/Chrome/User Data/Default/History'

# Captures everything between "http(s)://" and the next "/".
DOMAIN_PATTERN = re.compile(r"https?://([^/]+)/")

# Number of rows fetched (and, at most, deleted) per round trip.
BATCH_SIZE = 1000


def should_clean(domain: str) -> bool:
    """Return True when rows of *domain* should be deleted.

    Edit this predicate to choose what gets cleaned.
    """
    return "imgur" in domain


def clean_history(
    conn: sqlite3.Connection,
    should_delete: Callable[[str], bool] = should_clean,
    batch_size: int = BATCH_SIZE,
) -> Dict[str, int]:
    """Count URLs per domain and delete the rows selected by *should_delete*.

    Args:
        conn: Open connection to a database containing a ``urls`` table
            with at least the columns ``id`` and ``url``.
        should_delete: Predicate called with the captured domain of each
            matching URL; rows for which it returns true are deleted.
        batch_size: Maximum number of rows read per query.

    Returns:
        A dict mapping each domain seen to the number of rows it occurred
        in (rows that end up deleted are counted as well). URLs that do
        not match ``DOMAIN_PATTERN`` are neither counted nor deleted.
    """
    domains: Dict[str, int] = {}
    cursor = conn.cursor()
    last_id = 0
    while True:
        # ORDER BY is required for keyset pagination: without it the rows
        # of a batch may come back in any order, and continuing from the
        # last id seen could then skip rows.
        rows = cursor.execute(
            'SELECT id, url FROM urls WHERE id > ? ORDER BY id LIMIT ?',
            (last_id, batch_size),
        ).fetchall()
        if not rows:
            break
        last_id = rows[-1][0]

        doomed = []
        for url_id, url in rows:
            match = DOMAIN_PATTERN.search(url)
            if match is None:
                continue
            domain = match.group(1)
            domains[domain] = domains.get(domain, 0) + 1
            # clean if this is true
            if should_delete(domain):
                doomed.append((url_id,))

        if doomed:
            cursor.executemany('DELETE FROM urls WHERE id=?', doomed)
            # Commit per batch so progress survives a failure in a later one.
            conn.commit()
    return domains


def main() -> None:
    """Clean the history file at ``HISTORY_PATH`` and print domain counts."""
    # closing() guarantees the connection is released even if a query fails.
    with closing(sqlite3.connect(HISTORY_PATH)) as conn:
        total = conn.execute('SELECT count(1) FROM urls').fetchone()[0]
        print("history length", total)
        domains = clean_history(conn)
    pprint.pprint(domains)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment