Skip to content

Instantly share code, notes, and snippets.

@jschairb
Created November 9, 2017 15:39
Show Gist options
  • Save jschairb/863ebacc28ba52ade8295564142c1506 to your computer and use it in GitHub Desktop.
Save jschairb/863ebacc28ba52ade8295564142c1506 to your computer and use it in GitHub Desktop.
Chrome tabs to JSON structure
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import hashlib
import json
import re
import subprocess
import sys
from time import gmtime, strftime
import lxml
from lxml.html.clean import Cleaner
# Matches the bracketed id prefix at the start of a `chrome-cli list tabs` line
# and captures the tab id (group 1). A raw string is used so the regex escapes
# are not interpreted as (invalid) string escapes.
# NOTE(review): the "<window id>:" part is optional because chrome-cli appears
# to print just "[<tab id>]" when a single window is open - confirm against the
# installed chrome-cli version.
TAB_ID_REGEX = re.compile(r'\[(?:[0-9]+:)?([0-9]+)\]')
def run_command(command):
    """Run *command* and return its standard output as text.

    Args:
        command: The program and its arguments as a list; it is handed to
            ``subprocess.check_output`` without going through a shell.

    Returns:
        The captured standard output with leading and trailing newline
        characters removed.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
        OSError: If the program cannot be started.
    """
    output = subprocess.check_output(command)
    if not isinstance(output, str):
        # Python 3 returns bytes here, and bytes.strip('\n') raises TypeError,
        # so decode first. On Python 2 the output already is a str and is
        # left untouched.
        output = output.decode('utf-8')
    return output.strip('\n')
# Locate the chrome-cli executable up front; the rest of the script shells out
# to it, so there is no point continuing without it.
try:
    CHROME_CLI = run_command(['which', 'chrome-cli'])
except (subprocess.CalledProcessError, OSError):
    # CalledProcessError: `which` exited non-zero (chrome-cli not on PATH).
    # OSError: `which` itself could not be started.
    # Anything else (e.g. Ctrl-C or a programming error) is deliberately left
    # to propagate instead of being reported as a missing install.
    print("No exec for chrome-cli was found. Please install.")
    sys.exit(1)
class Chrome(object):
    """Thin wrapper that shells out to the ``chrome-cli`` executable."""

    def __init__(self, bin):
        """Remember which executable to invoke.

        Args:
            bin: Path to (or name of) the chrome-cli executable.
        """
        self.bin = bin

    def list(self, element='tabs'):
        """Return the raw text printed by ``chrome-cli list <element>``."""
        return self._run(['list', element])

    def tab_ids(self):
        """Return the ids of all listed tabs, as strings.

        Listing lines that ``TAB_ID_REGEX`` does not match at their start are
        skipped.
        """
        matches = (TAB_ID_REGEX.match(line) for line in self.tabs_list())
        return [match.group(1) for match in matches if match is not None]

    def tab_info(self, tab_id):
        """Return the raw text printed by ``chrome-cli info -t <tab_id>``.

        Args:
            tab_id: The tab id as a string (it becomes a command argument).
        """
        return self._run(['info', '-t', tab_id])

    def tabs_list(self):
        """Return the tab listing as a list of lines (empty when no output)."""
        listing = self.list('tabs')
        # ''.split('\n') yields [''], which would look like one (blank) tab.
        return listing.split('\n') if listing else []

    def _run(self, command):
        """Run chrome-cli with the given argument list and return its output.

        Args:
            command: List of arguments appended after the executable.

        Returns:
            Standard output as text, without leading/trailing newlines.

        Raises:
            subprocess.CalledProcessError: If chrome-cli exits non-zero.
            OSError: If the executable cannot be started.
        """
        output = subprocess.check_output([self.bin] + command)
        if not isinstance(output, str):
            # Python 3 returns bytes; decode so .strip('\n') and the callers'
            # str operations work. Python 2 output is already str.
            output = output.decode('utf-8')
        return output.strip('\n')
class TabInfo(object):
    """Accessors for the fields printed by ``chrome-cli info`` for one tab.

    Every accessor re-queries chrome-cli through ``data()``; nothing is cached.
    """

    def __init__(self, tab_id, chrome):
        """Store the tab id and the chrome wrapper used to query it.

        Args:
            tab_id: Id of the tab to inspect.
            chrome: Object exposing ``tab_info(tab_id)`` (e.g. ``Chrome``).
        """
        self.chrome = chrome
        self.tab_id = tab_id

    def data(self):
        """Return the raw info text for this tab."""
        # Use the wrapper given to the constructor, not a module-level global.
        return self.chrome.tab_info(self.tab_id)

    def id(self):
        """Not implemented yet; returns None."""
        return None

    def loading(self):
        """Not implemented yet; returns None."""
        return None

    def title(self):
        """Return the tab title (second row of the info text)."""
        return self._parsed_row(1, 'Title: ')

    def url(self):
        """Return the tab URL (third row of the info text)."""
        return self._parsed_row(2, 'Url: ')

    def _parsed_row(self, row_index, replace):
        """Return one row of the info text with its label removed.

        Args:
            row_index: Zero-based index of the row to read.
            replace: Label text to strip from the row, e.g. ``'Title: '``.

        Returns:
            The row as text with the first occurrence of ``replace`` removed.

        Raises:
            IndexError: If the info text has no row at ``row_index``.
        """
        info = self.data()
        if isinstance(info, bytes):
            # Byte output (always the case on Python 2) is decoded up front so
            # the result is text and the str operations below work everywhere.
            info = info.decode('utf-8')
        # Only the first occurrence is dropped so that a value which itself
        # contains the label text is not mangled.
        return info.split('\n')[row_index].replace(replace, '', 1)
class Tab(object):
    """A single browser tab, identified by its id."""

    def __init__(self, id, chrome):
        """Store the tab id and the chrome wrapper used to query the tab.

        Args:
            id: Id of the tab.
            chrome: Wrapper object handed on to ``TabInfo``.
        """
        self.chrome = chrome
        self.id = id

    def info(self):
        """Return a ``TabInfo`` for this tab."""
        # Must use the instance attributes: the bare names `id` and `chrome`
        # would resolve to the builtin id() function and a module-level global.
        return TabInfo(self.id, self.chrome)
def _as_text(value):
    """Return *value* as text, decoding UTF-8 bytes when necessary."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


def _as_bytes(value):
    """Return *value* as bytes, encoding text as UTF-8 when necessary."""
    return value if isinstance(value, bytes) else value.encode('utf-8')


chrome = Chrome(CHROME_CLI)
tabs = chrome.tabs_list()

# The cleaner configuration is the same for every tab, so build it once.
cleaner = Cleaner()
cleaner.page_structure = False
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
cleaner.remove_tags = ['div', 'h3', 'span', 'table', 'th', 'tr', 'td', 'ul', 'li', 'section', 'footer', 'hr', 'br', 'p', 'img', 'dd', 'dl', 'dt']

# One record per tab.
# NOTE(review): the records are collected but nothing is emitted yet; `json`
# is imported and unused, so serialising them was presumably the next step -
# confirm the intended output before adding it.
tab_records = []
for tab in tabs:
    tab_id_match = TAB_ID_REGEX.match(tab)
    if tab_id_match is None:
        # Not a tab line (no id prefix); nothing to look up.
        continue
    tab_id = tab_id_match.group(1)

    tab_info = run_command([CHROME_CLI, 'info', '-t', tab_id])
    info_rows = tab_info.split('\n')
    if len(info_rows) < 3:
        # Title is read from row 1 and Url from row 2; shorter output (e.g.
        # the tab went away after it was listed) cannot be parsed.
        continue

    tab_record = {}
    # Strip only the leading label so a title/URL containing the label text
    # itself is kept intact.
    tab_record['title'] = _as_text(info_rows[1].replace('Title: ', '', 1))
    tab_record['url'] = info_rows[2].replace('Url: ', '', 1)
    # hashlib needs bytes; on Python 3 the URL is text and must be encoded.
    tab_record['id'] = hashlib.md5(_as_bytes(tab_record['url'])).hexdigest()
    tab_record['updated_at'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())

    tab_source = run_command([CHROME_CLI, 'source', '-t', tab_id])
    tab_record['source'] = lxml.html.tostring(
        cleaner.clean_html(lxml.html.fromstring(tab_source)))
    tab_records.append(tab_record)
@jschairb
Copy link
Author

jschairb commented Nov 9, 2017

This code may or may not work; I can't remember how I left it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment