Created
          November 9, 2017 15:39 
        
      - 
      
- 
        Save jschairb/863ebacc28ba52ade8295564142c1506 to your computer and use it in GitHub Desktop. 
    Chrome tabs to JSON structure 
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # -*- coding: utf-8 -*- | |
| #!/usr/bin/env python | |
| import hashlib | |
| import json | |
| import re | |
| import subprocess | |
| import sys | |
| from time import gmtime, strftime | |
| import lxml | |
| from lxml.html.clean import Cleaner | |
# Matches a "[<digits>:<digits>]" prefix at the start of a tab listing line
# and captures the second number, which the rest of the script uses as the
# tab id.  Written as a raw string so the regex escapes ``\[`` and ``\]`` are
# not parsed as (invalid) string escape sequences.
TAB_ID_REGEX = re.compile(r'\[[0-9]+:([0-9]+)\]')
def run_command(command):
    """Run *command* and return its standard output as text.

    Args:
        command: Argument list handed to ``subprocess.check_output``.

    Returns:
        The command's stdout with leading and trailing newline characters
        stripped.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        OSError: If the executable cannot be started.
    """
    output = subprocess.check_output(command)
    # On Python 3 check_output returns bytes, and bytes.strip('\n') raises
    # TypeError.  Decode in that case; on Python 2 the output is already a
    # ``str`` and is passed through untouched.
    if not isinstance(output, str):
        output = output.decode('utf-8')
    return output.strip('\n')
# Resolve the chrome-cli executable once at import time; everything below
# shells out to it, so there is nothing useful to do without it.
try:
    CHROME_CLI = run_command(['which', 'chrome-cli'])
except (OSError, subprocess.CalledProcessError):
    # CalledProcessError: `which` exited non-zero (binary not found).
    # OSError: `which` itself could not be started.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print("No exec for chrome-cli was found. Please install.")
    sys.exit(1)
class Chrome(object):
    """Thin wrapper that shells out to a tab-listing command-line executable.

    Args:
        bin: Path of the executable to invoke (the script passes the
            resolved ``chrome-cli`` path).
    """

    def __init__(self, bin):
        self.bin = bin

    def list(self, element='tabs'):
        """Return the raw text output of ``<bin> list <element>``."""
        return self._run(['list', element])

    def tab_ids(self):
        """Return the tab ids found in the tab listing, as strings.

        Lines that do not start with a prefix matching ``TAB_ID_REGEX`` are
        skipped.
        """
        matches = (TAB_ID_REGEX.match(line) for line in self.tabs_list())
        return [match.group(1) for match in matches if match is not None]

    def tab_info(self, tab_id):
        """Return the raw text output of ``<bin> info -t <tab_id>``."""
        return self._run(['info', '-t', tab_id])

    def tabs_list(self):
        """Return the tab listing split into one string per line."""
        return self.list('tabs').split('\n')

    def _run(self, command):
        """Run ``[self.bin] + command`` and return its stdout as text.

        Leading and trailing newlines are stripped.  Raises
        ``subprocess.CalledProcessError`` on a non-zero exit status.
        """
        output = subprocess.check_output([self.bin] + command)
        # Python 3 returns bytes here and bytes.strip('\n') would raise
        # TypeError; decode so callers can keep using str operations.
        # Python 2 output is already ``str`` and is left as-is.
        if not isinstance(output, str):
            output = output.decode('utf-8')
        return output.strip('\n')
class TabInfo(object):
    """Accessor for the ``info`` output of a single tab.

    Args:
        tab_id: Id of the tab to query.
        chrome: Object exposing ``tab_info(tab_id)`` that returns the
            newline-separated info text for the tab.
    """

    def __init__(self, tab_id, chrome):
        self.chrome = chrome
        self.tab_id = tab_id

    def data(self):
        """Return the raw info text for this tab.

        Uses the instance's own ``chrome`` rather than a module-level
        global, so the class works with whatever object it was given.
        """
        return self.chrome.tab_info(self.tab_id)

    def id(self):
        # Placeholder: not implemented, returns None.
        pass

    def loading(self):
        # Placeholder: not implemented, returns None.
        pass

    def title(self):
        """Return the tab title: row 1 of the info text minus ``'Title: '``."""
        return self._parsed_row(1, 'Title: ')

    def url(self):
        # Placeholder: not implemented, returns None.
        pass

    def _parsed_row(self, row_index, replace):
        """Return row *row_index* of the info text with *replace* removed.

        Byte strings are decoded as UTF-8; text strings are returned
        unchanged, so the method behaves the same on Python 2 and 3.
        """
        value = self.data().split('\n')[row_index].replace(replace, '')
        if isinstance(value, bytes):
            value = value.decode('utf-8')
        return value
class Tab(object):
    """A single tab, identified by its id.

    Args:
        id: Id of the tab.
        chrome: Object used to query the tab; passed on to ``TabInfo``.
    """

    def __init__(self, id, chrome):
        self.chrome = chrome
        self.id = id

    def info(self):
        """Return a ``TabInfo`` for this tab.

        Passes this instance's own id and chrome wrapper; the bare names
        ``id`` and ``chrome`` would refer to the builtin function and a
        module-level global instead.
        """
        return TabInfo(self.id, self.chrome)
chrome = Chrome(CHROME_CLI)
tabs = chrome.tabs_list()

# The cleaner's configuration is identical for every tab, so build it once
# instead of on each loop iteration.
cleaner = Cleaner()
cleaner.page_structure = False
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
cleaner.remove_tags = ['div', 'h3', 'span', 'table', 'th', 'tr', 'td', 'ul', 'li', 'section', 'footer', 'hr', 'br', 'p', 'img', 'dd', 'dl', 'dt']

for tab in tabs:
    tab_id_match = TAB_ID_REGEX.match(tab)
    if tab_id_match is None:
        # Not a "[n:n] ..." listing line; nothing to look up.
        continue
    tab_id = tab_id_match.group(1)

    tab_info = run_command([CHROME_CLI, 'info', '-t', tab_id])
    info_lines = tab_info.split('\n')  # row 1 holds the title, row 2 the URL

    title = info_lines[1].replace('Title: ', '')
    if isinstance(title, bytes):
        # Python 2 byte string: decode to text as the original did.
        title = title.decode('utf-8')
    url = info_lines[2].replace('Url: ', '')
    # md5 needs bytes; encode only when the URL is a text string.
    url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')

    # NOTE(review): tab_record is rebuilt for every tab and is not stored or
    # emitted anywhere in the visible code - confirm where it should go.
    tab_record = {}
    tab_record['title'] = title
    tab_record['url'] = url
    tab_record['id'] = hashlib.md5(url_bytes).hexdigest()
    tab_record['updated_at'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())

    tab_source = run_command([CHROME_CLI, 'source', '-t', tab_id])
    cleaned_document = cleaner.clean_html(lxml.html.fromstring(tab_source))
    tab_record['source'] = lxml.html.tostring(cleaned_document)
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
This code may or may not work; I can't remember how I left it.