Created
October 10, 2023 23:56
-
-
Save jaggzh/d6707c262db788f60b1f087551877e10 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| import sqlite3 | |
| import csv | |
| import argparse | |
| import re | |
| from datetime import datetime | |
| def pe(*x, **y): print(*x, *y, file=sys.stderr) | |
| def pf(*x, **y): print(*x, *y, flush=True) | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("-f", "--file", default="places.sqlite") | |
| parser.add_argument("-o", "--output", default="places", help="output filename (default places.tab for csv, places.html for html") | |
| parser.add_argument("-t", "--type", default="csvtab", help="output type: csvtab, html") | |
| parser.add_argument("-c", "--cols", help="Default for csv: url,title,description,visit_count,frecency,last_visit_date,favicon_url") | |
| parser.add_argument("-fi", "--favicons_file", type=str, default='favicons.sqlite') | |
| parser.add_argument("-rf", "--regex_file", type=str) | |
| parser.add_argument("-v", "--verbose", action="count", default=0, help="increase output verbosity") | |
| parser.add_argument("-r", "--regex", action="append") | |
| args = parser.parse_args() | |
| regex_cols = ["url", "title", "description"] | |
| type_cols = { | |
| 'csvtab': "last_visit_date,url,title,description,visit_count,frecency,favicon_url", | |
| 'html': "last_visit_date,favicon_url,url,title,description,visit_count,frecency", | |
| } | |
| if args.type == 'csvtab': | |
| output_mode = "w" | |
| if args.output is None: args.output = 'places.tab' | |
| elif args.type == 'html': | |
| output_mode = "w" | |
| if args.output is None: args.output = 'places.html' | |
| else: | |
| print(f"Invalid type: {args.type}") | |
| sys.exit(1) | |
| newline = None | |
| if args.output == '-': | |
| output_mode = "a" | |
| newline = "" | |
| if args.cols is None: | |
| args.cols = type_cols[args.type] | |
| cols = args.cols.split(",") | |
| regexes = args.regex or [] | |
| regex_user = False | |
| if args.regex is not None: | |
| regex_user=True | |
| if args.regex_file: | |
| regex_user=True | |
| with open(args.regex_file, "r") as f: | |
| regexes.extend([line.strip() for line in f if not line.startswith('#')]) | |
| print(f"Regexes: {regexes}") | |
| con = sqlite3.connect(args.file) | |
| if 'favicon_url' in cols: | |
| con.execute("ATTACH DATABASE ? AS favicons", (args.favicons_file,)) | |
| def regexp_func(expr, item): | |
| try: | |
| if item is None: return False | |
| reg = re.compile(expr, re.IGNORECASE) | |
| if args.verbose > 2: print(f"Comparing {item} to r'{expr}'") | |
| return reg.search(item) is not None | |
| except re.error as e: | |
| print(f"Invalid regex {expr}: {str(e)}", file=sys.stderr) | |
| return False | |
| except Exception as e: | |
| print(f"Exception with regex {expr} and item {item}: {str(e)}", file=sys.stderr) | |
| return False | |
| con.create_function("REGEXP", 2, regexp_func) | |
| cur = con.cursor() | |
| if not regex_user: | |
| print("Dumping all entries (see -r or -rf to restrict)...", file=sys.stderr) | |
| results = set() | |
| formatted_datetime_query = """ | |
| datetime(moz_places.last_visit_date / 1000000, 'unixepoch') as last_visit_date | |
| """ if 'last_visit_date' in args.cols else 'moz_places.last_visit_date' | |
| # Extract cols that are not 'favicon_url' and replace 'last_visit_date' with formatted_datetime_query | |
| # Replacing last_visit_date in cols with formatted_datetime_query | |
| selected_cols = [col.replace('last_visit_date', formatted_datetime_query) for col in cols] | |
| # Subselect to get the favicon URL | |
| favicon_subselect = """ | |
| (SELECT Icon.icon_url | |
| FROM favicons.moz_pages_w_icons AS Url2PageID | |
| LEFT JOIN favicons.moz_icons_to_pages AS PageID2IconID ON Url2PageID.id = PageID2IconID.page_id | |
| LEFT JOIN favicons.moz_icons AS Icon ON PageID2IconID.icon_id = Icon.id | |
| WHERE moz_places.url = Url2PageID.page_url) AS favicon_url | |
| """ | |
| # Check if favicon_url is requested in cols and replace it with the subselect if it is | |
| selected_cols = [favicon_subselect if col == "favicon_url" else col for col in selected_cols] | |
| # Construct the SQL query with the selected_cols | |
| query = f""" | |
| SELECT {', '.join(selected_cols)} | |
| FROM moz_places | |
| """ | |
| if args.verbose>0: pf(f"Query:\n{query}") | |
| # Existing query execution logic remains the same | |
| if regex_user: | |
| # Constructing the WHERE clause with ORs between different regex conditions | |
| where_clause = " OR ".join( | |
| [f"({col} REGEXP ?)" for col in regex_cols for _ in regexes] | |
| ) | |
| cur.execute(query + "WHERE " + where_clause, tuple(regexes * len(regex_cols))) | |
| else: | |
| cur.execute(query) | |
| results.update(cur.fetchall()) | |
| last_visit_date_idx = None | |
| if 'last_visit_date' in cols: | |
| last_visit_date_idx = cols.index('last_visit_date') | |
| results = sorted(results, key=lambda x: x[last_visit_date_idx] or "1970-01-01 00:00:00") | |
| favicon_idx = None | |
| if 'favicon_url' in cols: | |
| favicon_idx = cols.index('favicon_url') | |
| if args.verbose>1: print(f"Results: {results}") | |
| with (open(args.output, output_mode, newline=newline) if args.output != "-" else sys.stdout) as f: | |
| if args.type == "csvtab": | |
| print("#" + "\t".join(cols), file=f) | |
| esc_map = {'\t':'\\t', '\n':'\\n', '\r':'\\r', '\\':'\\\\'} | |
| f.writelines([ | |
| "\t".join([ | |
| re.sub(r'[\t\n\r\\]', | |
| lambda x: esc_map[x.group()], | |
| str(cell) if cell is not None else "" | |
| ) | |
| for cell in row | |
| ]) + "\n" | |
| for row in results | |
| ]) | |
| elif args.type == "html": | |
| f.write("<html><head><title>Places</title></head><body><table border='1'>\n") | |
| f.write("<tr>" + "".join([f"<th>{col}</th>" for col in cols]) + "</tr>\n") | |
| for row in results: | |
| f.write("<tr>") | |
| for i, cell in enumerate(row): | |
| clean_cell = cell if cell is not None else '' | |
| if i == favicon_idx: | |
| f.write(f"<td><img src='{clean_cell}' width='24' height='24'></td>") | |
| elif i == cols.index('url'): | |
| f.write(f"<td><a href='{clean_cell}'>{clean_cell}</a></td>") | |
| else: | |
| # Replacing empty strings with non-breaking space for better HTML formatting | |
| f.write(f"<td>{clean_cell if clean_cell != '' else ' '}</td>") | |
| f.write("</tr>\n") | |
| f.write("</table></body></html>\n") | |
| else: | |
| writer = csv.DictWriter(f, fieldnames=cols) | |
| writer.writeheader() | |
| writer.writerows([{ | |
| cols[i]: (cell if cell is not None else '') | |
| for i, cell in enumerate(row) | |
| } for row in results]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment