Skip to content

Instantly share code, notes, and snippets.

@jaggzh
Created October 10, 2023 23:56
Show Gist options
  • Select an option

  • Save jaggzh/d6707c262db788f60b1f087551877e10 to your computer and use it in GitHub Desktop.

Select an option

Save jaggzh/d6707c262db788f60b1f087551877e10 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys
import sqlite3
import csv
import argparse
import re
from datetime import datetime
def pe(*x, **y): print(*x, *y, file=sys.stderr)
def pf(*x, **y): print(*x, *y, flush=True)
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--file", default="places.sqlite")
parser.add_argument("-o", "--output", default="places", help="output filename (default places.tab for csv, places.html for html")
parser.add_argument("-t", "--type", default="csvtab", help="output type: csvtab, html")
parser.add_argument("-c", "--cols", help="Default for csv: url,title,description,visit_count,frecency,last_visit_date,favicon_url")
parser.add_argument("-fi", "--favicons_file", type=str, default='favicons.sqlite')
parser.add_argument("-rf", "--regex_file", type=str)
parser.add_argument("-v", "--verbose", action="count", default=0, help="increase output verbosity")
parser.add_argument("-r", "--regex", action="append")
args = parser.parse_args()
regex_cols = ["url", "title", "description"]
type_cols = {
'csvtab': "last_visit_date,url,title,description,visit_count,frecency,favicon_url",
'html': "last_visit_date,favicon_url,url,title,description,visit_count,frecency",
}
if args.type == 'csvtab':
output_mode = "w"
if args.output is None: args.output = 'places.tab'
elif args.type == 'html':
output_mode = "w"
if args.output is None: args.output = 'places.html'
else:
print(f"Invalid type: {args.type}")
sys.exit(1)
newline = None
if args.output == '-':
output_mode = "a"
newline = ""
if args.cols is None:
args.cols = type_cols[args.type]
cols = args.cols.split(",")
regexes = args.regex or []
regex_user = False
if args.regex is not None:
regex_user=True
if args.regex_file:
regex_user=True
with open(args.regex_file, "r") as f:
regexes.extend([line.strip() for line in f if not line.startswith('#')])
print(f"Regexes: {regexes}")
con = sqlite3.connect(args.file)
if 'favicon_url' in cols:
con.execute("ATTACH DATABASE ? AS favicons", (args.favicons_file,))
def regexp_func(expr, item):
try:
if item is None: return False
reg = re.compile(expr, re.IGNORECASE)
if args.verbose > 2: print(f"Comparing {item} to r'{expr}'")
return reg.search(item) is not None
except re.error as e:
print(f"Invalid regex {expr}: {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Exception with regex {expr} and item {item}: {str(e)}", file=sys.stderr)
return False
con.create_function("REGEXP", 2, regexp_func)
cur = con.cursor()
if not regex_user:
print("Dumping all entries (see -r or -rf to restrict)...", file=sys.stderr)
results = set()
formatted_datetime_query = """
datetime(moz_places.last_visit_date / 1000000, 'unixepoch') as last_visit_date
""" if 'last_visit_date' in args.cols else 'moz_places.last_visit_date'
# Extract cols that are not 'favicon_url' and replace 'last_visit_date' with formatted_datetime_query
# Replacing last_visit_date in cols with formatted_datetime_query
selected_cols = [col.replace('last_visit_date', formatted_datetime_query) for col in cols]
# Subselect to get the favicon URL
favicon_subselect = """
(SELECT Icon.icon_url
FROM favicons.moz_pages_w_icons AS Url2PageID
LEFT JOIN favicons.moz_icons_to_pages AS PageID2IconID ON Url2PageID.id = PageID2IconID.page_id
LEFT JOIN favicons.moz_icons AS Icon ON PageID2IconID.icon_id = Icon.id
WHERE moz_places.url = Url2PageID.page_url) AS favicon_url
"""
# Check if favicon_url is requested in cols and replace it with the subselect if it is
selected_cols = [favicon_subselect if col == "favicon_url" else col for col in selected_cols]
# Construct the SQL query with the selected_cols
query = f"""
SELECT {', '.join(selected_cols)}
FROM moz_places
"""
if args.verbose>0: pf(f"Query:\n{query}")
# Existing query execution logic remains the same
if regex_user:
# Constructing the WHERE clause with ORs between different regex conditions
where_clause = " OR ".join(
[f"({col} REGEXP ?)" for col in regex_cols for _ in regexes]
)
cur.execute(query + "WHERE " + where_clause, tuple(regexes * len(regex_cols)))
else:
cur.execute(query)
results.update(cur.fetchall())
last_visit_date_idx = None
if 'last_visit_date' in cols:
last_visit_date_idx = cols.index('last_visit_date')
results = sorted(results, key=lambda x: x[last_visit_date_idx] or "1970-01-01 00:00:00")
favicon_idx = None
if 'favicon_url' in cols:
favicon_idx = cols.index('favicon_url')
if args.verbose>1: print(f"Results: {results}")
with (open(args.output, output_mode, newline=newline) if args.output != "-" else sys.stdout) as f:
if args.type == "csvtab":
print("#" + "\t".join(cols), file=f)
esc_map = {'\t':'\\t', '\n':'\\n', '\r':'\\r', '\\':'\\\\'}
f.writelines([
"\t".join([
re.sub(r'[\t\n\r\\]',
lambda x: esc_map[x.group()],
str(cell) if cell is not None else ""
)
for cell in row
]) + "\n"
for row in results
])
elif args.type == "html":
f.write("<html><head><title>Places</title></head><body><table border='1'>\n")
f.write("<tr>" + "".join([f"<th>{col}</th>" for col in cols]) + "</tr>\n")
for row in results:
f.write("<tr>")
for i, cell in enumerate(row):
clean_cell = cell if cell is not None else ''
if i == favicon_idx:
f.write(f"<td><img src='{clean_cell}' width='24' height='24'></td>")
elif i == cols.index('url'):
f.write(f"<td><a href='{clean_cell}'>{clean_cell}</a></td>")
else:
# Replacing empty strings with non-breaking space for better HTML formatting
f.write(f"<td>{clean_cell if clean_cell != '' else '&nbsp;'}</td>")
f.write("</tr>\n")
f.write("</table></body></html>\n")
else:
writer = csv.DictWriter(f, fieldnames=cols)
writer.writeheader()
writer.writerows([{
cols[i]: (cell if cell is not None else '')
for i, cell in enumerate(row)
} for row in results])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment