Created
June 13, 2025 11:28
-
-
Save lepinkainen/d3e520304066f0020e926cee51f5f953 to your computer and use it in GitHub Desktop.
Create a datasette compatible sqlite3 database from IRCCloud logs, full text search is enabled
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sqlite3 | |
from datetime import datetime, UTC | |
import argparse | |
import os | |
def init_db(db_path, channel, source_file): | |
conn = sqlite3.connect(db_path) | |
c = conn.cursor() | |
# Main messages table | |
c.execute( | |
""" | |
CREATE TABLE IF NOT EXISTS messages ( | |
id INTEGER PRIMARY KEY, | |
timestamp TEXT, | |
channel TEXT, | |
sender TEXT, | |
type TEXT, | |
content TEXT, | |
metadata TEXT | |
) | |
""" | |
) | |
# Indexes | |
c.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON messages(timestamp)") | |
c.execute("CREATE INDEX IF NOT EXISTS idx_sender ON messages(sender)") | |
c.execute("CREATE INDEX IF NOT EXISTS idx_channel ON messages(channel)") | |
c.execute("CREATE INDEX IF NOT EXISTS idx_type ON messages(type)") | |
# FTS table | |
c.execute( | |
""" | |
CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts | |
USING fts5(content, content='messages', content_rowid='id'); | |
""" | |
) | |
# Triggers to sync messages -> messages_fts | |
c.executescript( | |
""" | |
CREATE TRIGGER IF NOT EXISTS messages_ai AFTER INSERT ON messages BEGIN | |
INSERT INTO messages_fts(rowid, content) VALUES (new.id, new.content); | |
END; | |
CREATE TRIGGER IF NOT EXISTS messages_ad AFTER DELETE ON messages BEGIN | |
DELETE FROM messages_fts WHERE rowid = old.id; | |
END; | |
CREATE TRIGGER IF NOT EXISTS messages_au AFTER UPDATE ON messages BEGIN | |
UPDATE messages_fts SET content = new.content WHERE rowid = old.id; | |
END; | |
""" | |
) | |
# Meta table | |
c.execute( | |
""" | |
CREATE TABLE IF NOT EXISTS meta ( | |
key TEXT PRIMARY KEY, | |
value TEXT | |
) | |
""" | |
) | |
c.execute( | |
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", ("channel", channel) | |
) | |
c.execute( | |
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", | |
("import_time", datetime.now(UTC).isoformat()), | |
) | |
c.execute( | |
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", | |
("source_file", os.path.basename(source_file)), | |
) | |
c.execute( | |
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", | |
("script_version", "1.0"), | |
) | |
return conn | |
msg_pattern = re.compile(r"^\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})] ([^ ]+) (.+)$") | |
def parse_line(line, default_channel): | |
match = msg_pattern.match(line) | |
if not match: | |
return None | |
timestamp_str, prefix, rest = match.groups() | |
try: | |
iso_ts = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").isoformat() | |
except ValueError: | |
return None | |
if prefix.startswith("<"): | |
sender = prefix.strip("<>") | |
return (iso_ts, default_channel, sender, "message", rest.strip(), "{}") | |
elif prefix == "*": | |
# Handle system messages | |
if "set +o" in rest: | |
return (iso_ts, default_channel, None, "mode", rest.strip(), "{}") | |
elif "joined" in rest: | |
parts = rest.split() | |
sender = parts[0] if parts else None | |
return (iso_ts, default_channel, sender, "join", rest.strip(), "{}") | |
elif "quit" in rest: | |
parts = rest.split("(", 1) | |
sender = parts[0].strip() | |
msg = parts[1].rstrip(")") if len(parts) > 1 else "" | |
return (iso_ts, default_channel, sender, "quit", msg, "{}") | |
else: | |
return (iso_ts, default_channel, None, "system", rest.strip(), "{}") | |
elif prefix == "→": | |
sender = rest.split()[0] | |
return (iso_ts, default_channel, sender, "join", rest.strip(), "{}") | |
elif prefix == "←": | |
sender = rest.split()[0] | |
return (iso_ts, default_channel, sender, "part", rest.strip(), "{}") | |
else: | |
return (iso_ts, default_channel, None, "unknown", line.strip(), "{}") | |
def main(): | |
parser = argparse.ArgumentParser( | |
description="Parse IRC logs to SQLite for Datasette." | |
) | |
parser.add_argument("input_file", help="Path to the IRC log file (text)") | |
parser.add_argument( | |
"--output-db", | |
help="Path to output SQLite database (auto-named if not provided and --channel is set)", | |
default=None, | |
) | |
parser.add_argument( | |
"--channel", default="#unknown", help="Channel name to assign to messages" | |
) | |
args = parser.parse_args() | |
if not os.path.exists(args.input_file): | |
print(f"Input file {args.input_file} not found.") | |
return | |
# Determine output database path | |
if args.output_db: | |
output_db = args.output_db | |
elif args.channel.startswith("#"): | |
clean_channel = args.channel[1:].replace("/", "_") | |
output_db = f"irc_{clean_channel}.db" | |
else: | |
output_db = "irc_unknown.db" | |
print( | |
f"Parsing {args.input_file} and writing to {output_db}, channel set to {args.channel}..." | |
) | |
if os.path.exists(output_db): | |
print(f"Output database {output_db} already exists. Overwriting.") | |
os.remove(output_db) | |
conn = init_db(output_db, args.channel, args.input_file) | |
cur = conn.cursor() | |
with open(args.input_file, encoding="utf-8") as f: | |
for line in f: | |
parsed = parse_line(line, args.channel) | |
if parsed: | |
cur.execute( | |
"INSERT INTO messages (timestamp, channel, sender, type, content, metadata) VALUES (?, ?, ?, ?, ?, ?)", | |
parsed, | |
) | |
conn.commit() | |
conn.close() | |
print(f"Done. SQLite database written to {output_db}") | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment