Skip to content

Instantly share code, notes, and snippets.

@lepinkainen
Created June 13, 2025 11:28
Show Gist options
  • Save lepinkainen/d3e520304066f0020e926cee51f5f953 to your computer and use it in GitHub Desktop.
Save lepinkainen/d3e520304066f0020e926cee51f5f953 to your computer and use it in GitHub Desktop.
Create a datasette compatible sqlite3 database from IRCCloud logs, full text search is enabled
import re
import sqlite3
from datetime import datetime, UTC
import argparse
import os
def init_db(db_path, channel, source_file):
conn = sqlite3.connect(db_path)
c = conn.cursor()
# Main messages table
c.execute(
"""
CREATE TABLE IF NOT EXISTS messages (
id INTEGER PRIMARY KEY,
timestamp TEXT,
channel TEXT,
sender TEXT,
type TEXT,
content TEXT,
metadata TEXT
)
"""
)
# Indexes
c.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON messages(timestamp)")
c.execute("CREATE INDEX IF NOT EXISTS idx_sender ON messages(sender)")
c.execute("CREATE INDEX IF NOT EXISTS idx_channel ON messages(channel)")
c.execute("CREATE INDEX IF NOT EXISTS idx_type ON messages(type)")
# FTS table
c.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts
USING fts5(content, content='messages', content_rowid='id');
"""
)
# Triggers to sync messages -> messages_fts
c.executescript(
"""
CREATE TRIGGER IF NOT EXISTS messages_ai AFTER INSERT ON messages BEGIN
INSERT INTO messages_fts(rowid, content) VALUES (new.id, new.content);
END;
CREATE TRIGGER IF NOT EXISTS messages_ad AFTER DELETE ON messages BEGIN
DELETE FROM messages_fts WHERE rowid = old.id;
END;
CREATE TRIGGER IF NOT EXISTS messages_au AFTER UPDATE ON messages BEGIN
UPDATE messages_fts SET content = new.content WHERE rowid = old.id;
END;
"""
)
# Meta table
c.execute(
"""
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT
)
"""
)
c.execute(
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", ("channel", channel)
)
c.execute(
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
("import_time", datetime.now(UTC).isoformat()),
)
c.execute(
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
("source_file", os.path.basename(source_file)),
)
c.execute(
"INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
("script_version", "1.0"),
)
return conn
msg_pattern = re.compile(r"^\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})] ([^ ]+) (.+)$")
def parse_line(line, default_channel):
match = msg_pattern.match(line)
if not match:
return None
timestamp_str, prefix, rest = match.groups()
try:
iso_ts = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").isoformat()
except ValueError:
return None
if prefix.startswith("<"):
sender = prefix.strip("<>")
return (iso_ts, default_channel, sender, "message", rest.strip(), "{}")
elif prefix == "*":
# Handle system messages
if "set +o" in rest:
return (iso_ts, default_channel, None, "mode", rest.strip(), "{}")
elif "joined" in rest:
parts = rest.split()
sender = parts[0] if parts else None
return (iso_ts, default_channel, sender, "join", rest.strip(), "{}")
elif "quit" in rest:
parts = rest.split("(", 1)
sender = parts[0].strip()
msg = parts[1].rstrip(")") if len(parts) > 1 else ""
return (iso_ts, default_channel, sender, "quit", msg, "{}")
else:
return (iso_ts, default_channel, None, "system", rest.strip(), "{}")
elif prefix == "→":
sender = rest.split()[0]
return (iso_ts, default_channel, sender, "join", rest.strip(), "{}")
elif prefix == "←":
sender = rest.split()[0]
return (iso_ts, default_channel, sender, "part", rest.strip(), "{}")
else:
return (iso_ts, default_channel, None, "unknown", line.strip(), "{}")
def main():
parser = argparse.ArgumentParser(
description="Parse IRC logs to SQLite for Datasette."
)
parser.add_argument("input_file", help="Path to the IRC log file (text)")
parser.add_argument(
"--output-db",
help="Path to output SQLite database (auto-named if not provided and --channel is set)",
default=None,
)
parser.add_argument(
"--channel", default="#unknown", help="Channel name to assign to messages"
)
args = parser.parse_args()
if not os.path.exists(args.input_file):
print(f"Input file {args.input_file} not found.")
return
# Determine output database path
if args.output_db:
output_db = args.output_db
elif args.channel.startswith("#"):
clean_channel = args.channel[1:].replace("/", "_")
output_db = f"irc_{clean_channel}.db"
else:
output_db = "irc_unknown.db"
print(
f"Parsing {args.input_file} and writing to {output_db}, channel set to {args.channel}..."
)
if os.path.exists(output_db):
print(f"Output database {output_db} already exists. Overwriting.")
os.remove(output_db)
conn = init_db(output_db, args.channel, args.input_file)
cur = conn.cursor()
with open(args.input_file, encoding="utf-8") as f:
for line in f:
parsed = parse_line(line, args.channel)
if parsed:
cur.execute(
"INSERT INTO messages (timestamp, channel, sender, type, content, metadata) VALUES (?, ?, ?, ?, ?, ?)",
parsed,
)
conn.commit()
conn.close()
print(f"Done. SQLite database written to {output_db}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment