Skip to content

Instantly share code, notes, and snippets.

@szhu
Last active April 29, 2026 09:05
Show Gist options
  • Select an option

  • Save szhu/44ae5aacafccbe35d793546ab2ec8c5d to your computer and use it in GitHub Desktop.

Select an option

Save szhu/44ae5aacafccbe35d793546ab2ec8c5d to your computer and use it in GitHub Desktop.
// ==UserScript==
// @name Messenger Scraper
// @namespace https://github.com/szhu
// @version 0.20260429.5
// @match https://www.facebook.com/messages/*
// @match https://www.messenger.com/*
// @grant none
// @run-at document-idle
// ==/UserScript==
// @ts-check
// Lint: tsc --checkJs --allowJs --noEmit --target ES2020 --lib ES2020,DOM --strict messenger-scraper.user.js
// Product design
//
// - **What is this?** This is a script that exports Messenger conversations as
// plain text. It scrapes message content, sender names, and timestamps from
// the DOM as the user scrolls, accumulating a full conversation history
// across multiple scroll sessions.
// - **How to use:** Click the small dot in the bottom-right corner to toggle
// the panel. The panel contains the textarea and controls.
// - **The panel activates and deactivates the observer.** The observer doesn't
// start until the panel is opened; it disconnects when closed. In-memory
// state is cleared on close. Nothing runs on script load.
// - **The textarea is the only persistent state.** No `localStorage`, no export
// file. `pages` and `seen` are always derived from it. Copying the textarea
// captures the full state; pasting it back restores it.
//
// Known limitations
//
// - **Duplicate messages when sender name changes.** Messenger sometimes renders
// a sender's full name on the first load ("Jane Smith") and a nickname on
// subsequent loads ("Jane"). Since the message key includes the sender name,
// these appear as two different messages and both get stored. Pressing Clear
// and re-scraping is the workaround.
// - **Message key collides within one minute.** The message key is
// `[timestamp] sender: text`, where the timestamp has one-minute resolution.
// If the same person sends the exact same text twice within the same minute,
// the second occurrence is silently dropped as a duplicate. This is rare in
// practice.
//
// Non-obvious Messenger DOM facts
//
// - **Sender names differ between first and subsequent renders.** On the first
// render of a conversation, Messenger uses full FB profile names ("Jane
// Smith"); subsequent renders use nicknames or first names ("Jane"). Same
// message, different string.
// - **Navigating between conversations replaces the entire conversation DOM.**
// The message container, scrollable element, and observer target all go stale.
// - **Message class names are obfuscated and unstable.** The stable structure
// for finding message text is positional: the message button's parent's
// sibling element with non-empty inner text.
// - **The scrollable container is a child of `[role="log"]`.** The log element
// itself has `overflow:hidden`; the actual scrollable element is a descendant
// with `overflow:auto` or `overflow:scroll`.
// - **Weekday timestamps always refer to the past.** Messenger shows
// "Tuesday 3:42pm" only for messages from the past 7 days that aren't
// today — never for today's messages.
(function () {
'use strict';
// --- Date parsing ---
/**
 * Canonicalizes the am/pm suffix of a clock string: "11:43pm" -> "11:43 PM".
 * Any whitespace before the suffix is collapsed to a single space and the
 * suffix is upper-cased; strings without a trailing am/pm are returned as-is.
 * @param {string} t
 * @returns {string}
 */
function normalizeTime(t) {
  return t.replace(/\s*([ap]m)$/i, (/** @type {string} */ _, /** @type {string} */ ampm) => ' ' + ampm.toUpperCase());
}

/** Lower-case English month names, indexed like `Date#getMonth()`. */
const MONTH_NAMES = [
  'january', 'february', 'march', 'april', 'may', 'june',
  'july', 'august', 'september', 'october', 'november', 'december',
];

/** Lower-case English weekday names, indexed like `Date#getDay()`. */
const WEEKDAY_NAMES = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'];

/**
 * Parses a 12-hour clock string ("5:00 PM", "11:43pm") into 24-hour parts.
 * Returns null when the string is not a valid 12-hour time.
 * @param {string} t
 * @returns {{ hours: number, minutes: number } | null}
 */
function parseClockTime(t) {
  const m = normalizeTime(t).match(/^(\d{1,2}):(\d{2}) ([AP])M$/);
  if (!m) return null;
  const hour12 = Number(m[1]);
  const minutes = Number(m[2]);
  if (hour12 < 1 || hour12 > 12 || minutes > 59) return null;
  // 12 AM is hour 0 and 12 PM is hour 12, hence the modulo before adding the PM offset.
  const hours = (hour12 % 12) + (m[3] === 'P' ? 12 : 0);
  return { hours, minutes };
}

/**
 * Parses the three timestamp formats Facebook Messenger uses in aria-labels:
 * full dates ("April 21, 2026, 5:00 PM"), weekday-relative ("Tuesday 11:43pm"),
 * and time-only ("12:27 AM", resolved to today). Returns null if unrecognized.
 *
 * All fields are parsed explicitly instead of handing a non-ISO string to
 * `new Date(string)`, whose result is implementation-defined and differs
 * between browsers. Dates that do not exist on the calendar (e.g. "April 31")
 * are rejected rather than silently rolled over into the next month.
 *
 * @param {string} str
 * @param {Date} [now] Reference "current time" for the relative formats;
 *   defaults to the moment of the call.
 * @returns {Date | null} A local-time Date with seconds and milliseconds zeroed.
 */
function parseMessengerDate(str, now = new Date()) {
  const s = str.trim().replace(/\s/g, ' '); // normalize narrow no-break spaces (U+202F) etc.

  // "Month DD, YYYY, H:MM AM/PM"
  const fullDate = s.match(/^(\w+) (\d{1,2}), (\d{4}), (\d{1,2}:\d{2}\s*[ap]m)$/i);
  if (fullDate) {
    const word = fullDate[1].toLowerCase();
    // Accept full names and unambiguous abbreviations of three or more letters ("Apr", "Sept").
    const month = word.length >= 3 ? MONTH_NAMES.findIndex((name) => name.startsWith(word)) : -1;
    const day = Number(fullDate[2]);
    const clock = parseClockTime(fullDate[4]);
    if (month !== -1 && clock) {
      const d = new Date(Number(fullDate[3]), month, day, clock.hours, clock.minutes);
      // The Date constructor rolls overflowing days forward; treat that as invalid input.
      if (d.getMonth() === month && d.getDate() === day) return d;
    }
  }

  // "Weekday H:MMam/pm"
  const weekdayMatch = s.match(/^(sunday|monday|tuesday|wednesday|thursday|friday|saturday)\s+(\d{1,2}:\d{2}\s*[ap]m)$/i);
  if (weekdayMatch) {
    const clock = parseClockTime(weekdayMatch[2]);
    if (clock) {
      const targetDay = WEEKDAY_NAMES.indexOf(weekdayMatch[1].toLowerCase());
      let daysBack = now.getDay() - targetDay;
      // Weekday labels never refer to today, so the same weekday means a full week ago.
      if (daysBack <= 0) daysBack += 7;
      return new Date(now.getFullYear(), now.getMonth(), now.getDate() - daysBack, clock.hours, clock.minutes);
    }
  }

  // "H:MM AM/PM" or "H:MMam/pm" (today)
  const timeOnly = s.match(/^\d{1,2}:\d{2}\s*[ap]m$/i);
  if (timeOnly) {
    const clock = parseClockTime(timeOnly[0]);
    if (clock) return new Date(now.getFullYear(), now.getMonth(), now.getDate(), clock.hours, clock.minutes);
  }
  return null;
}
/**
 * Renders a Date as the fixed-width, lexicographically sortable stamp
 * "YYYY-MM-DD HH:MM" (local time). A missing date yields a same-width
 * placeholder made of question marks.
 * @param {Date | null} d
 * @returns {string}
 */
function formatDate(d) {
  if (!d) return '????-??-?? ??:??';
  /** @param {number} value */
  const twoDigits = (value) => `${value}`.padStart(2, '0');
  const calendar = [d.getFullYear(), twoDigits(d.getMonth() + 1), twoDigits(d.getDate())].join('-');
  const clock = [twoDigits(d.getHours()), twoDigits(d.getMinutes())].join(':');
  return `${calendar} ${clock}`;
}
// --- Message extraction ---
/**
 * Turns one Messenger message button into a "[timestamp] sender: text" line.
 *
 * The timestamp and sender come from the button's aria-label. The message body
 * comes from the first sibling of the button's parent that is an HTMLElement
 * with non-blank inner text. When that text starts with a line equal to the
 * sender name, that line is removed. Remaining line breaks become tabs, with
 * runs of tabs collapsed to one.
 *
 * @param {Element} el
 * @returns {string | null} The formatted line, or null when the aria-label is
 *   not a message label or no body text is found.
 */
function extractMessage(el) {
  const match = /^Enter, Message sent (.+?) by (.+?): /.exec(el.getAttribute('aria-label') || '');
  if (!match) return null;
  const [, when, who] = match;
  const date = formatDate(parseMessengerDate(when));
  const sender = who.trim();

  const wrapper = el.parentElement;
  const siblings = Array.from(wrapper?.parentElement?.children || []);
  const contentDiv = siblings.find(
    (node) => node !== wrapper && node instanceof HTMLElement && node.innerText?.trim(),
  );
  const raw = contentDiv instanceof HTMLElement ? contentDiv.innerText?.trim() : '';

  const lines = raw.split('\n');
  const bodyLines = lines[0] === sender ? lines.slice(1) : lines;
  const text = bodyLines.join('\t').replace(/\t+/g, '\t');
  return text ? `[${date}] ${sender}: ${text}` : null;
}
/**
 * Small DOM builder: creates `tag`, applies `props`, then appends `children`.
 *
 * Prop handling, per key:
 * - keys starting with "on" whose value is a function are registered as event
 *   listeners (the event name is the rest of the key, lower-cased);
 * - "style" is written to `el.style.cssText`;
 * - anything else is assigned as a property on the element.
 *
 * Children may be strings (turned into text nodes), Nodes, or arrays of either;
 * arrays are flattened one level.
 *
 * @param {string} tag
 * @param {Record<string, any>} [props]
 * @param {...(Node | string | (Node | string)[])} children
 * @returns {HTMLElement}
 */
function createElement(tag, props = {}, ...children) {
  const el = document.createElement(tag);

  Object.entries(props).forEach(([key, value]) => {
    const isListener = key.startsWith('on') && typeof value === 'function';
    if (isListener) {
      el.addEventListener(key.slice(2).toLowerCase(), value);
      return;
    }
    if (key === 'style') {
      el.style.cssText = value;
      return;
    }
    Object.assign(el, { [key]: value });
  });

  const flatChildren = /** @type {(Node | string)[]} */ (children.flat());
  flatChildren.forEach((child) => {
    const node = typeof child === 'string' ? document.createTextNode(child) : child;
    el.appendChild(node);
  });
  return el;
}
/**
* Builds the UI and wires up state and event handlers. No scanning or DOM
* observation happens here: both start in `activate()` when the panel is
* opened and stop in `deactivate()` when it is closed.
* Exits early if the panel is already present (idempotent).
*/
function init() {
if (document.getElementById('__msg_scraper__')) return;
// --- UI ---
/**
 * Finds the element to scroll for the open conversation.
 *
 * Looks inside `[role="log"]` for the first descendant (in document order)
 * whose computed `overflow-y` is `auto` or `scroll` and whose content is taller
 * than its box. Falls back to the log element itself when no descendant
 * qualifies, and returns null when there is no log element at all.
 */
function getLog() {
  const log = document.querySelector('[role="log"]');
  if (!log) return null;
  const scroller = Array.from(log.querySelectorAll('*')).find((node) => {
    const { overflowY } = getComputedStyle(node);
    const scrollable = overflowY === 'auto' || overflowY === 'scroll';
    return scrollable && node.scrollHeight > node.clientHeight;
  });
  return scroller ?? log;
}
// Inline style shared by every button in the panel.
const buttonStyle = 'font-size:11px;cursor:pointer;background:#333;color:#ccc;border:none;padding:2px 6px;border-radius:3px';

/**
 * Builds one panel button with the shared style.
 * @param {string} label
 * @param {() => void} handler
 * @returns {HTMLElement}
 */
function makeButton(label, handler) {
  return createElement('button', { style: buttonStyle, onclick: handler }, label);
}

/**
 * Scrolls the conversation to its top or bottom, if a scroll container exists.
 * @param {boolean} toBottom
 */
function scrollLog(toBottom) {
  const log = getLog();
  if (log) log.scrollTop = toBottom ? log.scrollHeight : 0;
}

/** Copies the textarea to the clipboard, falling back to select + execCommand if the async API rejects. */
const copyAll = () => navigator.clipboard.writeText(ta.value).catch(() => {
  ta.select();
  document.execCommand('copy');
});

/** Empties the textarea, the in-memory state, and both status counters. */
const clearAll = () => {
  ta.value = '';
  pages = [];
  seen.clear();
  totalSpan.textContent = '0';
  pageSpan.textContent = '';
};

/** Shows or hides the panel; opening starts scraping, closing stops it. */
function togglePanel() {
  const opening = panel.style.display === 'none';
  if (opening) {
    panel.style.display = 'flex';
    dot.style.background = '#aaa';
    activate();
  } else {
    panel.style.display = 'none';
    dot.style.background = '#555';
    deactivate();
  }
}

// Status counters: total stored messages, and the per-scan page summary.
const totalSpan = createElement('span', {}, '0');
const pageSpan = createElement('span', {}, '');

// The textarea is both the output and the persistent state of the tool.
const ta = /** @type {HTMLTextAreaElement} */ (createElement('textarea', {
  style: 'flex:1;background:transparent;color:#ddd;font-size:11px;line-height:1.4;padding:8px;border:none;resize:none;outline:none;',
  spellcheck: false,
}));

// Second toolbar row: counters on the left, copy/clear on the right.
const statusRow = createElement(
  'div',
  { style: 'color:#666;font-size:10px;width:100%;padding:0 0 2px 0;display:flex;align-items:center;gap:6px;' },
  createElement('span', { style: 'flex:1' }, 'Messages: ', totalSpan, ' total', pageSpan),
  makeButton('copy', copyAll),
  makeButton('clear', clearAll),
);

// Toolbar: title, scroll shortcuts, re-attach, then the status row (wraps onto its own line).
const toolbar = createElement(
  'div',
  { style: 'padding:5px 8px;display:flex;gap:8px;align-items:center;border-bottom:1px solid #333;flex-shrink:0;flex-wrap:wrap;' },
  createElement('span', { style: 'color:#aaa;font-size:11px;flex:1' }, 'Conversation Exporter'),
  makeButton('↑', () => scrollLog(false)),
  makeButton('↓', () => scrollLog(true)),
  makeButton('re-attach', () => { deactivate(); activate(); }),
  statusRow,
);

// The panel starts hidden (display:none); the dot toggles it.
const panel = createElement(
  'div',
  {
    id: '__msg_scraper__',
    style: 'position:fixed;bottom:28px;right:4px;width:360px;height:480px;display:none;flex-direction:column;background:rgba(0,0,0,0.92);font-family:monospace;box-shadow:-2px -2px 12px rgba(0,0,0,0.6);z-index:99999;border-radius:4px;overflow:hidden;',
  },
  toolbar,
  ta,
);

const dot = createElement('div', {
  id: '__msg_dot__',
  style: 'position:fixed;bottom:4px;right:4px;width:10px;height:10px;border-radius:50%;background:#555;cursor:pointer;z-index:99999;',
  title: 'Conversation Exporter',
  onclick: togglePanel,
});

document.body.appendChild(dot);
document.body.appendChild(panel);
// --- State (rebuilt from textarea so it's the single source of truth) ---
// `pages`: contiguous runs of messages; in the textarea they are separated by
// blank lines. Each page holds message lines in chronological order, and the
// pages themselves are ordered by the timestamp of their first message.
/** @type {string[][]} */
let pages = [];
// `seen`: every stored message line, for O(1) "already have it" checks.
/** @type {Set<string>} */
const seen = new Set();
/**
 * Writes `pages` into the textarea: one message per line, with a blank line
 * between consecutive pages.
 */
function render() {
  const blocks = [];
  for (const page of pages) blocks.push(page.join('\n'));
  ta.value = blocks.join('\n\n');
}
/**
 * Rebuilds `pages` and `seen` from the current textarea content and refreshes
 * the total counter.
 *
 * The text is split into pages on blank lines and each page into its non-empty
 * lines. Pages that end up with no lines are discarded.
 */
function reloadFromTextarea() {
  pages = ta.value
    .split('\n\n')
    .map((block) => block.split('\n').filter(Boolean))
    // A chunk made only of newlines (e.g. the text ends with three newlines) has
    // no lines. Keeping it as an empty page would make addMessages' sort
    // comparator throw when it reads the first line of each page.
    .filter((page) => page.length > 0);
  seen.clear();
  for (const line of pages.flat()) seen.add(line);
  totalSpan.textContent = String(seen.size);
}
// Manual edits to the textarea are folded back into `pages`/`seen` when the
// user leaves the field, or explicitly with Cmd+Enter / Ctrl+Enter.
ta.addEventListener('blur', reloadFromTextarea);
ta.addEventListener('keydown', (/** @type {KeyboardEvent} */ event) => {
  const hasModifier = event.metaKey || event.ctrlKey;
  if (hasModifier && event.key === 'Enter') reloadFromTextarea();
});
/**
 * Refreshes the status line, which reads
 * "Messages: N total / P (+new -dropped) page".
 * N is the number of distinct stored lines; the rest comes from the arguments.
 * @param {number} newCount Lines added by the latest scan.
 * @param {number} pageSize Lines found in the DOM by the latest scan.
 * @param {number} dropped Stored lines discarded by the latest merge.
 */
function updateStatus(newCount, pageSize, dropped) {
  const total = seen.size;
  const summary = ` / ${pageSize} (+${newCount} -${dropped}) page`;
  totalSpan.textContent = `${total}`;
  pageSpan.textContent = summary;
}
// --- Collection ---
/**
* Reads the visible page from the DOM, merges it with any overlapping stored pages,
* re-sorts all pages by first-message timestamp, and re-renders.
*
* Definitions:
* - Page: a contiguous block of messages. In the textarea, pages are separated by
* blank lines.
* - Visible page: the page extracted from the current DOM state (what Messenger has
* rendered at this moment, regardless of how it fetched or cached it).
*
* Stored pages are always disjoint — every merge pass enforces this by collapsing
* overlapping pages into one. So the visible page can overlap with at most one stored
* page on its left side and one on its right side.
*
* Merge strategy (LCS-inspired), applied to each overlapping stored page:
* - Find the first and last line in the stored page that also appears in the visible page
* ("anchors"). The visible page is authoritative between them: stored lines not
* present in it are dropped — the primary case is a deleted or edited message; a
* secondary case is a sender name changing between renders (e.g. full name on first
* render, nickname on subsequent renders), which produces a new key and makes the
* old one appear absent.
* - Lines before firstAnchor are kept only if their timestamp predates the visible page
* (older scroll history). Lines at or within the visible page's range are dropped as stale.
* - Lines after lastAnchor are always kept (newer scroll history not in the visible page).
* - The merged page is: [kept pre-anchor lines] + [visible page] + [kept post-anchor lines].
*
* Finally, all resulting pages (merged page + any remaining non-overlapping pages) are
* sorted by first-message timestamp.
*
* Early exits: nothing is changed when the DOM yields no message lines, or when every
* DOM line is already stored. As a consequence, a scan that would only remove lines
* (no new ones) does not modify the stored pages.
*
* Timestamps are compared as strings via `line.slice(1, 17)`, i.e. the 16 characters
* after the leading "[" of a "[YYYY-MM-DD HH:MM] sender: text" line.
*/
function addMessages() {
// Re-derive `seen` from `pages` so the "is it new?" check reflects current state.
seen.clear();
for (const line of pages.flat()) seen.add(line);
// Collect every message currently rendered, in DOM order.
const els = document.querySelectorAll('[role="button"][aria-label^="Enter, Message sent"]');
const domLines = /** @type {string[]} */ ([]);
for (const el of Array.from(els)) {
const line = extractMessage(el);
if (line) domLines.push(line);
}
if (!domLines.length) return;
const newLines = domLines.filter(l => !seen.has(l));
// Nothing new on screen: leave stored pages, textarea and status untouched.
if (!newLines.length) return;
const domSet = new Set(domLines);
// Partition stored pages into those sharing at least one line with the DOM and the rest.
const overlapping = pages.filter(p => p.some(l => domSet.has(l)));
const rest = pages.filter(p => !p.some(l => domSet.has(l)));
// Stored lines that survive the merge, to be placed before / after the visible page.
/** @type {string[]} */ const before = [];
/** @type {string[]} */ const after = [];
let dropped = 0;
for (const page of overlapping) {
// Locate the first and last stored line that also appears in the DOM.
let firstAnchor = -1, lastAnchor = -1;
for (let i = 0; i < page.length; i++) {
if (domSet.has(page[i])) { if (firstAnchor === -1) firstAnchor = i; lastAnchor = i; }
}
// Defensive: pages in `overlapping` contain at least one DOM line, so an anchor exists.
if (firstAnchor === -1) continue;
for (let i = 0; i < firstAnchor; i++) {
// Drop stale early-injection duplicates whose timestamp falls within the DOM window.
// Kept only when strictly older than the first DOM line's timestamp.
if (page[i].slice(1, 17) < domLines[0].slice(1, 17)) before.push(page[i]); else dropped++;
}
// Between the anchors the DOM is authoritative; stored lines missing from it are only
// counted here — they disappear because `merged` below is built from `domLines`.
for (let i = firstAnchor; i <= lastAnchor; i++) { if (!domSet.has(page[i])) dropped++; }
for (let i = lastAnchor + 1; i < page.length; i++) after.push(page[i]);
}
const merged = [...before, ...domLines, ...after];
// Order pages by the timestamp of their first line.
// NOTE: the comparator reads a[0] / b[0], so it assumes every page is non-empty.
pages = [...rest, merged].sort((a, b) => {
const tsA = a[0].slice(1, 17), tsB = b[0].slice(1, 17);
return tsA < tsB ? -1 : tsA > tsB ? 1 : 0;
});
// Rebuild `seen` from the merged result, then publish to the textarea and status line.
seen.clear();
for (const line of pages.flat()) seen.add(line);
render();
updateStatus(newLines.length, domLines.length, dropped);
}
// --- Observer ---
/** Observer watching the conversation log while the panel is open. */
/** @type {MutationObserver | null} */
let observer = null;
/**
 * Handle of the interval that waits for `[role="log"]` to appear, or null when
 * no wait is in progress.
 * @type {ReturnType<typeof setInterval> | null}
 */
let logPoller = null;

/**
 * Starts re-scanning messages whenever the given log subtree changes.
 * @param {Element} log
 */
function attachObserver(log) {
  observer = new MutationObserver(() => addMessages());
  observer.observe(log, { childList: true, subtree: true });
}

/** Cancels a pending wait for the log element and disconnects the observer. */
function stopObserving() {
  if (logPoller !== null) {
    clearInterval(logPoller);
    logPoller = null;
  }
  if (observer) {
    observer.disconnect();
    observer = null;
  }
}

/**
 * Stops all scraping activity and clears the in-memory state.
 * This also cancels a still-pending wait for the log element, so nothing can
 * attach an observer after the panel has been closed.
 */
function deactivate() {
  stopObserving();
  pages = [];
  seen.clear();
}

/**
 * Loads state from the textarea, scans the messages currently rendered, and
 * starts observing the conversation log. If the log element does not exist
 * yet, polls for it every 500 ms and attaches (plus scans once) when it appears.
 * Safe to call repeatedly: any previous observer or pending poll is torn down first.
 */
function activate() {
  stopObserving();
  reloadFromTextarea();
  addMessages();
  const log = document.querySelector('[role="log"]');
  if (log) {
    attachObserver(log);
    return;
  }
  logPoller = setInterval(() => {
    const found = document.querySelector('[role="log"]');
    if (!found) return;
    if (logPoller !== null) {
      clearInterval(logPoller);
      logPoller = null;
    }
    attachObserver(found);
    addMessages();
  }, 500);
}
}
init();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment