Last active
April 29, 2026 09:05
-
-
Save szhu/44ae5aacafccbe35d793546ab2ec8c5d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // ==UserScript== | |
| // @name Messenger Scraper | |
| // @namespace https://github.com/szhu | |
| // @version 0.20260429.5 | |
| // @match https://www.facebook.com/messages/* | |
| // @match https://www.messenger.com/* | |
| // @grant none | |
| // @run-at document-idle | |
| // ==/UserScript== | |
| // @ts-check | |
| // Lint: tsc --checkJs --allowJs --noEmit --target ES2020 --lib ES2020,DOM --strict messenger-scraper.user.js | |
| // Product design | |
| // | |
| // - **What is this?** This is a script that exports Messenger conversations as | |
| // plain text. It scrapes message content, sender names, and timestamps from | |
| // the DOM as the user scrolls, accumulating a full conversation history | |
| // across multiple scroll sessions. | |
| // - **How to use:** Click the small dot in the bottom-right corner to toggle | |
| // the panel. The panel contains the textarea and controls. | |
| // - **The panel activates and deactivates the observer.** The observer doesn't | |
| // start until the panel is opened; it disconnects when closed. In-memory | |
| // state is cleared on close. Nothing runs on script load. | |
| // - **The textarea is the only persistent state.** No `localStorage`, no export | |
| // file. `pages` and `seen` are always derived from it. Copying the textarea | |
| // captures the full state; pasting it back restores it. | |
| // | |
| // Known limitations | |
| // | |
| // - **Duplicate messages when sender name changes.** Messenger sometimes renders | |
| // a sender's full name on the first load ("Jane Smith") and a nickname on | |
| // subsequent loads ("Jane"). Since the message key includes the sender name, | |
| // these appear as two different messages and both get stored. Pressing Clear | |
| // and re-scraping is the workaround. | |
| // - **Message key collides within one minute.** The message key is | |
| // `[timestamp] sender: text`, where the timestamp has one-minute resolution. | |
| // If the same person sends the exact same text twice within the same minute, | |
| // the second occurrence is silently dropped as a duplicate. This is rare in | |
| // practice. | |
| // | |
| // Non-obvious Messenger DOM facts | |
| // | |
| // - **Sender names differ between first and subsequent renders.** On the first | |
| // render of a conversation, Messenger uses full FB profile names ("Jane | |
| // Smith"); subsequent renders use nicknames or first names ("Jane"). Same | |
| // message, different string. | |
| // - **Navigating between conversations replaces the entire conversation DOM.** | |
| // The message container, scrollable element, and observer target all go stale. | |
| // - **Message class names are obfuscated and unstable.** The stable structure | |
| // for finding message text is positional: the message button's parent's | |
| // sibling element with non-empty inner text. | |
| // - **The scrollable container is a child of `[role="log"]`.** The log element | |
| // itself has `overflow:hidden`; the actual scrollable element is a descendant | |
| // with `overflow:auto` or `overflow:scroll`. | |
| // - **Weekday timestamps always refer to the past.** Messenger shows | |
| // "Tuesday 3:42pm" only for messages from the past 7 days that aren't | |
| // today — never for today's messages. | |
| (function () { | |
| 'use strict'; | |
| // --- Date parsing --- | |
/**
 * Rewrites a trailing am/pm marker as a space followed by the upper-case marker,
 * e.g. "11:43pm" -> "11:43 PM", so the result can be handed to `new Date()`.
 * Strings without a trailing am/pm marker are returned unchanged.
 * @param {string} t
 * @returns {string}
 */
function normalizeTime(t) {
  const meridiem = /\s*([ap]m)$/i.exec(t);
  if (meridiem === null) return t;
  // Everything before the (optional whitespace +) marker is the clock part.
  const clock = t.slice(0, meridiem.index);
  return `${clock} ${meridiem[1].toUpperCase()}`;
}
/** Lower-case three-letter month prefixes, indexed by `Date` month number (0-11). */
const MONTH_PREFIXES = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];

/** Lower-case weekday names, indexed by `Date#getDay()` (0 = Sunday). */
const WEEKDAY_NAMES = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'];

/**
 * Parses a 12-hour clock string such as "11:43pm" or "5:00 PM" into 24-hour fields.
 * Returns null when the string is not a clock time or its fields are out of range.
 * @param {string} clock
 * @returns {{ hours: number, minutes: number } | null}
 */
function parseClockTime(clock) {
  const m = clock.match(/^(\d+):(\d+)\s*([ap])m$/i);
  if (!m) return null;
  const hour12 = Number(m[1]);
  const minutes = Number(m[2]);
  if (hour12 < 1 || hour12 > 12 || minutes > 59) return null;
  // 12 AM is hour 0 and 12 PM is hour 12, hence the modulo before adding the PM offset.
  const pmOffset = m[3].toLowerCase() === 'p' ? 12 : 0;
  return { hours: (hour12 % 12) + pmOffset, minutes };
}

/**
 * Parses the three timestamp formats Facebook Messenger uses in aria-labels:
 * full dates ("April 21, 2026, 5:00 PM"), weekday-relative ("Tuesday 11:43pm"),
 * and time-only ("12:27 AM", resolved to the day of `now`). Returns null if the
 * string is unrecognized or describes an impossible date/time.
 *
 * All fields are extracted explicitly and passed to the numeric `Date`
 * constructor, so the result does not depend on how the host engine parses
 * non-ISO date strings.
 * @param {string} str
 * @param {Date} [now] Reference "current time" for the relative formats; defaults to the real current time.
 * @returns {Date | null}
 */
function parseMessengerDate(str, now = new Date()) {
  // `\s` also matches narrow no-break spaces (U+202F) etc.; normalize them all to plain spaces.
  const text = str.trim().replace(/\s/g, ' ');

  // "Month DD, YYYY, H:MM AM/PM" (the compact "H:MMam/pm" form is accepted too).
  const fullDate = text.match(/^([a-z]+) (\d{1,2}), (\d{4}), (\d+:\d+\s*[ap]m)$/i);
  if (fullDate) {
    const month = MONTH_PREFIXES.indexOf(fullDate[1].slice(0, 3).toLowerCase());
    const day = Number(fullDate[2]);
    const clock = parseClockTime(fullDate[4]);
    if (month !== -1 && clock) {
      const d = new Date(Number(fullDate[3]), month, day, clock.hours, clock.minutes);
      // The Date constructor rolls impossible days over ("February 31" -> March); reject those.
      if (d.getMonth() === month && d.getDate() === day) return d;
    }
  }

  // "Weekday H:MMam/pm": always a past day, so a matching or later weekday means last week.
  const weekdayMatch = text.match(/^(sunday|monday|tuesday|wednesday|thursday|friday|saturday)\s+(\d+:\d+\s*[ap]m)$/i);
  if (weekdayMatch) {
    const clock = parseClockTime(weekdayMatch[2]);
    if (clock) {
      const targetDay = WEEKDAY_NAMES.indexOf(weekdayMatch[1].toLowerCase());
      let daysBack = now.getDay() - targetDay;
      if (daysBack <= 0) daysBack += 7;
      // A day-of-month of zero or below rolls back into the previous month.
      return new Date(now.getFullYear(), now.getMonth(), now.getDate() - daysBack, clock.hours, clock.minutes);
    }
  }

  // "H:MM AM/PM" or "H:MMam/pm": a time on the same day as `now`.
  const timeOnly = parseClockTime(text);
  if (timeOnly) {
    return new Date(now.getFullYear(), now.getMonth(), now.getDate(), timeOnly.hours, timeOnly.minutes);
  }
  return null;
}
/**
 * Renders a Date as "YYYY-MM-DD HH:MM" using its local-time fields. Month, day,
 * hour and minute are zero-padded to two digits so the output is fixed-width
 * and sorts lexicographically. A missing date yields a same-width placeholder.
 * @param {Date | null} d
 * @returns {string}
 */
function formatDate(d) {
  if (!d) return '????-??-?? ??:??';
  /** @param {number} n */
  const twoDigits = (n) => String(n).padStart(2, '0');
  const day = [d.getFullYear(), twoDigits(d.getMonth() + 1), twoDigits(d.getDate())].join('-');
  const clock = [d.getHours(), d.getMinutes()].map((n) => twoDigits(n)).join(':');
  return `${day} ${clock}`;
}
| // --- Message extraction --- | |
/**
 * Extracts a formatted message line from a Messenger message element.
 *
 * The timestamp and sender come from the element's aria-label
 * ("Enter, Message sent <timestamp> by <sender>: ..."). The message text comes
 * from the content element: the first sibling of the button's parent that is
 * an HTMLElement with non-empty inner text. When the first line of that text
 * equals the sender (the name shown above the first message of a group) it is
 * stripped.
 *
 * Each line is trimmed and blank lines are discarded before the lines are
 * joined with tabs. This keeps the result free of leading/trailing tabs, so a
 * message yields the same line whether or not the sender name (and any blank
 * line after it) was rendered above it.
 *
 * Returns null if the element isn't a recognizable message or has no text.
 * @param {Element} el
 * @returns {string | null}
 */
function extractMessage(el) {
  const label = el.getAttribute('aria-label') || '';
  const m = label.match(/^Enter, Message sent (.+?) by (.+?): /);
  if (!m) return null;
  const date = formatDate(parseMessengerDate(m[1]));
  const sender = m[2].trim();

  // Class names are not relied on; the content element is located by position.
  const wrapper = el.parentElement;
  const siblings = Array.from(wrapper?.parentElement?.children || []);
  const contentDiv = siblings.find(
    (s) => s !== wrapper && s instanceof HTMLElement && s.innerText?.trim(),
  );
  const raw = contentDiv instanceof HTMLElement ? contentDiv.innerText?.trim() : '';

  const lines = raw
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line !== '');
  if (lines[0] === sender) lines.shift();

  // Collapse tab runs so literal tabs inside a line cannot create empty "columns".
  const text = lines.join('\t').replace(/\t+/g, '\t');
  if (!text) return null;
  return `[${date}] ${sender}: ${text}`;
}
/**
 * Small DOM factory: creates `tag`, applies `props`, then appends `children`.
 *
 * Prop handling, per key:
 * - keys starting with "on" whose value is a function are registered with
 *   addEventListener (event name = the rest of the key, lower-cased);
 * - "style" is written to `style.cssText`;
 * - every other key is assigned directly onto the element.
 *
 * Children may be strings (turned into text nodes), Nodes, or arrays of either;
 * arrays are flattened one level.
 * @param {string} tag
 * @param {Record<string, any>} [props]
 * @param {...(Node | string | (Node | string)[])} children
 * @returns {HTMLElement}
 */
function createElement(tag, props = {}, ...children) {
  const el = document.createElement(tag);

  Object.keys(props).forEach((key) => {
    const value = props[key];
    const isListener = key.startsWith('on') && typeof value === 'function';
    if (isListener) {
      el.addEventListener(key.slice(2).toLowerCase(), value);
      return;
    }
    if (key === 'style') {
      el.style.cssText = value;
      return;
    }
    Object.assign(el, { [key]: value });
  });

  const flatChildren = /** @type {(Node | string)[]} */ (children.flat());
  flatChildren.forEach((child) => {
    const node = typeof child === 'string' ? document.createTextNode(child) : child;
    el.appendChild(node);
  });

  return el;
}
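| /** | |
| * Builds the UI (the toggle dot and the initially hidden panel) and wires up | |
| * state and event handlers. Nothing is scanned or observed here: the first | |
| * message scan and the MutationObserver start only when the panel is opened | |
| * (see `activate`), and stop again when it is closed (see `deactivate`). | |
| * Exits early if the panel is already present (idempotent). | |
| */ | |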
| function init() { | |
| if (document.getElementById('__msg_scraper__')) return; | |
| // --- UI --- | |
/**
 * Locates the element to scroll for the current conversation.
 *
 * Looks inside `[role="log"]` for the first descendant (in document order)
 * whose computed `overflow-y` is `auto` or `scroll` and whose content is
 * taller than its box. Falls back to the log element itself when no such
 * descendant exists, and returns null when there is no log element at all.
 */
function getLog() {
  const log = document.querySelector('[role="log"]');
  if (!log) return null;

  const scrollable = Array.from(log.querySelectorAll('*')).find((el) => {
    const { overflowY } = getComputedStyle(el);
    const canScroll = overflowY === 'auto' || overflowY === 'scroll';
    return canScroll && el.scrollHeight > el.clientHeight;
  });
  return scrollable ?? log;
}
// Inline style shared by every small control button in the panel header.
const buttonCss = 'font-size:11px;cursor:pointer;background:#333;color:#ccc;border:none;padding:2px 6px;border-radius:3px';

/**
 * Builds one header button that runs `handler` when clicked.
 * @param {string} label
 * @param {() => void} handler
 * @returns {HTMLElement}
 */
function makeButton(label, handler) {
  return createElement('button', { style: buttonCss, onclick: handler }, label);
}

// --- Button handlers ---

/** Scrolls the conversation to its top (oldest rendered messages). */
function scrollLogToTop() {
  const log = getLog();
  if (log) log.scrollTop = 0;
}

/** Scrolls the conversation to its bottom (newest messages). */
function scrollLogToBottom() {
  const log = getLog();
  if (log) log.scrollTop = log.scrollHeight;
}

/** Drops the current observer and in-memory state, then starts over from the textarea. */
function reattach() {
  deactivate();
  activate();
}

/** Copies the textarea via the async clipboard API, falling back to select + execCommand if that rejects. */
function copyTranscript() {
  navigator.clipboard.writeText(ta.value).catch(() => {
    ta.select();
    document.execCommand('copy');
  });
}

/** Empties the textarea, the in-memory pages/seen state, and the status counters. */
function clearTranscript() {
  ta.value = '';
  pages = [];
  seen.clear();
  totalSpan.textContent = '0';
  pageSpan.textContent = '';
}

/** Shows or hides the panel; opening activates collection, closing deactivates it. */
function togglePanel() {
  const opening = panel.style.display === 'none';
  if (opening) {
    panel.style.display = 'flex';
    dot.style.background = '#aaa';
    activate();
  } else {
    panel.style.display = 'none';
    dot.style.background = '#555';
    deactivate();
  }
}

// --- Elements ---

// Status counters: total stored messages, and details about the last merged page.
const totalSpan = createElement('span', {}, '0');
const pageSpan = createElement('span', {}, '');

// The textarea holds the exported conversation text.
const ta = /** @type {HTMLTextAreaElement} */ (createElement('textarea', {
  style: 'flex:1;background:transparent;color:#ddd;font-size:11px;line-height:1.4;padding:8px;border:none;resize:none;outline:none;',
  spellcheck: false,
}));

const titleSpan = createElement('span', { style: 'color:#aaa;font-size:11px;flex:1' }, 'Conversation Exporter');

// Second header row: "Messages: N total ..." plus the copy/clear buttons.
const statusRow = createElement(
  'div',
  { style: 'color:#666;font-size:10px;width:100%;padding:0 0 2px 0;display:flex;align-items:center;gap:6px;' },
  createElement('span', { style: 'flex:1' }, 'Messages: ', totalSpan, ' total', pageSpan),
  makeButton('copy', copyTranscript),
  makeButton('clear', clearTranscript),
);

const headerRow = createElement(
  'div',
  { style: 'padding:5px 8px;display:flex;gap:8px;align-items:center;border-bottom:1px solid #333;flex-shrink:0;flex-wrap:wrap;' },
  titleSpan,
  makeButton('↑', scrollLogToTop),
  makeButton('↓', scrollLogToBottom),
  makeButton('re-attach', reattach),
  statusRow,
);

// The panel starts hidden (display:none); the dot below toggles it.
const panel = createElement(
  'div',
  {
    id: '__msg_scraper__',
    style: 'position:fixed;bottom:28px;right:4px;width:360px;height:480px;display:none;flex-direction:column;background:rgba(0,0,0,0.92);font-family:monospace;box-shadow:-2px -2px 12px rgba(0,0,0,0.6);z-index:99999;border-radius:4px;overflow:hidden;',
  },
  headerRow,
  ta,
);

const dot = createElement('div', {
  id: '__msg_dot__',
  style: 'position:fixed;bottom:4px;right:4px;width:10px;height:10px;border-radius:50%;background:#555;cursor:pointer;z-index:99999;',
  title: 'Conversation Exporter',
  onclick: togglePanel,
});

for (const node of [dot, panel]) document.body.appendChild(node);
| // --- State (rebuilt from textarea so it's the single source of truth) --- | |
| // Pages are contiguous runs of messages separated by blank lines. | |
| // Each page is an array of message strings in correct chronological order. | |
| // Pages themselves are sorted by their first message's timestamp. | |
/**
 * Stored pages, each an array of message lines.
 * @type {string[][]}
 */
let pages = [];
/**
 * Every message line currently held in `pages`, for fast duplicate checks.
 * @type {Set<string>}
 */
const seen = new Set();
/**
 * Writes `pages` into the textarea: one message per line, with a single blank
 * line between consecutive pages.
 */
function render() {
  /** @type {string[]} */
  const blocks = [];
  for (const page of pages) {
    blocks.push(page.join('\n'));
  }
  ta.value = blocks.join('\n\n');
}
/**
 * Rebuilds `pages` and `seen` from the current textarea content and refreshes
 * the total counter.
 *
 * Pages are separated by one or more blank lines; a line containing only
 * whitespace counts as blank. Empty pages are never produced, so stray
 * newlines in the textarea (a lone "\n", several trailing newlines, ...)
 * cannot yield a page without a first message. Code that sorts pages by their
 * first line depends on that.
 */
function reloadFromTextarea() {
  pages = ta.value
    .split(/\n\s*\n/)
    .map((block) => block.split('\n').filter((line) => line.trim() !== ''))
    .filter((page) => page.length > 0);
  seen.clear();
  for (const line of pages.flat()) seen.add(line);
  totalSpan.textContent = String(seen.size);
}
// Manual edits to the textarea take effect when it loses focus...
ta.addEventListener('blur', reloadFromTextarea);

/**
 * ...or immediately when the user presses Cmd+Enter / Ctrl+Enter.
 * @param {KeyboardEvent} event
 */
function reloadOnModifiedEnter(event) {
  const hasModifier = event.metaKey || event.ctrlKey;
  if (event.key === 'Enter' && hasModifier) reloadFromTextarea();
}
ta.addEventListener('keydown', reloadOnModifiedEnter);
/**
 * Refreshes the status line so it reads
 * "Messages: N total / P (+new -dropped) page", where N is the number of
 * distinct stored lines (`seen.size`).
 * @param {number} newCount Lines in the visible page that were not stored before.
 * @param {number} pageSize Number of lines in the visible page.
 * @param {number} dropped Stored lines discarded by the merge.
 */
function updateStatus(newCount, pageSize, dropped) {
  const pageSummary = [` / ${pageSize}`, `(+${newCount}`, `-${dropped})`, 'page'].join(' ');
  totalSpan.textContent = String(seen.size);
  pageSpan.textContent = pageSummary;
}
| // --- Collection --- | |
/**
 * Reads the visible page from the DOM, merges it with any overlapping stored pages,
 * re-sorts all pages by first-message timestamp, and re-renders.
 *
 * Definitions:
 * - Page: a contiguous block of messages. In the textarea, pages are separated by
 *   blank lines.
 * - Visible page: the page extracted from the current DOM state (what Messenger has
 *   rendered at this moment).
 *
 * Does nothing (no render, no status update) when the DOM yields no message lines or
 * when every visible line is already stored.
 *
 * Merge strategy, applied to each stored page that shares at least one line with the
 * visible page:
 * - Find the first and last line of the stored page that also appear in the visible
 *   page ("anchors"). The visible page is authoritative between them: stored lines
 *   in that range that are absent from it are counted as dropped (e.g. a deleted or
 *   edited message, or the same message re-rendered with a different sender name).
 * - Lines before the first anchor are kept only if their timestamp is strictly older
 *   than the visible page's first line; otherwise they are dropped as stale.
 * - Lines after the last anchor are always kept.
 * - The merged page is [kept pre-anchor lines] + [visible page] + [kept post-anchor
 *   lines], pooled across all overlapping pages.
 *
 * Finally the merged page and the untouched pages are sorted by the timestamp of
 * their first line. Empty stored pages are ignored up front: they hold no messages
 * and have no first line to sort by, so the sort would otherwise throw on them.
 */
function addMessages() {
  /**
   * The fixed-width "YYYY-MM-DD HH:MM" stamp that follows the leading "[".
   * @param {string} line
   */
  const stampOf = (line) => line.slice(1, 17);

  const storedPages = pages.filter((page) => page.length > 0);

  seen.clear();
  for (const line of storedPages.flat()) seen.add(line);

  // Collect the visible page in DOM order.
  const els = document.querySelectorAll('[role="button"][aria-label^="Enter, Message sent"]');
  /** @type {string[]} */
  const domLines = [];
  for (const el of Array.from(els)) {
    const line = extractMessage(el);
    if (line) domLines.push(line);
  }
  if (!domLines.length) return;
  const newLines = domLines.filter((line) => !seen.has(line));
  if (!newLines.length) return;

  const domSet = new Set(domLines);
  /** @type {string[][]} */
  const overlapping = [];
  /** @type {string[][]} */
  const rest = [];
  for (const page of storedPages) {
    const target = page.some((line) => domSet.has(line)) ? overlapping : rest;
    target.push(page);
  }

  /** @type {string[]} */
  const before = [];
  /** @type {string[]} */
  const after = [];
  let dropped = 0;
  const firstDomStamp = stampOf(domLines[0]);

  for (const page of overlapping) {
    let firstAnchor = -1;
    let lastAnchor = -1;
    for (let i = 0; i < page.length; i++) {
      if (domSet.has(page[i])) {
        if (firstAnchor === -1) firstAnchor = i;
        lastAnchor = i;
      }
    }
    if (firstAnchor === -1) continue;

    for (let i = 0; i < firstAnchor; i++) {
      // Pre-anchor lines stamped at or after the visible page's start are stale duplicates.
      if (stampOf(page[i]) < firstDomStamp) before.push(page[i]);
      else dropped++;
    }
    for (let i = firstAnchor; i <= lastAnchor; i++) {
      if (!domSet.has(page[i])) dropped++;
    }
    for (let i = lastAnchor + 1; i < page.length; i++) after.push(page[i]);
  }

  const merged = [...before, ...domLines, ...after];
  pages = [...rest, merged].sort((a, b) => {
    const tsA = stampOf(a[0]);
    const tsB = stampOf(b[0]);
    if (tsA < tsB) return -1;
    return tsA > tsB ? 1 : 0;
  });

  seen.clear();
  for (const line of pages.flat()) seen.add(line);
  render();
  updateStatus(newLines.length, domLines.length, dropped);
}
| // --- Observer --- | |
/**
 * Observer attached to the `[role="log"]` element while collection is active.
 * @type {MutationObserver | null}
 */
let observer = null;
/**
 * Interval id of the poll that waits for `[role="log"]` to appear, if one is running.
 * @type {ReturnType<typeof setInterval> | null}
 */
let logPoller = null;

/** Stops everything that can trigger `addMessages`: the pending poll and the observer. */
function stopObserving() {
  if (logPoller !== null) {
    clearInterval(logPoller);
    logPoller = null;
  }
  if (observer) {
    observer.disconnect();
    observer = null;
  }
}

/**
 * Starts re-scanning messages whenever nodes are added to or removed from `log`.
 * @param {Element} log
 */
function observeLog(log) {
  observer = new MutationObserver(() => addMessages());
  observer.observe(log, { childList: true, subtree: true });
}

/**
 * Stops collection and clears the in-memory state. A poll still waiting for
 * the log element is cancelled as well, so nothing can attach afterwards.
 */
function deactivate() {
  stopObserving();
  pages = [];
  seen.clear();
}

/**
 * Starts collection: rebuilds state from the textarea, scans the messages
 * currently in the DOM, and observes the log element for changes. If the log
 * element does not exist yet, polls every 500 ms until it does, then scans
 * and attaches. Any previous observer or poll is torn down first, so repeated
 * calls never stack observers or timers.
 */
function activate() {
  stopObserving();
  reloadFromTextarea();
  addMessages();

  const log = document.querySelector('[role="log"]');
  if (log) {
    observeLog(log);
    return;
  }

  logPoller = setInterval(() => {
    const found = document.querySelector('[role="log"]');
    if (!found) return;
    stopObserving(); // ends this poll
    observeLog(found);
    addMessages();
  }, 500);
}
| } | |
| init(); | |
| })(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment