Log arbitrary webpage text changes from all tabs to a JSONL file via a local server: browser UserScript and Python server. MIT license (c) Luke Stanley 2024
"""
# MIT License (c) Luke Stanley 2024
AIOHTTP server to save webpage text changes with timestamps.
Prints changes to the console and saves them to 'changes.jsonl' in the background.
Dependencies: aiohttp, asyncio
Install: pip install aiohttp
Run: python3 logging_server_aio.py
Endpoint:
1. POST /save_changes/ - Saves changes related to a webpage.
Example CURL Request:
curl -X POST http://localhost:8536/save_changes/ -H "Content-Type: application/json" -d '{"url":"http://example.com","changes":"Welcome to Example.com!"}'
Logging:
- The server logs incoming data in JSON Lines format to 'changes.jsonl'.
- Each log entry includes the URL, changes, and timestamps (either provided or automatically generated).
- Logs are flushed to the file periodically to reduce disk I/O overhead.
Configuration:
- LOG_FILE_PATH: Path to the JSONL file where logs are stored (default: 'changes.jsonl').
- FLUSH_INTERVAL: Interval in seconds to flush the logs to the file (default: 10 seconds).
Security:
- No authentication is implemented. This is intended for personal, local use only!
- Data is sent to the server in plaintext, so it is vulnerable to network sniffing tools.
"""
from aiohttp import web
import asyncio
import json
from datetime import datetime

# Configuration
LOG_FILE_PATH = "changes.jsonl"  # Matches the FastAPI implementation
FLUSH_INTERVAL = 10  # in seconds

# Asynchronous log queue
log_queue = asyncio.Queue()


async def handle_save_changes(request):
    try:
        # Parse incoming JSON data
        data = await request.json()
        # Ensure required fields are present
        if 'url' not in data or 'changes' not in data:
            return web.json_response({"error": "Missing 'url' or 'changes' field"}, status=400)
        # Add timestamps if not provided
        now = datetime.utcnow()
        if 'unix_timestamp' not in data:
            data['unix_timestamp'] = int(now.timestamp())
        if 'pretty_timestamp' not in data:
            data['pretty_timestamp'] = now.strftime("%Y %B %dth %A %H:%M")
        # Print log information to console (no Rich, so using plain print)
        print(f"{data['pretty_timestamp']} [URL]: {data['url']} ({len(data['changes'].strip())} chars)")
        print(f"Changes: {data['changes']}\n")
        # Queue the log data
        await log_queue.put(data)
        # Respond with 202 Accepted
        return web.json_response({}, status=202)
    except json.JSONDecodeError:
        return web.json_response({"error": "Invalid JSON data"}, status=400)
    except Exception as e:
        print(f"Error handling request: {e}")
        return web.json_response({"error": str(e)}, status=500)


async def log_writer():
    while True:
        logs_to_write = []
        while not log_queue.empty():
            logs_to_write.append(await log_queue.get())
        if logs_to_write:
            try:
                with open(LOG_FILE_PATH, "a") as f:
                    for log_entry in logs_to_write:
                        f.write(json.dumps(log_entry) + "\n")
            except Exception as e:
                print(f"Error writing logs to file: {e}")
        await asyncio.sleep(FLUSH_INTERVAL)  # Flush logs periodically


# Start the background task for logging
async def start_background_tasks(app):
    app['log_writer'] = asyncio.create_task(log_writer())


# Handle graceful shutdown
async def on_shutdown(app):
    # Wait until the log queue is fully processed
    while not log_queue.empty():
        await asyncio.sleep(0.1)
    app['log_writer'].cancel()
    try:
        await app['log_writer']
    except asyncio.CancelledError:
        pass


app = web.Application(client_max_size=1024**2 * 50)  # 50 MB limit
app.add_routes([web.post('/save_changes/', handle_save_changes), web.post('/save_changes', handle_save_changes)])
app.on_startup.append(start_background_tasks)
app.on_shutdown.append(on_shutdown)

if __name__ == "__main__":
    web.run_app(app, host="127.0.0.1", port=8536)
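For a quick check that the server and log file work before installing the userscript, the documented curl request can also be sent from Python. This is a minimal sketch using only the standard library; the payload fields match the docstring above, the file name is hypothetical, and the optional timestamp fields are left for the server to fill in.

# test_client.py - hypothetical helper, not part of the gist; assumes the server is running on localhost:8536
import json
import urllib.request

payload = {"url": "http://example.com", "changes": "Welcome to Example.com!"}
req = urllib.request.Request(
    "http://localhost:8536/save_changes/",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)  # the server replies with 202 Accepted on success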
// ==UserScript==
// @name Log webpage text changes to server in Markdown format
// @namespace http://tampermonkey.net/
// @version 1.0
// @description Logs webpage text changes in Markdown format to the console and shares with locally running server, batching changes to limit requests. MIT License (c) Luke Stanley 2024
// @author Luke Stanley
// @match *://*/*
// @grant GM_xmlhttpRequest
// ==/UserScript==
// I was frustrated by web apps preventing easy exporting and interoperability,
// and I also want to store a personal reference of what I was looking at to
// refresh my memory later. So I whipped this up with a minimal web server.
(function() {
    const LOGGING_SERVER_URL = "http://localhost:8536/save_changes/";
    const BATCH_SENDING_WAIT_PERIOD = 200; // Wait time in milliseconds.

    // Store previous states of text nodes
    const previousTextState = new WeakMap();

    // Queue for changes to be sent
    let changeQueue = [];
    let isSending = false;

    function escapeMarkdown(text) {
        return text.replace(/([_*[\]()])/g, '\\$1');
    }

    function isScriptContent(node) {
        while (node) {
            if (node.nodeType === Node.ELEMENT_NODE && node.tagName.toLowerCase() === 'script') {
                return true;
            }
            node = node.parentNode;
        }
        return false;
    }

    function convertToMarkdown(node) {
        if (isScriptContent(node)) {
            return ''; // Skip content inside or of <script> tags
        }
        let markdown = "";
        let tagName, href, src, alt, title;
        if (node.nodeType === Node.TEXT_NODE) {
            markdown += escapeMarkdown(node.textContent.trim());
        } else if (node.nodeType === Node.ELEMENT_NODE) {
            tagName = node.tagName.toLowerCase();
            switch (tagName) {
                case 'script':
                    return ''; // Early return for script tags
                case 'h1': case 'h2': case 'h3':
                case 'h4': case 'h5': case 'h6':
                    markdown += `\n\n${'#'.repeat(parseInt(tagName[1]))} ${escapeMarkdown(node.textContent.trim())}\n\n`;
                    break;
                case 'p':
                    markdown += `\n\n${escapeMarkdown(node.textContent.trim())}\n\n`;
                    break;
                case 'a':
                    href = node.getAttribute('href') || '#';
                    markdown += `[${escapeMarkdown(node.textContent.trim())}](${href})`;
                    break;
                case 'img':
                    src = node.getAttribute('src') || '';
                    alt = node.getAttribute('alt') || '';
                    title = node.getAttribute('title') || '';
                    markdown += `![${escapeMarkdown(alt)}](${src}${title ? ` "${escapeMarkdown(title)}"` : ''})\n\n`;
                    break;
                case 'ul':
                case 'ol':
                    markdown += `\n${handleList(node)}\n\n`;
                    break;
                case 'li':
                    markdown += handleListItem(node) + '\n';
                    break;
                case 'strong': case 'b':
                    markdown += `**${escapeMarkdown(node.textContent.trim())}**`;
                    break;
                case 'em': case 'i':
                    markdown += `*${escapeMarkdown(node.textContent.trim())}*`;
                    break;
                case 'blockquote':
                    markdown += `\n\n${node.textContent.trim().split('\n').map(line => `> ${line}`).join('\n')}\n\n`;
                    break;
                case 'code':
                    if (node.parentElement.tagName.toLowerCase() === 'pre') {
                        markdown += `\n\n\`\`\`\n${node.textContent.trim()}\n\`\`\`\n\n`;
                    } else {
                        markdown += `\`${escapeMarkdown(node.textContent.trim())}\``;
                    }
                    break;
                case 'hr':
                    markdown += '\n\n---\n\n';
                    break;
                case 'div':
                case 'section':
                case 'article':
                case 'header':
                case 'footer':
                    markdown += `\n\n${convertChildrenToMarkdown(node)}\n\n`;
                    break;
                default:
                    markdown += `\n\n${convertChildrenToMarkdown(node)}\n\n`;
            }
        }
        return markdown.trim();
    }

    // Convert the children of a node to Markdown
    function convertChildrenToMarkdown(node) {
        let markdown = '';
        node.childNodes.forEach(child => {
            if (!isScriptContent(child)) {
                markdown += convertToMarkdown(child) + '\n\n';
            }
        });
        return markdown.trim();
    }

    // Handle unordered and ordered lists
    function handleList(node) {
        let markdown = '';
        node.childNodes.forEach(child => {
            if (child.nodeType === Node.ELEMENT_NODE && child.tagName.toLowerCase() === 'li') {
                const prefix = node.tagName.toLowerCase() === 'ul' ? '- ' : '1. ';
                markdown += prefix + handleListItem(child) + '\n';
            }
        });
        return markdown.trim();
    }

    // Handle individual list items
    function handleListItem(node) {
        return escapeMarkdown(node.textContent.trim());
    }

    // Sends changes to logging server
    function sendPostRequest(url, changeText) {
        GM_xmlhttpRequest({
            method: "POST",
            url: LOGGING_SERVER_URL,
            headers: {
                "Content-Type": "application/json"
            },
            data: JSON.stringify({ url: url, changes: changeText }),
            onload: function(response) {
                console.log("Response:", response.responseText);
            },
            onerror: function(error) {
                console.error("Error:", error);
            }
        });
    }

    // Report changes to console log and server (with server logging batched)
    function reportChange(changeText) {
        console.log('\n\n' + changeText + '\n\n');
        changeQueue.push(changeText);
        // Batch and send changes after a while
        if (!isSending) {
            isSending = true;
            setTimeout(() => {
                const combinedChanges = changeQueue.join('\n\n');
                sendPostRequest(window.location.href, combinedChanges);
                changeQueue = [];
                isSending = false;
            }, BATCH_SENDING_WAIT_PERIOD);
        }
    }

    // Log changes in Markdown format
    function logChanges(mutationsList) {
        for (let mutation of mutationsList) {
            if (mutation.type === 'characterData') {
                if (!isScriptContent(mutation.target)) {
                    const newText = convertToMarkdown(mutation.target);
                    const oldText = previousTextState.get(mutation.target) || "";
                    if (newText !== oldText) {
                        reportChange(newText);
                        previousTextState.set(mutation.target, newText);
                    }
                }
            } else if (mutation.type === 'childList') {
                mutation.addedNodes.forEach(node => {
                    if (!isScriptContent(node)) {
                        const newText = convertToMarkdown(node);
                        if (newText.length > 0) {
                            reportChange(newText);
                        }
                    }
                });
            }
        }
    }

    // Initial log of visible text
    const initialMarkdown = convertToMarkdown(document.body);
    reportChange(initialMarkdown);

    // Watch for changes
    const observer = new MutationObserver(logChanges);
    const config = {
        attributes: false, // Ignore attribute changes
        childList: true,
        characterData: true,
        characterDataOldValue: true, // Capture the old text content
        subtree: true // Monitor the full subtree
    };

    // Start observing page changes
    observer.observe(document.body, config);
})();
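Entries accumulate in changes.jsonl as one JSON object per line, so they can be read back with a few lines of Python. A minimal sketch, assuming the default 'changes.jsonl' path and the fields the server writes (url, changes, and the two timestamps); the file name is hypothetical.

# read_log.py - hypothetical helper, not part of the gist
import json

with open("changes.jsonl") as f:
    for line in f:
        entry = json.loads(line)
        print(f"{entry['pretty_timestamp']} {entry['url']} ({len(entry['changes'])} chars)")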