Log arbitrary webpage text changes from all tabs to a JSONL file via a local server: browser UserScript and Python server. MIT license (c) Luke Stanley 2024
"""
# MIT License (c) Luke Stanley 2024
AIOHTTP server to save webpage text changes with timestamps.
Prints changes to the console and saves them to 'changes.jsonl' in the background.
Dependencies: aiohttp, asyncio
Install: pip install aiohttp
Run: python3 logging_server_aio.py
Endpoint:
1. POST /save_changes/ - Saves changes related to a webpage.
Example CURL Request:
curl -X POST http://localhost:8536/save_changes/ -H "Content-Type: application/json" -d '{"url":"http://example.com","changes":"Welcome to Example.com!"}'
Logging:
- The server logs incoming data in JSON Lines format to 'changes.jsonl'.
- Each log entry includes the URL, changes, and timestamps (either provided or automatically generated).
- Logs are flushed to the file periodically to reduce disk I/O overhead.
Configuration:
- LOG_FILE_PATH: Path to the JSONL file where logs are stored (default: 'changes.jsonl').
- FLUSH_INTERVAL: Interval in seconds to flush the logs to the file (default: 10 seconds).
Security:
- No authentication is implemented. This is intended for personal, local use only!
- Data is sent to the server in plaintext, so it is vulnerable to network sniffing tools.
"""
from aiohttp import web
import asyncio
import json
from datetime import datetime

# Configuration
LOG_FILE_PATH = "changes.jsonl"  # Matches the FastAPI implementation
FLUSH_INTERVAL = 10  # in seconds

# Asynchronous log queue
log_queue = asyncio.Queue()


async def handle_save_changes(request):
    try:
        # Parse incoming JSON data
        data = await request.json()
        # Ensure required fields are present
        if 'url' not in data or 'changes' not in data:
            return web.json_response({"error": "Missing 'url' or 'changes' field"}, status=400)
        # Add timestamps if not provided
        now = datetime.utcnow()
        if 'unix_timestamp' not in data:
            data['unix_timestamp'] = int(now.timestamp())
        if 'pretty_timestamp' not in data:
            data['pretty_timestamp'] = now.strftime("%Y %B %dth %A %H:%M")
        # Print log information to console (no Rich, so using plain print)
        print(f"{data['pretty_timestamp']} [URL]: {data['url']} ({len(data['changes'].strip())} chars)")
        print(f"Changes: {data['changes']}\n")
        # Queue the log data
        await log_queue.put(data)
        # Respond with 202 Accepted
        return web.json_response({}, status=202)
    except json.JSONDecodeError:
        return web.json_response({"error": "Invalid JSON data"}, status=400)
    except Exception as e:
        print(f"Error handling request: {e}")
        return web.json_response({"error": str(e)}, status=500)


async def log_writer():
    while True:
        logs_to_write = []
        while not log_queue.empty():
            logs_to_write.append(await log_queue.get())
        if logs_to_write:
            try:
                with open(LOG_FILE_PATH, "a") as f:
                    for log_entry in logs_to_write:
                        f.write(json.dumps(log_entry) + "\n")
            except Exception as e:
                print(f"Error writing logs to file: {e}")
        await asyncio.sleep(FLUSH_INTERVAL)  # Flush logs periodically


# Start the background task for logging
async def start_background_tasks(app):
    app['log_writer'] = asyncio.create_task(log_writer())


# Handle graceful shutdown
async def on_shutdown(app):
    # Wait until the log queue is fully processed
    while not log_queue.empty():
        await asyncio.sleep(0.1)
    app['log_writer'].cancel()
    try:
        await app['log_writer']
    except asyncio.CancelledError:
        pass


app = web.Application(client_max_size=1024**2 * 50)  # 50 MB limit
app.add_routes([web.post('/save_changes/', handle_save_changes), web.post('/save_changes', handle_save_changes)])
app.on_startup.append(start_background_tasks)
app.on_shutdown.append(on_shutdown)

if __name__ == "__main__":
    web.run_app(app, host="127.0.0.1", port=8536)
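For a quick check that the server and log file work before installing the userscript, the documented curl request can also be sent from Python. This is a minimal sketch using only the standard library; the payload fields match the docstring above, the file name is hypothetical, and the optional timestamp fields are left for the server to fill in.

# test_client.py - hypothetical helper, not part of the gist; assumes the server is running on localhost:8536
import json
import urllib.request

payload = {"url": "http://example.com", "changes": "Welcome to Example.com!"}
req = urllib.request.Request(
    "http://localhost:8536/save_changes/",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)  # the server replies with 202 Accepted on success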
// ==UserScript==
// @name Log webpage text changes to server in Markdown format
// @namespace http://tampermonkey.net/
// @version 1.0
// @description Logs webpage text changes in Markdown format to the console and shares with locally running server, batching changes to limit requests. MIT License (c) Luke Stanley 2024
// @author Luke Stanley
// @match *://*/*
// @grant GM_xmlhttpRequest
// ==/UserScript==
// I was frustrated by web apps preventing easy exporting and interoperability,
// and I also want to store a personal reference of what I was looking at to
// refresh my memory later. So I whipped this up with a minimal web server.
(function() {
    const LOGGING_SERVER_URL = "http://localhost:8536/save_changes/";
    const BATCH_SENDING_WAIT_PERIOD = 200; // Wait time in milliseconds.

    // Store previous states of text nodes
    const previousTextState = new WeakMap();

    // Queue for changes to be sent
    let changeQueue = [];
    let isSending = false;

    function escapeMarkdown(text) {
        return text.replace(/([_*[\]()])/g, '\\$1');
    }

    function isScriptContent(node) {
        while (node) {
            if (node.nodeType === Node.ELEMENT_NODE && node.tagName.toLowerCase() === 'script') {
                return true;
            }
            node = node.parentNode;
        }
        return false;
    }

    function convertToMarkdown(node) {
        if (isScriptContent(node)) {
            return ''; // Skip content inside or of <script> tags
        }
        let markdown = "";
        let tagName, href, src, alt, title;
        if (node.nodeType === Node.TEXT_NODE) {
            markdown += escapeMarkdown(node.textContent.trim());
        } else if (node.nodeType === Node.ELEMENT_NODE) {
            tagName = node.tagName.toLowerCase();
            switch (tagName) {
                case 'script':
                    return ''; // Early return for script tags
                case 'h1': case 'h2': case 'h3':
                case 'h4': case 'h5': case 'h6':
                    markdown += `\n\n${'#'.repeat(parseInt(tagName[1]))} ${escapeMarkdown(node.textContent.trim())}\n\n`;
                    break;
                case 'p':
                    markdown += `\n\n${escapeMarkdown(node.textContent.trim())}\n\n`;
                    break;
                case 'a':
                    href = node.getAttribute('href') || '#';
                    markdown += `[${escapeMarkdown(node.textContent.trim())}](${href})`;
                    break;
                case 'img':
                    src = node.getAttribute('src') || '';
                    alt = node.getAttribute('alt') || '';
                    title = node.getAttribute('title') || '';
                    markdown += `![${escapeMarkdown(alt)}](${src}${title ? ` "${escapeMarkdown(title)}"` : ''})\n\n`;
                    break;
                case 'ul':
                case 'ol':
                    markdown += `\n${handleList(node)}\n\n`;
                    break;
                case 'li':
                    markdown += handleListItem(node) + '\n';
                    break;
                case 'strong': case 'b':
                    markdown += `**${escapeMarkdown(node.textContent.trim())}**`;
                    break;
                case 'em': case 'i':
                    markdown += `*${escapeMarkdown(node.textContent.trim())}*`;
                    break;
                case 'blockquote':
                    markdown += `\n\n${node.textContent.trim().split('\n').map(line => `> ${line}`).join('\n')}\n\n`;
                    break;
                case 'code':
                    if (node.parentElement.tagName.toLowerCase() === 'pre') {
                        markdown += `\n\n\`\`\`\n${node.textContent.trim()}\n\`\`\`\n\n`;
                    } else {
                        markdown += `\`${escapeMarkdown(node.textContent.trim())}\``;
                    }
                    break;
                case 'hr':
                    markdown += '\n\n---\n\n';
                    break;
                case 'div':
                case 'section':
                case 'article':
                case 'header':
                case 'footer':
                    markdown += `\n\n${convertChildrenToMarkdown(node)}\n\n`;
                    break;
                default:
                    markdown += `\n\n${convertChildrenToMarkdown(node)}\n\n`;
            }
        }
        return markdown.trim();
    }

    // Convert the children of a node to Markdown
    function convertChildrenToMarkdown(node) {
        let markdown = '';
        node.childNodes.forEach(child => {
            if (!isScriptContent(child)) {
                markdown += convertToMarkdown(child) + '\n\n';
            }
        });
        return markdown.trim();
    }

    // Handle unordered and ordered lists
    function handleList(node) {
        let markdown = '';
        node.childNodes.forEach(child => {
            if (child.nodeType === Node.ELEMENT_NODE && child.tagName.toLowerCase() === 'li') {
                const prefix = node.tagName.toLowerCase() === 'ul' ? '- ' : '1. ';
                markdown += prefix + handleListItem(child) + '\n';
            }
        });
        return markdown.trim();
    }

    // Handle individual list items
    function handleListItem(node) {
        return escapeMarkdown(node.textContent.trim());
    }

    // Sends changes to logging server
    function sendPostRequest(url, changeText) {
        GM_xmlhttpRequest({
            method: "POST",
            url: LOGGING_SERVER_URL,
            headers: {
                "Content-Type": "application/json"
            },
            data: JSON.stringify({ url: url, changes: changeText }),
            onload: function(response) {
                console.log("Response:", response.responseText);
            },
            onerror: function(error) {
                console.error("Error:", error);
            }
        });
    }

    // Report changes to console log and server (with server logging batched)
    function reportChange(changeText) {
        console.log('\n\n' + changeText + '\n\n');
        changeQueue.push(changeText);
        // Batch and send changes after a while
        if (!isSending) {
            isSending = true;
            setTimeout(() => {
                const combinedChanges = changeQueue.join('\n\n');
                sendPostRequest(window.location.href, combinedChanges);
                changeQueue = [];
                isSending = false;
            }, BATCH_SENDING_WAIT_PERIOD);
        }
    }

    // Log changes in Markdown format
    function logChanges(mutationsList) {
        for (let mutation of mutationsList) {
            if (mutation.type === 'characterData') {
                if (!isScriptContent(mutation.target)) {
                    const newText = convertToMarkdown(mutation.target);
                    const oldText = previousTextState.get(mutation.target) || "";
                    if (newText !== oldText) {
                        reportChange(newText);
                        previousTextState.set(mutation.target, newText);
                    }
                }
            } else if (mutation.type === 'childList') {
                mutation.addedNodes.forEach(node => {
                    if (!isScriptContent(node)) {
                        const newText = convertToMarkdown(node);
                        if (newText.length > 0) {
                            reportChange(newText);
                        }
                    }
                });
            }
        }
    }

    // Initial log of visible text
    const initialMarkdown = convertToMarkdown(document.body);
    reportChange(initialMarkdown);

    // Watch for changes
    const observer = new MutationObserver(logChanges);
    const config = {
        attributes: false, // Ignore attribute changes
        childList: true,
        characterData: true,
        characterDataOldValue: true, // Capture the old text content
        subtree: true // Monitor the full subtree
    };

    // Start observing page changes
    observer.observe(document.body, config);
})();
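Entries accumulate in changes.jsonl as one JSON object per line, so they can be read back with a few lines of Python. A minimal sketch, assuming the default 'changes.jsonl' path and the fields the server writes (url, changes, and the two timestamps); the file name is hypothetical.

# read_log.py - hypothetical helper, not part of the gist
import json

with open("changes.jsonl") as f:
    for line in f:
        entry = json.loads(line)
        print(f"{entry['pretty_timestamp']} {entry['url']} ({len(entry['changes'])} chars)")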