Last active
October 21, 2024 22:32
-
-
Save lukestanley/c3a37ab61a45e72b74995a5cb7d20f70 to your computer and use it in GitHub Desktop.
Log arbitrary webpage text changes to JSONL file from all tabs to local server: Browser UserScript and Python server. MIT license (c) Luke Stanley 2024
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
# MIT License (c) Luke Stanley 2024 | |
AIOHTTP server to save webpage text changes with timestamps. | |
Prints changes to the console and saves them to 'changes.jsonl' in the background. | |
Dependencies: aiohttp, asyncio | |
Install: pip install aiohttp | |
Run: python3 logging_server_aio.py | |
Endpoint: | |
1. POST /save_changes/ - Saves changes related to a webpage. | |
Example CURL Request: | |
curl -X POST http://localhost:8536/save_changes/ -H "Content-Type: application/json" -d '{"url":"http://example.com","changes":"Welcome to Example.com!"}' | |
Logging: | |
- The server logs incoming data in JSON Lines format to 'changes.jsonl'. | |
- Each log entry includes the URL, changes, and timestamps (either provided or automatically generated). | |
- Logs are flushed to the file periodically to reduce disk I/O overhead. | |
Configuration: | |
- LOG_FILE_PATH: Path to the JSONL file where logs are stored (default: 'changes.jsonl'). | |
- FLUSH_INTERVAL: Interval in seconds to flush the logs to the file (default: 10 seconds). | |
Security: | |
- No authentication is implemented. This is intended for internal personal, local use! | |
- Data sent to the server is in plaintext and vulnerable to sniffing tools. | |
""" | |
import asyncio
import json
from datetime import datetime, timezone

from aiohttp import web
# Configuration
LOG_FILE_PATH = "changes.jsonl"  # Matches the FastAPI implementation
FLUSH_INTERVAL = 10  # Seconds between flushes of queued entries to disk

# Queue shared between the request handler (producer) and the
# background writer task (consumer).
log_queue = asyncio.Queue()
async def handle_save_changes(request): | |
try: | |
# Parse incoming JSON data | |
data = await request.json() | |
# Ensure required fields are present | |
if 'url' not in data or 'changes' not in data: | |
return web.json_response({"error": "Missing 'url' or 'changes' field"}, status=400) | |
# Add timestamps if not provided | |
now = datetime.utcnow() | |
if 'unix_timestamp' not in data: | |
data['unix_timestamp'] = int(now.timestamp()) | |
if 'pretty_timestamp' not in data: | |
data['pretty_timestamp'] = now.strftime("%Y %B %dth %A %H:%M") | |
# Print log information to console (no Rich, so using plain print) | |
print(f"{data['pretty_timestamp']} [URL]: {data['url']} ({len(data['changes'].strip())} chars)") | |
print(f"Changes: {data['changes']}\n") | |
# Queue the log data | |
await log_queue.put(data) | |
# Respond with 202 Accepted | |
return web.json_response({}, status=202) | |
except json.JSONDecodeError: | |
return web.json_response({"error": "Invalid JSON data"}, status=400) | |
except Exception as e: | |
print(f"Error handling request: {e}") | |
return web.json_response({"error": str(e)}, status=500) | |
async def log_writer():
    """Background task: drain the queue and append JSON Lines periodically.

    Runs until cancelled (see on_shutdown). Batching the writes every
    FLUSH_INTERVAL seconds keeps disk I/O overhead low.
    """
    while True:
        # Collect everything queued since the previous flush.
        pending = []
        while not log_queue.empty():
            pending.append(await log_queue.get())

        if pending:
            try:
                with open(LOG_FILE_PATH, "a") as log_file:
                    log_file.writelines(json.dumps(entry) + "\n" for entry in pending)
            except Exception as exc:
                print(f"Error writing logs to file: {exc}")

        # Sleep until the next flush window.
        await asyncio.sleep(FLUSH_INTERVAL)
async def start_background_tasks(app):
    """aiohttp on_startup hook: launch the log-writer background task.

    The task handle is stored on the app so on_shutdown can cancel it.
    """
    app['log_writer'] = asyncio.create_task(log_writer())
async def on_shutdown(app):
    """aiohttp on_shutdown hook: flush remaining logs, then stop the writer.

    Polls until the queue is empty so queued entries reach disk before the
    writer task is cancelled.
    """
    while not log_queue.empty():
        await asyncio.sleep(0.1)

    writer_task = app['log_writer']
    writer_task.cancel()
    try:
        await writer_task
    except asyncio.CancelledError:
        pass  # Expected: we cancelled it ourselves.
# Application wiring: accept payloads up to 50 MB, and serve the endpoint
# both with and without a trailing slash.
app = web.Application(client_max_size=1024**2 * 50)
app.add_routes([
    web.post('/save_changes/', handle_save_changes),
    web.post('/save_changes', handle_save_changes),
])
app.on_startup.append(start_background_tasks)
app.on_shutdown.append(on_shutdown)

if __name__ == "__main__":
    # Bind to loopback only: the server is unauthenticated by design.
    web.run_app(app, host="127.0.0.1", port=8536)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==UserScript== | |
// @name Log webpage text changes to server in Markdown format | |
// @namespace http://tampermonkey.net/ | |
// @version 1.0 | |
// @description Logs webpage text changes in Markdown format to the console and shares with locally running server, batching changes to limit requests. MIT License (c) Luke Stanley 2024 | |
// @author Luke Stanley | |
// @match *://*/* | |
// @grant GM_xmlhttpRequest | |
// ==/UserScript== | |
// I was frustrated by web apps preventing easy exporting and interoperability, | |
// and I also want to store a personal reference of what I was looking at to | |
// refresh my memory later. So I whipped this up with a minimal web server. | |
(function() { | |
// Local logging server endpoint.
const LOGGING_SERVER_URL = "http://localhost:8536/save_changes/";
const BATCH_SENDING_WAIT_PERIOD = 200; // Wait time in milliseconds.

// Last Markdown snapshot seen for each observed text node.
const previousTextState = new WeakMap();

// Changes accumulated for the next batched POST, plus the batch-timer flag.
let changeQueue = [];
let isSending = false;
// Backslash-escape Markdown formatting characters (_ * [ ] ( )) so literal
// page text is not re-interpreted as markup.
function escapeMarkdown(text) {
  return text.replace(/([_*[\]()])/g, (match) => `\\${match}`);
}
// True when `node` is a <script> element or sits anywhere inside one.
function isScriptContent(node) {
  for (let current = node; current; current = current.parentNode) {
    if (current.nodeType === Node.ELEMENT_NODE && current.tagName.toLowerCase() === 'script') {
      return true;
    }
  }
  return false;
}
// Convert a DOM node (and, for container elements, its subtree) into a
// Markdown string. Text nodes are escaped verbatim; known elements get
// their Markdown equivalent; unknown elements fall back to their children.
// Anything inside a <script> element yields ''.
function convertToMarkdown(node) {
  if (isScriptContent(node)) {
    return ''; // Skip content inside or of <script> tags
  }
  let markdown = "";
  let tagName, href, src, alt, title;
  if (node.nodeType === Node.TEXT_NODE) {
    // Bare text: trim and escape Markdown metacharacters.
    markdown += escapeMarkdown(node.textContent.trim());
  } else if (node.nodeType === Node.ELEMENT_NODE) {
    tagName = node.tagName.toLowerCase();
    switch (tagName) {
      case 'script':
        return ''; // Early return for script tags
      case 'h1': case 'h2': case 'h3':
      case 'h4': case 'h5': case 'h6':
        // '#' count comes from the digit in the tag name (h2 -> '##').
        markdown += `\n\n${'#'.repeat(parseInt(tagName[1]))} ${escapeMarkdown(node.textContent.trim())}\n\n`;
        break;
      case 'p':
        markdown += `\n\n${escapeMarkdown(node.textContent.trim())}\n\n`;
        break;
      case 'a':
        // Missing href degrades to a '#' placeholder link.
        href = node.getAttribute('href') || '#';
        markdown += `[${escapeMarkdown(node.textContent.trim())}](${href})`;
        break;
      case 'img':
        src = node.getAttribute('src') || '';
        alt = node.getAttribute('alt') || '';
        title = node.getAttribute('title') || '';
        // Optional title goes inside the parentheses, quoted.
        markdown += `![${escapeMarkdown(alt)}](${src}${title ? ` "${escapeMarkdown(title)}"` : ''})\n\n`;
        break;
      case 'ul':
      case 'ol':
        markdown += `\n${handleList(node)}\n\n`;
        break;
      case 'li':
        // A stray <li> outside a list still renders as its text.
        markdown += handleListItem(node) + '\n';
        break;
      case 'strong': case 'b':
        markdown += `**${escapeMarkdown(node.textContent.trim())}**`;
        break;
      case 'em': case 'i':
        markdown += `*${escapeMarkdown(node.textContent.trim())}*`;
        break;
      case 'blockquote':
        // Prefix every line of the quoted text with '> '.
        markdown += `\n\n${node.textContent.trim().split('\n').map(line => `> ${line}`).join('\n')}\n\n`;
        break;
      case 'code':
        // Fenced code block when inside <pre>; inline code span otherwise.
        if (node.parentElement.tagName.toLowerCase() === 'pre') {
          markdown += `\n\n\`\`\`\n${node.textContent.trim()}\n\`\`\`\n\n`;
        } else {
          markdown += `\`${escapeMarkdown(node.textContent.trim())}\``;
        }
        break;
      case 'hr':
        markdown += '\n\n---\n\n';
        break;
      case 'div':
      case 'section':
      case 'article':
      case 'header':
      case 'footer':
        // Structural containers: recurse into children.
        markdown += `\n\n${convertChildrenToMarkdown(node)}\n\n`;
        break;
      default:
        // Any other element also falls back to its children.
        markdown += `\n\n${convertChildrenToMarkdown(node)}\n\n`;
    }
  }
  return markdown.trim();
}
// Convert every non-script child of `node` to Markdown, blank-line separated.
function convertChildrenToMarkdown(node) {
  const pieces = [];
  for (const child of node.childNodes) {
    if (!isScriptContent(child)) {
      pieces.push(convertToMarkdown(child) + '\n\n');
    }
  }
  return pieces.join('').trim();
}
// Render the <li> children of a <ul>/<ol> as Markdown list lines.
// Ordered lists use '1. ' for every item (Markdown renumbers automatically).
function handleList(node) {
  const bullet = node.tagName.toLowerCase() === 'ul' ? '- ' : '1. ';
  let markdown = '';
  for (const child of node.childNodes) {
    if (child.nodeType === Node.ELEMENT_NODE && child.tagName.toLowerCase() === 'li') {
      markdown += bullet + handleListItem(child) + '\n';
    }
  }
  return markdown.trim();
}
// A single list item: its trimmed text content, Markdown-escaped.
function handleListItem(node) {
  const itemText = node.textContent.trim();
  return escapeMarkdown(itemText);
}
// POST one batch of changes to the local logging server.
// `url` is the page the changes came from; the request itself always
// targets LOGGING_SERVER_URL.
function sendPostRequest(url, changeText) {
  const payload = JSON.stringify({ url, changes: changeText });
  GM_xmlhttpRequest({
    method: "POST",
    url: LOGGING_SERVER_URL,
    headers: {
      "Content-Type": "application/json"
    },
    data: payload,
    onload: (response) => {
      console.log("Response:", response.responseText);
    },
    onerror: (error) => {
      console.error("Error:", error);
    }
  });
}
// Log a change to the console and queue it for upload. The first change
// opens a BATCH_SENDING_WAIT_PERIOD window; everything queued during that
// window is combined into a single request to limit server traffic.
function reportChange(changeText) {
  console.log('\n\n' + changeText + '\n\n');
  changeQueue.push(changeText);
  if (isSending) {
    return; // A batch timer is already pending; this change rides along.
  }
  isSending = true;
  setTimeout(() => {
    const combinedChanges = changeQueue.join('\n\n');
    sendPostRequest(window.location.href, combinedChanges);
    changeQueue = [];
    isSending = false;
  }, BATCH_SENDING_WAIT_PERIOD);
}
// MutationObserver callback: report text edits (deduplicated against the
// previous snapshot) and newly added non-script nodes, in Markdown.
function logChanges(mutationsList) {
  for (const mutation of mutationsList) {
    if (mutation.type === 'characterData') {
      const target = mutation.target;
      if (isScriptContent(target)) {
        continue;
      }
      const newText = convertToMarkdown(target);
      const oldText = previousTextState.get(target) || "";
      // Only report when the rendered text actually changed.
      if (newText !== oldText) {
        reportChange(newText);
        previousTextState.set(target, newText);
      }
    } else if (mutation.type === 'childList') {
      for (const added of mutation.addedNodes) {
        if (!isScriptContent(added)) {
          const newText = convertToMarkdown(added);
          if (newText.length > 0) {
            reportChange(newText);
          }
        }
      }
    }
  }
}
// Capture the page as it first appears, then watch for mutations.
const initialMarkdown = convertToMarkdown(document.body);
reportChange(initialMarkdown);

const observer = new MutationObserver(logChanges);
const config = {
  attributes: false,           // Ignore attribute changes
  childList: true,             // Report added/removed nodes
  characterData: true,         // Report text edits
  characterDataOldValue: true, // Capture the old text content
  subtree: true                // Monitor the full subtree
};
observer.observe(document.body, config);
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment