Skip to content

Instantly share code, notes, and snippets.

@unbracketed
Created December 19, 2024 20:36
Show Gist options
  • Save unbracketed/7abd317b44cc12a64a9459152d7ab427 to your computer and use it in GitHub Desktop.
Save unbracketed/7abd317b44cc12a64a9459152d7ab427 to your computer and use it in GitHub Desktop.
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "starlette",
# "uvicorn",
# "beautifulsoup4",
# "httpx",
# "html5lib"
# ]
# ///
from starlette.applications import Starlette
from starlette.responses import PlainTextResponse, JSONResponse
from starlette.routing import Route
import httpx
from bs4 import BeautifulSoup
import asyncio
def _html_to_text(html):
    """Return the readable text of *html*, one non-empty stripped line per row."""
    # html5lib is the lenient, browser-style parser backend for BeautifulSoup.
    soup = BeautifulSoup(html, 'html5lib')
    # The contents of <script> and <style> are code, not readable text.
    for element in soup(['script', 'style']):
        element.decompose()
    # Normalize whitespace: trim every line and drop the ones left empty.
    stripped = (line.strip() for line in soup.get_text().splitlines())
    return '\n'.join(line for line in stripped if line)


async def strip_html(request):
    """Fetch the page named by the ``url`` query parameter and return its text.

    Args:
        request: Incoming request; only ``request.query_params['url']`` is read.

    Returns:
        A ``PlainTextResponse`` with the page text (script/style removed,
        blank lines dropped) on success, or a ``JSONResponse`` with status 400
        and an ``{"error": ...}`` body when ``url`` is missing, malformed, or
        cannot be fetched successfully.
    """
    url = request.query_params.get('url')
    if not url:
        return JSONResponse(
            {"error": "Missing url parameter"},
            status_code=400,
        )

    # NOTE(security): the server fetches whatever URL the caller supplies, so
    # this endpoint can be used to reach hosts visible only to the server
    # (SSRF). Add host allow-listing before exposing it beyond localhost.
    try:
        # httpx does not follow redirects unless asked, and raise_for_status()
        # rejects 3xx responses, so without this flag any redirecting URL
        # (http -> https, bare domain -> www) would be reported as a failure.
        async with httpx.AsyncClient(follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # InvalidURL is not part of the HTTPError hierarchy; catching it keeps
        # a malformed ``url`` a client error instead of an unhandled 500.
        return JSONResponse(
            {"error": f"Failed to fetch URL: {e}"},
            status_code=400,
        )

    # Parsing is synchronous, CPU-bound work; run it in a worker thread so the
    # event loop stays free to serve other requests in the meantime.
    text = await asyncio.to_thread(_html_to_text, response.text)
    return PlainTextResponse(text)
# ASGI application: a single route at "/" handled by strip_html.
# Debug mode is switched on here; NOTE(review): confirm that is intended
# before running this anywhere other than a local machine.
app = Starlette(routes=[Route('/', strip_html)], debug=True)
if __name__ == '__main__':
    # Imported here because the server is only needed when this file is
    # executed directly as a script.
    from uvicorn import run as serve

    # Serve the app on the loopback interface, port 8000.
    serve(app, port=8000, host='127.0.0.1')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment