# Source: GitHub Gist by @ishank-dev (ishank-dev/d2af48bfbc911e413ac5873739b931d1),
# created April 4, 2025 05:38.
import io
import datetime
import statistics
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import uvicorn
from playwright.sync_api import sync_playwright
# --------------------------------------------------------------------------------
# 1. A SAMPLE WARC FILE GENERATION
#
# This section generates a synthetic WARC file to mimic common archival scenarios:
# - Main HTML page referencing a stylesheet, image, and script
# - The script record is deliberately omitted to test missing-asset handling
# - A soft 404 page is included for error detection experiments
# --------------------------------------------------------------------------------
# Synthetic page that references one stylesheet, one image and one script.
# Only the stylesheet and the image get WARC records below; the script record
# is left out on purpose so the missing-asset check has something to report.
main_html = """
<html>
<head>
<title>Test Page</title>
<link rel="stylesheet" href="http://example.com/styles/main.css">
</head>
<body>
<img src="http://example.com/images/logo.png" alt="Logo">
<script src="http://example.com/scripts/app.js"></script>
<p>Welcome to the test page.</p>
</body>
</html>
""".strip()
css_content = "body { background: #fff; font-family: sans-serif; }"
# PNG signature followed by the start of an IHDR chunk: a stand-in binary
# payload, not a complete image.
image_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
# Served with a 200 status although the text says "not found" (a soft 404).
soft404_html = """
<html>
<head><title>Page Not Found</title></head>
<body>
<h1>404 - Page Not Found</h1>
<p>Sorry, the page you are looking for does not exist.</p>
</body>
</html>
""".strip()

# NOTE: this runs at import time and (re)creates 'sample.warc.gz' in the
# current working directory.
with open('sample.warc.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)
    # (target URI, Content-Type, payload bytes), in the order they are written.
    # Text is encoded once so the declared length is the byte length of what
    # is actually written, not the character count of the source string.
    sample_records = [
        ("http://example.com/", 'text/html', main_html.encode('utf-8')),
        ("http://example.com/styles/main.css", 'text/css', css_content.encode('utf-8')),
        ("http://example.com/images/logo.png", 'image/png', image_content),
        ("http://example.com/missing", 'text/html', soft404_html.encode('utf-8')),
    ]
    for target_uri, content_type, body in sample_records:
        stream = io.BytesIO(body)
        http_headers = StatusAndHeaders(
            '200 OK',
            [('Content-Type', content_type)],
            protocol='HTTP/1.0'
        )
        record = writer.create_warc_record(
            target_uri, "response",
            payload=stream,
            length=len(body),
            http_headers=http_headers
        )
        writer.write_record(record)
print("Sample WARC file 'sample.warc.gz' has been created.")
# --------------------------------------------------------------------------------
# 2. MODULES FOR PIPELINE + STRATEGY PATTERN
#
# The following classes simulate discrete analysis tasks, each focusing on
# a specific aspect of archival quality. These tasks can be chained in a
# pipeline, demonstrating modular strategy usage for future extensibility.
# --------------------------------------------------------------------------------
class ArchiveInput:
    """
    Represents data extracted from a WARC record, along with
    any metadata produced by analysis modules.

    Attributes:
        url: URL the record was captured for.
        html: Decoded page content.
        assets: List of (asset_type, asset_url) pairs referenced by the page.
        metadata: Dict that analysis modules write their results into.
    """

    def __init__(self, url, html, assets=None, metadata=None):
        self.url = url
        self.html = html
        # Create a fresh container only when the argument was omitted. A
        # caller-supplied container is kept even if it is empty, so results
        # written into it stay visible to the caller.
        self.assets = [] if assets is None else assets
        self.metadata = {} if metadata is None else metadata

    def __repr__(self):
        return (
            f"{type(self).__name__}(url={self.url!r}, "
            f"assets={len(self.assets)}, metadata={self.metadata!r})"
        )
class QAModule:
    """
    Common contract for QA analysis steps: each step receives an
    ArchiveInput and hands an ArchiveInput back.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        # The base class carries no analysis of its own; concrete modules
        # must override this method.
        message = "Subclasses must implement execute()."
        raise NotImplementedError(message)
class MissingAssetChecker(QAModule):
    """
    Checks whether each referenced asset is present in the WARC file.

    The result is stored under metadata['missing_assets'] as a list of asset
    URLs, in the order they appear in archive_input.assets.
    """

    def __init__(self, warc_file):
        # Path of the WARC file that is scanned for asset records.
        self.warc_file = warc_file

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        """
        Record every asset URL that has no 'response' record in the WARC.
        """
        wanted = {asset_url for _, asset_url in archive_input.assets}
        # Read the (gzip-compressed) WARC once for all assets instead of
        # re-reading it per asset; skip the read when nothing is referenced.
        archived = self._archived_urls(wanted) if wanted else set()
        archive_input.metadata['missing_assets'] = [
            asset_url
            for _, asset_url in archive_input.assets
            if asset_url not in archived
        ]
        return archive_input

    def _archived_urls(self, wanted):
        """
        Return the subset of `wanted` URLs that appear as the
        WARC-Target-URI of a 'response' record in self.warc_file.
        """
        found = set()
        with open(self.warc_file, 'rb') as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != 'response':
                    continue
                url = record.rec_headers.get_header('WARC-Target-URI')
                if url in wanted:
                    found.add(url)
                    if len(found) == len(wanted):
                        # Every asset is accounted for; stop reading early.
                        break
        return found
class ErrorPageDetector(QAModule):
    """
    Marks the page as an 'error' when its content looks like a soft 404
    or similar, based on the is_error_page() keyword check.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        # Store the verdict under metadata['is_error'] for later modules.
        looks_like_error = is_error_page(archive_input.html)
        archive_input.metadata['is_error'] = looks_like_error
        return archive_input
class SimpleScoringEngine(QAModule):
    """
    Turns the findings of earlier modules (missing assets, error flag)
    into a single quality score stored under metadata['quality_score'].
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        findings = archive_input.metadata
        # Fall back to "nothing missing" / "not an error" when an earlier
        # module did not run.
        missing_total = len(findings.get('missing_assets', []))
        flagged_as_error = findings.get('is_error', False)
        asset_total = len(archive_input.assets)
        # Time drift is not measured in this pipeline, so it is passed as 0.
        findings['quality_score'] = compute_quality_score(
            missing_count=missing_total,
            total_assets=asset_total,
            time_drift=0,
            is_error=flagged_as_error
        )
        return archive_input
# --------------------------------------------------------------------------------
# 3. SUPPORTING FUNCTIONS
#
# These utility methods handle tasks such as WARC reading, asset extraction,
# and minimal logic for error detection and scoring.
# --------------------------------------------------------------------------------
def get_warc_html(warc_file, target_url=None):
    """
    Extracts HTML content from the first matching response record in the
    WARC file.

    A record matches when it is a 'response' record, its WARC-Target-URI
    contains target_url (if given), and its HTTP Content-Type, when one is
    present, mentions 'html'. Records without a Content-Type are accepted.

    Returns a (url, html) tuple, or (None, None) if no record matches.
    """
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            # A record without a target URI cannot match a URL filter
            # (and the substring test would raise on None).
            if target_url and (url is None or target_url not in url):
                continue
            # Skip payloads that declare a non-HTML type (CSS, images, ...)
            # so they are not handed back as page markup.
            http_headers = record.http_headers
            content_type = (
                http_headers.get_header('Content-Type')
                if http_headers is not None else None
            )
            if content_type and 'html' not in content_type.lower():
                continue
            payload = record.content_stream().read()
            try:
                html = payload.decode('utf-8')
            except UnicodeDecodeError:
                # latin1 maps every byte, so this fallback cannot fail.
                html = payload.decode('latin1')
            return url, html
    return None, None
def extract_assets(html):
    """
    Collects the image, stylesheet and script references found in the HTML.

    Returns a list of (asset_type, reference) tuples: all images first, then
    stylesheets, then scripts. Elements without the relevant attribute (or
    with an empty one) are ignored.
    """
    document = BeautifulSoup(html, 'html.parser')
    # (asset type, tag name, extra find_all filters, attribute holding the URL)
    lookups = (
        ('image', 'img', {}, 'src'),
        ('stylesheet', 'link', {'rel': 'stylesheet'}, 'href'),
        ('script', 'script', {}, 'src'),
    )
    found = []
    for kind, tag_name, filters, attribute in lookups:
        for element in document.find_all(tag_name, **filters):
            reference = element.get(attribute)
            if reference:
                found.append((kind, reference))
    return found
def check_asset_archived(warc_file, asset_url):
    """
    Reports whether the WARC holds a 'response' record whose
    WARC-Target-URI equals asset_url exactly.
    Returns True on the first match, otherwise False.
    """
    with open(warc_file, 'rb') as handle:
        # any() stops reading at the first matching record.
        return any(
            entry.rec_type == 'response'
            and entry.rec_headers.get_header('WARC-Target-URI') == asset_url
            for entry in ArchiveIterator(handle)
        )
def is_error_page(html):
    """
    Basic keyword-based check for 'soft 404' or error content.

    Returns True when the lower-cased text contains any of the error
    keywords, and False otherwise (including for empty or None input).
    This is a plain substring match, so it can also fire on unrelated
    text that happens to contain e.g. 'error'.
    """
    if not html:
        return False
    # Lower-case once instead of once per keyword.
    lowered = html.lower()
    error_keywords = ('404', 'not found', 'error', 'page not found')
    return any(keyword in lowered for keyword in error_keywords)
def compute_quality_score(missing_count, total_assets, time_drift, is_error):
    """
    Simple heuristic to produce a 0-100 score, penalizing missing assets
    and error pages.

    - An error page scores 0 outright.
    - Missing assets cost up to 50 points, proportional to the share of
      assets that are missing (no penalty when there are no assets).
    - Time drift costs one point per 24 * 3600 units (presumably seconds,
      i.e. one point per day), capped at 10 points. Only the magnitude of
      the drift counts, so a negative drift is penalized rather than
      rewarded.
    """
    if is_error:
        return 0
    score = 100
    if total_assets > 0:
        score -= (missing_count / total_assets) * 50
    # abs(): a capture taken before the page must not earn bonus points.
    drift_penalty = min(abs(time_drift) / (24 * 3600), 10)
    score -= drift_penalty
    return max(0, min(100, score))
# --------------------------------------------------------------------------------
# 4. SIMULATED PIPELINE ORCHESTRATION
#
# This section chains the modules to showcase a pipeline approach.
# Each module implements the QAModule interface, supporting Strategy-based
# swaps or future expansions (e.g., advanced drift analyzers).
# --------------------------------------------------------------------------------
def run_pipeline(warc_file_path):
    """
    Runs the QA modules over the first HTML page found in the WARC file
    and prints a short report (missing assets, error flag, quality score).

    Prints a notice and returns early when no HTML could be extracted.
    """
    from urllib.parse import urljoin

    # Extract main HTML and find asset references.
    url, html = get_warc_html(warc_file_path)
    if not html:
        print("No HTML found, skipping pipeline.")
        return
    # WARC records are looked up by absolute target URI, so resolve relative
    # references (e.g. "styles/main.css") against the page URL first.
    # Absolute references pass through urljoin unchanged.
    assets = [
        (asset_type, urljoin(url, reference))
        for asset_type, reference in extract_assets(html)
    ]
    archive_input = ArchiveInput(url, html, assets)
    # Define pipeline modules. Order matters: the scoring engine reads the
    # metadata written by the two modules before it.
    pipeline_modules = [
        MissingAssetChecker(warc_file_path),
        ErrorPageDetector(),
        SimpleScoringEngine()
    ]
    # Execute modules in sequence.
    for module in pipeline_modules:
        archive_input = module.execute(archive_input)
    # Show final results.
    print(f"\nQA Report for: {archive_input.url}")
    print(f" Missing Assets: {archive_input.metadata.get('missing_assets', [])}")
    print(f" Is Error Page? : {archive_input.metadata.get('is_error', False)}")
    print(f" Quality Score : {archive_input.metadata.get('quality_score', 'N/A')}")
# --------------------------------------------------------------------------------
# 5. DEMONSTRATION
#
# Demonstrates how the pipeline is run for feasibility testing.
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: analyse the sample archive written at the top of
    # this file.
    warc_file = 'sample.warc.gz'
    run_pipeline(warc_file_path=warc_file)
# --------------------------------------------------------------------------------
# 6. RESTful API ENDPOINT USING FASTAPI
#
# A simple endpoint is provided to demonstrate how this approach
# could be exposed as a web service. This is an experimental stub
# returning hard-coded results for now.
# --------------------------------------------------------------------------------
app = FastAPI()


@app.post("/api/v1/archival-quality")
async def archival_quality(request: Request):
    """
    Stub endpoint returning a hard-coded QA report.

    Expects a JSON object body with optional keys "warc_file" and
    "target_url". Responds with HTTP 400 when the body is not valid JSON
    or is not a JSON object; otherwise returns the stub report.
    """
    try:
        data = await request.json()
    except ValueError:
        # Covers malformed or empty bodies (JSON and text decode errors
        # are both ValueError subclasses).
        return JSONResponse(
            status_code=400,
            content={"error": "Request body must be valid JSON."}
        )
    if not isinstance(data, dict):
        # e.g. a JSON array or bare string: there are no keys to read.
        return JSONResponse(
            status_code=400,
            content={"error": "Request body must be a JSON object."}
        )
    # NOTE(review): warc_path is client-controlled and currently unused.
    # Validate it against an allowed directory before ever opening it.
    warc_path = data.get("warc_file", "sample.warc.gz")
    target_url = data.get("target_url", None)
    # Demonstration: ignoring target_url in this simplified pipeline
    # and returning a stub QA report
    report = {
        "original_url": target_url or "http://example.com/",
        "quality_score": 78,
        "missing_assets": ["http://example.com/scripts/app.js"],
        "issues": ["Missing script asset", "Temporal drift detected (stub)"]
    }
    return JSONResponse(content=report)
# End of gist. (Comments on this gist are hosted on GitHub and require signing in.)