Created
April 4, 2025 05:38
-
-
Save ishank-dev/d2af48bfbc911e413ac5873739b931d1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import datetime | |
import statistics | |
from warcio.warcwriter import WARCWriter | |
from warcio.statusandheaders import StatusAndHeaders | |
from warcio.archiveiterator import ArchiveIterator | |
from bs4 import BeautifulSoup | |
from fastapi import FastAPI, Request | |
from fastapi.responses import JSONResponse | |
import uvicorn | |
from playwright.sync_api import sync_playwright | |
# --------------------------------------------------------------------------------
# 1. A SAMPLE WARC FILE GENERATION
#
# This section generates a synthetic WARC file to mimic common archival scenarios:
# - Main HTML page referencing a stylesheet, image, and script
# - The script record is deliberately omitted to test missing-asset handling
# - A soft 404 page is included for error detection experiments
# --------------------------------------------------------------------------------
# Synthetic payloads stored in the sample WARC.
# The HTML documents are assembled one line per list entry and joined with
# newlines, so there is no leading or trailing whitespace to strip.
main_html = "\n".join([
    '<html>',
    '<head>',
    '<title>Test Page</title>',
    '<link rel="stylesheet" href="http://example.com/styles/main.css">',
    '</head>',
    '<body>',
    '<img src="http://example.com/images/logo.png" alt="Logo">',
    '<script src="http://example.com/scripts/app.js"></script>',
    '<p>Welcome to the test page.</p>',
    '</body>',
    '</html>',
])

# Stylesheet body served for http://example.com/styles/main.css.
css_content = (
    "body { background: #fff; "
    "font-family: sans-serif; }"
)

# Truncated PNG: the 8-byte PNG signature followed by the length field (13)
# and type tag of an IHDR chunk. There is no chunk data; it is only a
# placeholder body for the image record.
image_content = (
    b"\x89PNG\r\n\x1a\n"
    b"\x00\x00\x00\r"
    b"IHDR"
)

# A "soft 404": error content intended to be archived with a 200 status.
soft404_html = "\n".join([
    '<html>',
    '<head><title>Page Not Found</title></head>',
    '<body>',
    '<h1>404 - Page Not Found</h1>',
    '<p>Sorry, the page you are looking for does not exist.</p>',
    '</body>',
    '</html>',
])
def _write_response_record(writer, uri, body, content_type):
    """Write one HTTP 200 'response' record for ``uri`` through ``writer``.

    Args:
        writer: The WARCWriter the record is written to.
        uri: Target URI stored with the record.
        body: Payload as ``str`` (encoded to UTF-8 here) or ``bytes``.
        content_type: Value of the HTTP Content-Type header.
    """
    payload = body.encode('utf-8') if isinstance(body, str) else body
    http_headers = StatusAndHeaders(
        '200 OK',
        [('Content-Type', content_type)],
        protocol='HTTP/1.0',
    )
    record = writer.create_warc_record(
        uri,
        'response',
        payload=io.BytesIO(payload),
        # Length must be the encoded byte count, not the character count;
        # the two differ as soon as the text contains non-ASCII characters.
        length=len(payload),
        http_headers=http_headers,
    )
    writer.write_record(record)


# Build the sample archive at import/run time. The script referenced by the
# main page (http://example.com/scripts/app.js) is intentionally NOT written,
# so missing-asset detection has something to find.
with open('sample.warc.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)
    for uri, body, content_type in (
        # Main HTML page with asset references
        ("http://example.com/", main_html, 'text/html'),
        # CSS asset
        ("http://example.com/styles/main.css", css_content, 'text/css'),
        # Image asset
        ("http://example.com/images/logo.png", image_content, 'image/png'),
        # Soft 404 page (error content served with a 200 status)
        ("http://example.com/missing", soft404_html, 'text/html'),
    ):
        _write_response_record(writer, uri, body, content_type)

print("Sample WARC file 'sample.warc.gz' has been created.")
# --------------------------------------------------------------------------------
# 2. MODULES FOR PIPELINE + STRATEGY PATTERN
#
# The following classes simulate discrete analysis tasks, each focusing on
# a specific aspect of archival quality. These tasks can be chained in a
# pipeline, demonstrating modular strategy usage for future extensibility.
# --------------------------------------------------------------------------------
class ArchiveInput:
    """
    Represents data extracted from a WARC record, along with
    any metadata produced by analysis modules.

    Attributes:
        url: URI of the record the HTML came from.
        html: Decoded HTML text of the record.
        assets: Sequence of (asset_type, asset_url) pairs referenced by the page.
        metadata: Dict that analysis modules write their results into.
    """

    def __init__(self, url, html, assets=None, metadata=None):
        self.url = url
        self.html = html
        # Compare against None rather than truthiness so that a caller who
        # passes in their own (still empty) list or dict keeps sharing that
        # exact object and sees results written into it later.
        self.assets = [] if assets is None else assets
        self.metadata = {} if metadata is None else metadata
class QAModule:
    """
    Common interface for QA analysis steps: each step takes an
    ArchiveInput and hands back an ArchiveInput.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        """Run this step on ``archive_input``; concrete modules override this."""
        message = "Subclasses must implement execute()."
        raise NotImplementedError(message)
class MissingAssetChecker(QAModule):
    """
    Records which of the page's referenced assets have no matching
    record in the WARC file.
    """

    def __init__(self, warc_file):
        # Path of the WARC that is searched for each asset URL.
        self.warc_file = warc_file

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        """Store the URLs of unarchived assets under metadata['missing_assets']."""
        # The asset type is not needed here; only the URL is looked up.
        absent_urls = [
            url
            for _kind, url in archive_input.assets
            if not check_asset_archived(self.warc_file, url)
        ]
        archive_input.metadata['missing_assets'] = absent_urls
        return archive_input
class ErrorPageDetector(QAModule):
    """
    Marks the page as an error page when its HTML contains
    keywords suggesting a soft 404 or similar.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        """Store the keyword-check verdict under metadata['is_error']."""
        flagged = is_error_page(archive_input.html)
        archive_input.metadata['is_error'] = flagged
        return archive_input
class SimpleScoringEngine(QAModule):
    """
    Turns the missing-asset and error-page findings into a quality score.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        """Store the computed score under metadata['quality_score']."""
        findings = archive_input.metadata
        # Missing keys fall back to "nothing missing" / "not an error",
        # so this step works even when earlier modules were not run.
        score = compute_quality_score(
            missing_count=len(findings.get('missing_assets', [])),
            total_assets=len(archive_input.assets),
            time_drift=0,  # drift is not measured in this pipeline
            is_error=findings.get('is_error', False),
        )
        findings['quality_score'] = score
        return archive_input
# --------------------------------------------------------------------------------
# 3. SUPPORTING FUNCTIONS
#
# These utility functions handle tasks such as WARC reading, asset extraction,
# and minimal logic for error detection and scoring.
# --------------------------------------------------------------------------------
def get_warc_html(warc_file, target_url=None):
    """
    Returns the URI and decoded body of the first matching response record.

    Records are scanned in file order. No Content-Type check is made, so the
    first 'response' record that passes the filter is returned whatever its
    payload is.

    Args:
        warc_file: Path of the WARC file to read.
        target_url: Optional filter; when set, only records whose
            WARC-Target-URI contains this substring are considered.

    Returns:
        A (url, html) tuple, or (None, None) when no record matches.
    """
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            # A record with no target URI cannot match a filter; skip it
            # instead of failing on the substring test against None.
            if target_url and (url is None or target_url not in url):
                continue
            payload = record.content_stream().read()
            try:
                html = payload.decode('utf-8')
            except UnicodeDecodeError:
                # latin1 maps every byte value, so this fallback cannot fail.
                html = payload.decode('latin1')
            return url, html
    return None, None
def extract_assets(html):
    """
    Collects the asset URLs referenced by an HTML document.

    Returns a list of (asset_type, url) tuples: all images first, then
    stylesheets, then scripts. Tags without the relevant attribute (for
    example inline scripts with no src) are left out. URLs are returned
    exactly as written in the markup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # (label, attribute holding the URL, matching tags), in output order.
    lookups = (
        ('image', 'src', soup.find_all('img')),
        ('stylesheet', 'href', soup.find_all('link', rel='stylesheet')),
        ('script', 'src', soup.find_all('script')),
    )
    found = []
    for label, attribute, tags in lookups:
        for tag in tags:
            reference = tag.get(attribute)
            if reference:
                found.append((label, reference))
    return found
def check_asset_archived(warc_file, asset_url):
    """
    Reports whether the WARC holds a response record for asset_url.

    The comparison against each record's WARC-Target-URI is an exact
    string match. Returns True on the first hit, otherwise False.
    """
    with open(warc_file, 'rb') as stream:
        # any() stops at the first matching record, like an early return.
        return any(
            entry.rec_type == 'response'
            and entry.rec_headers.get_header('WARC-Target-URI') == asset_url
            for entry in ArchiveIterator(stream)
        )
def is_error_page(html):
    """
    Basic keyword-based check for 'soft 404' or error content.

    The match is a case-insensitive substring search over the raw HTML
    (markup included), so it is deliberately coarse.

    Args:
        html: HTML text to inspect; None or an empty string is accepted.

    Returns:
        True if any error keyword occurs in the text, otherwise False.
    """
    if not html:
        # Nothing to inspect; also avoids calling .lower() on None.
        return False
    # Lower-case once instead of once per keyword.
    lowered = html.lower()
    error_keywords = ('404', 'not found', 'error', 'page not found')
    return any(keyword in lowered for keyword in error_keywords)
def compute_quality_score(missing_count, total_assets, time_drift, is_error):
    """
    Simple heuristic to produce a 0-100 score, penalizing missing assets
    and error pages.

    Args:
        missing_count: Number of referenced assets not found in the archive.
        total_assets: Number of assets the page references.
        time_drift: Capture-time drift in seconds; only its magnitude counts.
        is_error: Whether the page was flagged as an error page.

    Returns:
        0 for error pages; otherwise 100 minus up to 50 points for the
        share of missing assets and up to 10 points for drift (one point
        per day), clamped to the range 0-100.
    """
    if is_error:
        return 0
    score = 100
    if total_assets > 0:
        score -= (missing_count / total_assets) * 50
    # Use the magnitude: a negative drift would otherwise become a negative
    # penalty and cancel out part of the missing-asset deduction.
    drift_days = abs(time_drift) / (24 * 3600)
    score -= min(drift_days, 10)
    return max(0, min(100, score))
# --------------------------------------------------------------------------------
# 4. SIMULATED PIPELINE ORCHESTRATION
#
# This section chains the modules to showcase a pipeline approach.
# Each module implements the QAModule interface, supporting Strategy-based
# swaps or future expansions (e.g., advanced drift analyzers).
# --------------------------------------------------------------------------------
def run_pipeline(warc_file_path):
    """
    Runs the QA modules over the first response record of a WARC file
    and prints a short report.

    Prints a notice and returns early when no HTML could be read.
    """
    url, html = get_warc_html(warc_file_path)
    if not html:
        print("No HTML found, skipping pipeline.")
        return

    state = ArchiveInput(url, html, extract_assets(html))

    # Order matters: the scoring stage reads what the first two stages wrote.
    stages = (
        MissingAssetChecker(warc_file_path),
        ErrorPageDetector(),
        SimpleScoringEngine(),
    )
    for stage in stages:
        state = stage.execute(state)

    results = state.metadata
    report_lines = (
        f"\nQA Report for: {state.url}",
        f" Missing Assets: {results.get('missing_assets', [])}",
        f" Is Error Page? : {results.get('is_error', False)}",
        f" Quality Score : {results.get('quality_score', 'N/A')}",
    )
    for line in report_lines:
        print(line)
# --------------------------------------------------------------------------------
# 5. DEMONSTRATION
#
# Demonstrates how the pipeline is run for feasibility testing.
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: run the QA pipeline on the sample archive.
    warc_file = 'sample.warc.gz'
    run_pipeline(warc_file)
# --------------------------------------------------------------------------------
# 6. RESTful API ENDPOINT USING FASTAPI
#
# A simple endpoint is provided to demonstrate how this approach
# could be exposed as a web service. This is an experimental stub
# returning hard-coded results for now.
# --------------------------------------------------------------------------------
app = FastAPI()


@app.post("/api/v1/archival-quality")
async def archival_quality(request: Request):
    """
    Stub endpoint returning a hard-coded QA report.

    Expects a JSON object body. The optional "target_url" key is echoed
    back as "original_url". A "warc_file" key may be sent but is ignored,
    because no pipeline is run here.

    Returns:
        A JSON response holding the stub report, or a 400 response when
        the body is not valid JSON or is not a JSON object.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; this covers empty and
        # malformed bodies.
        return JSONResponse(
            status_code=400,
            content={"detail": "Request body must be valid JSON."},
        )
    if not isinstance(data, dict):
        # Lists, strings and numbers are valid JSON but have no .get().
        return JSONResponse(
            status_code=400,
            content={"detail": "Request body must be a JSON object."},
        )

    target_url = data.get("target_url")

    # NOTE(review): if "warc_file" is ever used to open a file, validate it
    # first, because the path comes straight from the client.

    # Demonstration: the report below is a fixed stub, not a pipeline result.
    report = {
        "original_url": target_url or "http://example.com/",
        "quality_score": 78,
        "missing_assets": ["http://example.com/scripts/app.js"],
        "issues": ["Missing script asset", "Temporal drift detected (stub)"],
    }
    return JSONResponse(content=report)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment