ishank-dev · April 4, 2025 05:38
diff --git a/qa_pipeline.py b/qa_pipeline.py
 import io
 import datetime
 import statistics

 from warcio.warcwriter import WARCWriter
 from warcio.statusandheaders import StatusAndHeaders
 from warcio.archiveiterator import ArchiveIterator
 from bs4 import BeautifulSoup
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 import uvicorn
 from playwright.sync_api import sync_playwright


 # --------------------------------------------------------------------------------
 # 1. A SAMPLE WARC FILE GENERATION
 #
 # This section generates a synthetic WARC file to mimic common archival scenarios:
 #  - Main HTML page referencing a stylesheet, image, and script
 #  - The script record is deliberately omitted to test missing-asset handling
 #  - A soft 404 page is included for error detection experiments
 # --------------------------------------------------------------------------------

 # Synthetic payloads for the sample WARC. The main page references a
 # stylesheet, an image and a script; no record is written for the script,
 # so it shows up as a missing asset.
 main_html = """
 <html>
 <head>
  <title>Test Page</title>
  <link rel="stylesheet" href="http://example.com/styles/main.css">
 </head>
 <body>
  <img src="http://example.com/images/logo.png" alt="Logo">
  <script src="http://example.com/scripts/app.js"></script>
  <p>Welcome to the test page.</p>
 </body>
 </html>
 """.strip()

 css_content = "body { background: #fff; font-family: sans-serif; }"
 image_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
 # Served with a 200 status below, i.e. a "soft 404".
 soft404_html = """
 <html>
 <head><title>Page Not Found</title></head>
 <body>
  <h1>404 - Page Not Found</h1>
  <p>Sorry, the page you are looking for does not exist.</p>
 </body>
 </html>
 """.strip()

 # (target URI, Content-Type, payload bytes) for every record, in write order.
 # Text is encoded exactly once so the declared length is the *byte* length of
 # the payload; len() of the str only matches for pure-ASCII content.
 _sample_records = [
    ("http://example.com/", 'text/html', main_html.encode('utf-8')),
    ("http://example.com/styles/main.css", 'text/css', css_content.encode('utf-8')),
    ("http://example.com/images/logo.png", 'image/png', image_content),
    ("http://example.com/missing", 'text/html', soft404_html.encode('utf-8')),
 ]

 # NOTE: this is top-level code, so the file is (re)written whenever it runs.
 with open('sample.warc.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)

    for _record_uri, _content_type, _body in _sample_records:
        stream = io.BytesIO(_body)
        http_headers = StatusAndHeaders(
            '200 OK',
            [('Content-Type', _content_type)],
            protocol='HTTP/1.0'
        )
        record = writer.create_warc_record(
            _record_uri, "response",
            payload=stream,
            length=len(_body),
            http_headers=http_headers
        )
        writer.write_record(record)

 print("Sample WARC file 'sample.warc.gz' has been created.")


 # --------------------------------------------------------------------------------
 # 2. MODULES FOR PIPELINE + STRATEGY PATTERN
 #
 # The following classes simulate discrete analysis tasks, each focusing on 
 # a specific aspect of archival quality. These tasks can be chained in a 
 # pipeline, demonstrating modular strategy usage for future extensibility.
 # --------------------------------------------------------------------------------

 class ArchiveInput:
    """
    Container passed from one QA module to the next.

    Attributes:
        url: URL associated with the page.
        html: Page content as text.
        assets: List of asset entries; defaults to an empty list.
        metadata: Dict in which analysis modules store their results;
            defaults to an empty dict.
    """

    def __init__(self, url, html, assets=None, metadata=None):
        self.url, self.html = url, html
        # Any falsy argument (None, empty container, ...) is replaced by a
        # fresh container so instances never share a default.
        self.assets = assets if assets else []
        self.metadata = metadata if metadata else {}

 class QAModule:
    """
    Base class for QA analysis steps.

    A concrete step overrides execute(), which receives an ArchiveInput and
    returns an ArchiveInput.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        # The base class carries no behaviour of its own.
        message = "Subclasses must implement execute()."
        raise NotImplementedError(message)

 class MissingAssetChecker(QAModule):
    """
    QA step that records which referenced assets have no record in the WARC.

    The result is stored under metadata['missing_assets'] as a list of URLs.
    """

    def __init__(self, warc_file):
        # Path of the WARC file handed to check_asset_archived().
        self.warc_file = warc_file

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        # Each asset is an (asset_type, asset_url) pair; only the URL is used.
        archive_input.metadata['missing_assets'] = [
            candidate_url
            for _asset_type, candidate_url in archive_input.assets
            if not check_asset_archived(self.warc_file, candidate_url)
        ]
        return archive_input

 class ErrorPageDetector(QAModule):
    """
    QA step that stores the result of is_error_page() for the page's HTML
    under metadata['is_error'].
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        looks_like_error = is_error_page(archive_input.html)
        archive_input.metadata['is_error'] = looks_like_error
        return archive_input

 class SimpleScoringEngine(QAModule):
    """
    QA step that turns earlier results into metadata['quality_score'].

    Reads metadata['missing_assets'] (default: empty list) and
    metadata['is_error'] (default: False), so it is meant to run after the
    steps that write those keys.
    """

    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
        results = archive_input.metadata
        missing_count = len(results.get('missing_assets', []))
        error_flag = results.get('is_error', False)
        asset_total = len(archive_input.assets)
        # Time drift is always passed as 0 by this step.
        results['quality_score'] = compute_quality_score(
            missing_count=missing_count,
            total_assets=asset_total,
            time_drift=0,
            is_error=error_flag,
        )
        return archive_input


 # --------------------------------------------------------------------------------
 # 3. SUPPORTING FUNCTIONS
 #
 # These utility functions handle WARC reading, asset extraction, and the
 # minimal logic for error detection and scoring.
 # --------------------------------------------------------------------------------

 def get_warc_html(warc_file, target_url=None):
    """
    Return the URL and decoded body of the first matching response record.

    Args:
        warc_file: Path of the WARC file to read.
        target_url: Optional filter; when given, only records whose
            WARC-Target-URI contains this string (substring match) qualify.

    Returns:
        A (url, text) tuple for the first qualifying 'response' record, or
        (None, None) when no record qualifies. The record's content type is
        not inspected, so the text is HTML only if that record holds HTML.
    """
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            # A record without a target URI can never satisfy the filter;
            # skip it instead of letting `in` raise TypeError on None.
            if target_url and (url is None or target_url not in url):
                continue
            payload = record.content_stream().read()
            try:
                html = payload.decode('utf-8')
            except UnicodeDecodeError:
                # latin1 maps every byte, so this fallback cannot fail.
                html = payload.decode('latin1')
            return url, html
    return None, None

 def extract_assets(html):
    """
    Collect asset references from an HTML document.

    Returns a list of (asset_type, url) tuples: every <img src>, then every
    <link rel="stylesheet" href>, then every <script src>. Tags whose
    attribute is missing or empty are skipped. URLs are returned exactly as
    written in the markup.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def _attribute_values(tags, attribute):
        # Non-empty values of `attribute`, in document order.
        values = (tag.get(attribute) for tag in tags)
        return [value for value in values if value]

    assets = [('image', ref) for ref in _attribute_values(soup.find_all('img'), 'src')]
    assets += [
        ('stylesheet', ref)
        for ref in _attribute_values(soup.find_all('link', rel='stylesheet'), 'href')
    ]
    assets += [('script', ref) for ref in _attribute_values(soup.find_all('script'), 'src')]
    return assets

 def check_asset_archived(warc_file, asset_url):
    """
    Report whether the WARC holds a response record for asset_url.

    The file is scanned from the start on every call; the scan stops at the
    first 'response' record whose WARC-Target-URI equals asset_url exactly.
    Returns True on a match, otherwise False.
    """
    with open(warc_file, 'rb') as stream:
        return any(
            record.rec_type == 'response'
            and record.rec_headers.get_header('WARC-Target-URI') == asset_url
            for record in ArchiveIterator(stream)
        )

 def is_error_page(html):
    """
    Keyword-based check for 'soft 404' or other error content.

    Args:
        html: Page text; None or an empty string is treated as "not an error".

    Returns:
        True if the lower-cased text contains any of the error keywords,
        otherwise False. This is a plain substring test, so ordinary pages
        that merely mention e.g. "error" are flagged as well.
    """
    if not html:
        return False
    # Lower-case once rather than once per keyword.
    lowered = html.lower()
    # 'page not found' is not listed separately: 'not found' already matches it.
    error_keywords = ('404', 'not found', 'error')
    return any(keyword in lowered for keyword in error_keywords)

 def compute_quality_score(missing_count, total_assets, time_drift, is_error):
    """
    Simple heuristic producing a score between 0 and 100.

    Args:
        missing_count: Number of referenced assets that are missing.
        total_assets: Number of referenced assets; 0 disables the asset penalty.
        time_drift: Drift in seconds; only its magnitude matters.
        is_error: True if the page was flagged as an error page.

    Returns:
        0 for error pages. Otherwise 100 minus up to 50 points in proportion
        to the share of missing assets, minus one point per day of drift
        (capped at 10), clamped to the range 0..100.
    """
    if is_error:
        return 0
    score = 100
    if total_assets > 0:
        score -= (missing_count / total_assets) * 50
    # Use the magnitude: a negative drift must not turn the penalty into a
    # bonus that cancels out the missing-asset penalty.
    drift_penalty = min(abs(time_drift) / (24 * 3600), 10)
    score -= drift_penalty
    return max(0, min(100, score))



 # --------------------------------------------------------------------------------
 # 4. SIMULATED PIPELINE ORCHESTRATION
 #
 # This section chains the modules to showcase a pipeline approach. 
 # Each module implements the QAModule interface, supporting Strategy-based
 # swaps or future expansions (e.g., advanced drift analyzers).
 # --------------------------------------------------------------------------------

 def run_pipeline(warc_file_path):
    """
    Run the QA steps on the first response record of a WARC and print a report.

    Prints a notice and returns early when get_warc_html() yields no content.
    Otherwise runs MissingAssetChecker, ErrorPageDetector and
    SimpleScoringEngine in that order and prints the collected metadata.
    Returns None in both cases.
    """
    url, html = get_warc_html(warc_file_path)
    if not html:
        print("No HTML found, skipping pipeline.")
        return

    archive_input = ArchiveInput(url, html, extract_assets(html))

    # Order matters: the scoring step reads what the first two steps write.
    stages = (
        MissingAssetChecker(warc_file_path),
        ErrorPageDetector(),
        SimpleScoringEngine(),
    )
    for stage in stages:
        archive_input = stage.execute(archive_input)

    results = archive_input.metadata
    report_lines = (
        f"\nQA Report for: {archive_input.url}",
        f"  Missing Assets: {results.get('missing_assets', [])}",
        f"  Is Error Page? : {results.get('is_error', False)}",
        f"  Quality Score : {results.get('quality_score', 'N/A')}",
    )
    for line in report_lines:
        print(line)


 # --------------------------------------------------------------------------------
 # 5. DEMONSTRATION
 #
 # Demonstrates how the pipeline is run for feasibility testing.
 # --------------------------------------------------------------------------------

 if __name__ == "__main__":
    # Run the demonstration pipeline against the generated sample file.
    warc_file = 'sample.warc.gz'
    run_pipeline(warc_file)

    # --------------------------------------------------------------------------------
    # 6. RESTful API ENDPOINT USING FASTAPI
    #
    # Experimental stub showing how the pipeline could be exposed as a web
    # service. The endpoint returns a hard-coded report; nothing in this
    # block starts a server for `app`.
    # --------------------------------------------------------------------------------

    app = FastAPI()

    @app.post("/api/v1/archival-quality")
    async def archival_quality(request: Request):
        """Return a stub QA report for the JSON request body."""
        body = await request.json()
        # Read for parity with the intended interface; the stub does not use it.
        warc_path = body.get("warc_file", "sample.warc.gz")
        target_url = body.get("target_url", None)
        original_url = target_url if target_url else "http://example.com/"
        # Everything except original_url is fixed demonstration data.
        return JSONResponse(content={
            "original_url": original_url,
            "quality_score": 78,
            "missing_assets": ["http://example.com/scripts/app.js"],
            "issues": ["Missing script asset", "Temporal drift detected (stub)"],
        })
	import io
	import datetime
	import statistics

	from warcio.warcwriter import WARCWriter
	from warcio.statusandheaders import StatusAndHeaders
	from warcio.archiveiterator import ArchiveIterator
	from bs4 import BeautifulSoup
	from fastapi import FastAPI, Request
	from fastapi.responses import JSONResponse
	import uvicorn
	from playwright.sync_api import sync_playwright


	# --------------------------------------------------------------------------------
	# 1. A SAMPLE WARC FILE GENERATION
	#
	# This section generates a synthetic WARC file to mimic common archival scenarios:
	# - Main HTML page referencing a stylesheet, image, and script
	# - The script record is deliberately omitted to test missing-asset handling
	# - A soft 404 page is included for error detection experiments
	# --------------------------------------------------------------------------------

	# Synthetic payloads for the sample WARC. The main page references a
	# stylesheet, an image and a script; no record is written for the script,
	# so it shows up as a missing asset.
	main_html = """
	<html>
	<head>
	<title>Test Page</title>
	<link rel="stylesheet" href="http://example.com/styles/main.css">
	</head>
	<body>
	<img src="http://example.com/images/logo.png" alt="Logo">
	<script src="http://example.com/scripts/app.js"></script>
	<p>Welcome to the test page.</p>
	</body>
	</html>
	""".strip()

	css_content = "body { background: #fff; font-family: sans-serif; }"
	image_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
	# Served with a 200 status below, i.e. a "soft 404".
	soft404_html = """
	<html>
	<head><title>Page Not Found</title></head>
	<body>
	<h1>404 - Page Not Found</h1>
	<p>Sorry, the page you are looking for does not exist.</p>
	</body>
	</html>
	""".strip()

	# (target URI, Content-Type, payload bytes) for every record, in write order.
	# Text is encoded exactly once so the declared length is the *byte* length of
	# the payload; len() of the str only matches for pure-ASCII content.
	_sample_records = [
	    ("http://example.com/", 'text/html', main_html.encode('utf-8')),
	    ("http://example.com/styles/main.css", 'text/css', css_content.encode('utf-8')),
	    ("http://example.com/images/logo.png", 'image/png', image_content),
	    ("http://example.com/missing", 'text/html', soft404_html.encode('utf-8')),
	]

	# NOTE: this is top-level code, so the file is (re)written whenever it runs.
	# The body of the with-statement had lost its indentation; it is restored here.
	with open('sample.warc.gz', 'wb') as output:
	    writer = WARCWriter(output, gzip=True)

	    for _record_uri, _content_type, _body in _sample_records:
	        stream = io.BytesIO(_body)
	        http_headers = StatusAndHeaders(
	            '200 OK',
	            [('Content-Type', _content_type)],
	            protocol='HTTP/1.0'
	        )
	        record = writer.create_warc_record(
	            _record_uri, "response",
	            payload=stream,
	            length=len(_body),
	            http_headers=http_headers
	        )
	        writer.write_record(record)

	print("Sample WARC file 'sample.warc.gz' has been created.")


	# --------------------------------------------------------------------------------
	# 2. MODULES FOR PIPELINE + STRATEGY PATTERN
	#
	# The following classes simulate discrete analysis tasks, each focusing on
	# a specific aspect of archival quality. These tasks can be chained in a
	# pipeline, demonstrating modular strategy usage for future extensibility.
	# --------------------------------------------------------------------------------

	class ArchiveInput:
	    """
	    Represents data extracted from a WARC record, along with
	    any metadata produced by analysis modules.

	    Attributes:
	        url: URL associated with the page.
	        html: Page content as text.
	        assets: List of asset entries; defaults to an empty list.
	        metadata: Dict of analysis results; defaults to an empty dict.
	    """

	    def __init__(self, url, html, assets=None, metadata=None):
	        self.url = url
	        self.html = html
	        # Falsy arguments are replaced by fresh containers, so instances
	        # never share a default list or dict.
	        self.assets = assets or []
	        self.metadata = metadata or {}

	class QAModule:
	    """
	    Defines a generic interface for QA analysis modules.

	    Subclasses override execute(), which receives an ArchiveInput and
	    returns an ArchiveInput.
	    """

	    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
	        raise NotImplementedError("Subclasses must implement execute().")

	class MissingAssetChecker(QAModule):
	    """
	    Checks whether each referenced asset is present in the WARC file.

	    The URLs of assets without a record are stored as a list under
	    metadata['missing_assets'].
	    """

	    def __init__(self, warc_file):
	        # Path of the WARC file handed to check_asset_archived().
	        self.warc_file = warc_file

	    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
	        # Each asset is an (asset_type, asset_url) pair; only the URL is used.
	        missing = [
	            asset_url
	            for _asset_type, asset_url in archive_input.assets
	            if not check_asset_archived(self.warc_file, asset_url)
	        ]
	        archive_input.metadata['missing_assets'] = missing
	        return archive_input

	class ErrorPageDetector(QAModule):
	    """
	    Flags a page as an 'error' if certain keywords suggest a soft 404 or similar.

	    The boolean result of is_error_page() is stored under metadata['is_error'].
	    """

	    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
	        archive_input.metadata['is_error'] = is_error_page(archive_input.html)
	        return archive_input

	class SimpleScoringEngine(QAModule):
	    """
	    Produces a quality score based on missing assets and error detection.

	    Reads metadata['missing_assets'] (default: empty list) and
	    metadata['is_error'] (default: False) and stores the result under
	    metadata['quality_score'].
	    """

	    def execute(self, archive_input: ArchiveInput) -> ArchiveInput:
	        missing_assets = archive_input.metadata.get('missing_assets', [])
	        is_error = archive_input.metadata.get('is_error', False)
	        total_assets = len(archive_input.assets)
	        # Time drift is always passed as 0 by this step.
	        archive_input.metadata['quality_score'] = compute_quality_score(
	            missing_count=len(missing_assets),
	            total_assets=total_assets,
	            time_drift=0,
	            is_error=is_error
	        )
	        return archive_input


	# --------------------------------------------------------------------------------
	# 3. SUPPORTING FUNCTIONS
	#
	# These utility functions handle WARC reading, asset extraction, and the
	# minimal logic for error detection and scoring.
	# --------------------------------------------------------------------------------

	def get_warc_html(warc_file, target_url=None):
	    """
	    Return the URL and decoded body of the first matching response record.

	    Args:
	        warc_file: Path of the WARC file to read.
	        target_url: Optional filter; when given, only records whose
	            WARC-Target-URI contains this string (substring match) qualify.

	    Returns:
	        A (url, text) tuple for the first qualifying 'response' record, or
	        (None, None) when no record qualifies. The record's content type is
	        not inspected, so the text is HTML only if that record holds HTML.
	    """
	    with open(warc_file, 'rb') as stream:
	        for record in ArchiveIterator(stream):
	            if record.rec_type != 'response':
	                continue
	            url = record.rec_headers.get_header('WARC-Target-URI')
	            # A record without a target URI can never satisfy the filter;
	            # skip it instead of letting `in` raise TypeError on None.
	            if target_url and (url is None or target_url not in url):
	                continue
	            payload = record.content_stream().read()
	            try:
	                html = payload.decode('utf-8')
	            except UnicodeDecodeError:
	                # latin1 maps every byte, so this fallback cannot fail.
	                html = payload.decode('latin1')
	            return url, html
	    return None, None

	def extract_assets(html):
	    """
	    Parses HTML to identify URLs for images, stylesheets, and scripts.

	    Returns a list of (asset_type, url) tuples: every <img src>, then every
	    <link rel="stylesheet" href>, then every <script src>. Tags whose
	    attribute is missing or empty are skipped. URLs are returned exactly as
	    written in the markup.
	    """
	    soup = BeautifulSoup(html, 'html.parser')
	    assets = []
	    for img in soup.find_all('img'):
	        src = img.get('src')
	        if src:
	            assets.append(('image', src))
	    for link in soup.find_all('link', rel='stylesheet'):
	        href = link.get('href')
	        if href:
	            assets.append(('stylesheet', href))
	    for script in soup.find_all('script'):
	        src = script.get('src')
	        if src:
	            assets.append(('script', src))
	    return assets

	def check_asset_archived(warc_file, asset_url):
	    """
	    Searches the WARC to confirm that asset_url is present.

	    The file is scanned from the start on every call. Returns True as soon
	    as a 'response' record whose WARC-Target-URI equals asset_url exactly is
	    found, otherwise False.
	    """
	    with open(warc_file, 'rb') as stream:
	        for record in ArchiveIterator(stream):
	            if record.rec_type == 'response':
	                url = record.rec_headers.get_header('WARC-Target-URI')
	                if url == asset_url:
	                    return True
	    return False

	def is_error_page(html):
	    """
	    Keyword-based check for 'soft 404' or other error content.

	    Args:
	        html: Page text; None or an empty string is treated as "not an error".

	    Returns:
	        True if the lower-cased text contains any of the error keywords,
	        otherwise False. This is a plain substring test, so ordinary pages
	        that merely mention e.g. "error" are flagged as well.
	    """
	    if not html:
	        return False
	    # Lower-case once rather than once per keyword.
	    lowered = html.lower()
	    # 'page not found' is not listed separately: 'not found' already matches it.
	    error_keywords = ('404', 'not found', 'error')
	    return any(keyword in lowered for keyword in error_keywords)

	def compute_quality_score(missing_count, total_assets, time_drift, is_error):
	    """
	    Simple heuristic producing a score between 0 and 100.

	    Args:
	        missing_count: Number of referenced assets that are missing.
	        total_assets: Number of referenced assets; 0 disables the asset penalty.
	        time_drift: Drift in seconds; only its magnitude matters.
	        is_error: True if the page was flagged as an error page.

	    Returns:
	        0 for error pages. Otherwise 100 minus up to 50 points in proportion
	        to the share of missing assets, minus one point per day of drift
	        (capped at 10), clamped to the range 0..100.
	    """
	    if is_error:
	        return 0
	    score = 100
	    if total_assets > 0:
	        score -= (missing_count / total_assets) * 50
	    # Use the magnitude: a negative drift must not turn the penalty into a
	    # bonus that cancels out the missing-asset penalty.
	    drift_penalty = min(abs(time_drift) / (24 * 3600), 10)
	    score -= drift_penalty
	    return max(0, min(100, score))



	# --------------------------------------------------------------------------------
	# 4. SIMULATED PIPELINE ORCHESTRATION
	#
	# This section chains the modules to showcase a pipeline approach.
	# Each module implements the QAModule interface, supporting Strategy-based
	# swaps or future expansions (e.g., advanced drift analyzers).
	# --------------------------------------------------------------------------------

	def run_pipeline(warc_file_path):
	    """
	    Run the QA steps on the first response record of a WARC and print a report.

	    Prints a notice and returns early when get_warc_html() yields no content.
	    Otherwise runs MissingAssetChecker, ErrorPageDetector and
	    SimpleScoringEngine in that order and prints the collected metadata.
	    Returns None in both cases.
	    """
	    # Extract main HTML and find asset references.
	    url, html = get_warc_html(warc_file_path)
	    if not html:
	        print("No HTML found, skipping pipeline.")
	        return

	    assets = extract_assets(html)
	    archive_input = ArchiveInput(url, html, assets)

	    # Order matters: the scoring step reads what the first two steps write.
	    pipeline_modules = [
	        MissingAssetChecker(warc_file_path),
	        ErrorPageDetector(),
	        SimpleScoringEngine()
	    ]

	    # Execute modules in sequence.
	    for module in pipeline_modules:
	        archive_input = module.execute(archive_input)

	    # Show final results.
	    print(f"\nQA Report for: {archive_input.url}")
	    print(f" Missing Assets: {archive_input.metadata.get('missing_assets', [])}")
	    print(f" Is Error Page? : {archive_input.metadata.get('is_error', False)}")
	    print(f" Quality Score : {archive_input.metadata.get('quality_score', 'N/A')}")


	# --------------------------------------------------------------------------------
	# 5. DEMONSTRATION
	#
	# Demonstrates how the pipeline is run for feasibility testing.
	# --------------------------------------------------------------------------------

	if __name__ == "__main__":
	    # Run the demonstration pipeline against the generated sample file.
	    warc_file = 'sample.warc.gz'
	    run_pipeline(warc_file)

	    # --------------------------------------------------------------------------------
	    # 6. RESTful API ENDPOINT USING FASTAPI
	    #
	    # A simple endpoint is provided to demonstrate how this approach
	    # could be exposed as a web service. This is an experimental stub
	    # returning hard-coded results for now; nothing in this block starts
	    # a server for `app`.
	    # --------------------------------------------------------------------------------

	    app = FastAPI()

	    @app.post("/api/v1/archival-quality")
	    async def archival_quality(request: Request):
	        """Return a stub QA report for the JSON request body."""
	        data = await request.json()
	        # Read for parity with the intended interface; the stub does not use it.
	        warc_path = data.get("warc_file", "sample.warc.gz")
	        target_url = data.get("target_url", None)
	        # Demonstration: ignoring target_url in this simplified pipeline
	        # and returning a stub QA report
	        report = {
	            "original_url": target_url or "http://example.com/",
	            "quality_score": 78,
	            "missing_assets": ["http://example.com/scripts/app.js"],
	            "issues": ["Missing script asset", "Temporal drift detected (stub)"]
	        }
	        return JSONResponse(content=report)