|
"""
title: Cloudflare Browser Rendering
author: jerieljan
version: 0.3
license: MIT License
description: Enables LLMs to read webpages by processing them into Markdown with the Cloudflare Browser Rendering API
"""
|
|
|
import asyncio
from typing import Optional, Callable, Any, Dict, List

import requests
from pydantic import BaseModel, Field
|
|
|
|
|
class Tools:
    """Tools that turn a webpage URL or raw HTML into Markdown.

    Both tools POST to the Cloudflare Browser Rendering ``markdown`` endpoint
    using the account ID, API token and base URL stored in ``self.valves``.
    """

    class Valves(BaseModel):
        # Settings read on every tool call. Both credentials default to ""
        # and are checked for emptiness before any request is sent.
        CLOUDFLARE_ACCOUNT_ID: str = Field(
            default="", description="The Cloudflare Account ID"
        )
        CLOUDFLARE_API_TOKEN: str = Field(
            default="", description="The API token to access Cloudflare services"
        )
        CLOUDFLARE_API_BASE_URL: str = Field(
            default="https://api.cloudflare.com/client/v4",
            description="(Optional) The base URL for Cloudflare API endpoints",
        )

    def __init__(self):
        """Create default valves and declare the two tool specifications."""
        self.valves = self.Valves()
        # NOTE(review): presumably False because each tool emits its own
        # "citation" event -- confirm against the host application.
        self.citation = False
        self.tools = [
            {
                "type": "function",
                "function": {
                    "name": "extract_markdown",
                    "description": "Extract markdown content from a webpage using Cloudflare Browser Rendering",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "url": {
                                "type": "string",
                                "description": "The URL of the webpage to extract markdown from",
                            },
                            "reject_pattern": {
                                "type": "array",
                                "items": {"type": "string"},
                                "description": "Optional regex patterns to reject certain requests (e.g., CSS files)",
                                "default": [],
                            },
                        },
                        "required": ["url"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "extract_markdown_from_html",
                    "description": "Convert raw HTML content to markdown using Cloudflare Browser Rendering",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "html": {
                                "type": "string",
                                "description": "The raw HTML content to convert to markdown",
                            },
                        },
                        "required": ["html"],
                    },
                },
            },
        ]

    async def extract_markdown(
        self,
        url: str,
        reject_pattern: Optional[List[str]] = None,
        __event_emitter__: Optional[Callable[[Dict], Any]] = None,
    ) -> str:
        """
        Uses the Cloudflare Browser Rendering service to fetch a webpage and provide the content in Markdown format.

        :param url: The URL of the webpage to extract markdown from.
        :param reject_pattern: Optional regex patterns to reject certain requests (e.g., CSS files).
        :param __event_emitter__: Optional awaitable callback that receives status and citation events.
        :return: The Markdown content, or an error message string if the request or conversion fails.
        :raises Exception: If the API token or account ID valve is empty.
        """
        _require_credentials(self.valves)

        payload: Dict[str, Any] = {"url": url}
        if reject_pattern:
            # Only sent when the caller supplied at least one pattern.
            payload["rejectRequestPattern"] = reject_pattern

        return await _render_markdown(
            self.valves,
            payload,
            __event_emitter__,
            action="extracting markdown",
            start_message=f"Extracting markdown from: {url}...",
            done_message="Markdown extraction completed successfully",
            citation_metadata={"source": url},
            citation_source={"name": url, "url": url},
        )

    async def extract_markdown_from_html(
        self,
        html: str,
        __event_emitter__: Optional[Callable[[Dict], Any]] = None,
    ) -> str:
        """
        Uses the Cloudflare Browser Rendering service to process the user's provided HTML code and reformat it to Markdown.

        :param html: The raw HTML content to convert to markdown.
        :param __event_emitter__: Optional awaitable callback that receives status and citation events.
        :return: The Markdown content, or an error message string if the request or conversion fails.
        :raises Exception: If the API token or account ID valve is empty.
        """
        _require_credentials(self.valves)

        return await _render_markdown(
            self.valves,
            {"html": html},
            __event_emitter__,
            action="converting HTML to markdown",
            start_message="Converting HTML to markdown...",
            done_message="HTML to markdown conversion completed successfully",
            citation_metadata={"source": "Raw HTML Content"},
            citation_source={"name": "HTML Content"},
        )


# ---------------------------------------------------------------------------
# Private helpers shared by both tool methods. They are module-level functions
# (not methods) so the callable surface of ``Tools`` stays limited to the two
# tool entry points above.
# ---------------------------------------------------------------------------


def _require_credentials(valves: "Tools.Valves") -> None:
    """Raise ``Exception`` if the API token or the account ID valve is empty."""
    if not valves.CLOUDFLARE_API_TOKEN:
        raise Exception("CLOUDFLARE_API_TOKEN not provided in valves")
    if not valves.CLOUDFLARE_ACCOUNT_ID:
        raise Exception("CLOUDFLARE_ACCOUNT_ID not provided in valves")


async def _emit_status(
    event_emitter: Optional[Callable[[Dict], Any]],
    description: str,
    status: str = "in_progress",
    done: bool = False,
) -> None:
    """Send a ``status`` event to ``event_emitter``; do nothing if it is unset."""
    if event_emitter:
        await event_emitter(
            {
                "type": "status",
                "data": {
                    "description": description,
                    "status": status,
                    "done": done,
                },
            }
        )


def _first_error_message(result: Dict[str, Any]) -> str:
    """Return a readable message for the first entry of ``result["errors"]``.

    Falls back to ``"Unknown error occurred"`` when the list is missing, null
    or empty (indexing an empty list previously raised ``IndexError``).
    """
    errors = result.get("errors") or []
    if not errors:
        return "Unknown error occurred"

    first = errors[0]
    # NOTE(review): Cloudflare v4 responses are expected to report errors as
    # objects with "code"/"message" keys -- other shapes are stringified as-is.
    if isinstance(first, dict) and first.get("message"):
        code = first.get("code")
        if code is not None:
            return f"{first['message']} (code {code})"
        return str(first["message"])
    return str(first)


async def _post_markdown(
    valves: "Tools.Valves", payload: Dict[str, Any]
) -> Dict[str, Any]:
    """POST ``payload`` to the Browser Rendering markdown endpoint and return the decoded JSON.

    The blocking ``requests.post`` call runs in the event loop's default
    executor so other coroutines keep running while the HTTP request is in
    flight. Exceptions raised by ``requests`` propagate to the awaiting caller.
    """
    endpoint = (
        f"{valves.CLOUDFLARE_API_BASE_URL}/accounts/"
        f"{valves.CLOUDFLARE_ACCOUNT_ID}/browser-rendering/markdown"
    )
    headers = {
        "Authorization": f"Bearer {valves.CLOUDFLARE_API_TOKEN}",
        "Content-Type": "application/json",
    }

    def send() -> Dict[str, Any]:
        response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        return response.json()

    return await asyncio.get_running_loop().run_in_executor(None, send)


async def _render_markdown(
    valves: "Tools.Valves",
    payload: Dict[str, Any],
    event_emitter: Optional[Callable[[Dict], Any]],
    *,
    action: str,
    start_message: str,
    done_message: str,
    citation_metadata: Dict[str, Any],
    citation_source: Dict[str, Any],
) -> str:
    """Run one markdown conversion and report progress through ``event_emitter``.

    Emits a "processing" status, performs the request, then either emits a
    citation plus a "complete" status and returns the Markdown, or emits an
    "error" status and returns the error text. Request and conversion failures
    are returned as strings rather than raised.

    :param action: Phrase inserted into error strings, e.g. "extracting markdown".
    :param start_message: Description for the initial "processing" status.
    :param done_message: Description for the final "complete" status.
    :param citation_metadata: Single metadata entry for the citation event.
    :param citation_source: ``source`` mapping for the citation event.
    """
    await _emit_status(event_emitter, start_message, "processing")

    try:
        result = await _post_markdown(valves, payload)

        if not result.get("success", False):
            error_msg = _first_error_message(result)
            await _emit_status(
                event_emitter, f"Error: {error_msg}", status="error", done=True
            )
            return f"Error {action}: {error_msg}"

        # A null "result" is normalised to "" so the return type stays str.
        markdown_content = result.get("result") or ""

        if event_emitter:
            await event_emitter(
                {
                    "type": "citation",
                    "data": {
                        "document": [markdown_content],
                        "metadata": [citation_metadata],
                        "source": citation_source,
                    },
                }
            )

        await _emit_status(event_emitter, done_message, status="complete", done=True)
        return markdown_content

    except requests.exceptions.RequestException as e:
        error_msg = f"Network error {action}: {e}"
        await _emit_status(event_emitter, error_msg, status="error", done=True)
        return error_msg
    except Exception as e:
        # Boundary handler: any other failure is reported as text, not raised.
        error_msg = f"Error {action}: {e}"
        await _emit_status(event_emitter, error_msg, status="error", done=True)
        return error_msg