Created
January 23, 2025 10:46
-
-
Save ezynda3/3d95afaaacfbb87ec19b937da731c646 to your computer and use it in GitHub Desktop.
Crawl4AI OpenAPI Spec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# OpenAPI description of the Crawl4AI HTTP API.
openapi: 3.0.3
info:
  title: Crawl4AI API
  version: 1.0.0
  description: API for web crawling and content extraction
servers:
  # Relative server URL: it is resolved against the location this document is served from.
  - url: /
# Reusable schema and security definitions referenced via `$ref` from the operations.
components:
  schemas:
    # Token counters; every counter defaults to 0.
    TokenUsage:
      type: object
      properties:
        completion_tokens:
          type: integer
          default: 0
        prompt_tokens:
          type: integer
          default: 0
        total_tokens:
          type: integer
          default: 0
        completion_tokens_details:
          type: object
        prompt_tokens_details:
          type: object

    # A single media entry (`type` defaults to "image").
    MediaItem:
      type: object
      properties:
        src:
          type: string
          default: ""
        alt:
          type: string
        desc:
          type: string
        score:
          type: integer
        # Property literally named `type`, not the JSON Schema keyword.
        type:
          type: string
          default: "image"
        group_id:
          type: integer
        # Property literally named `format`, not the JSON Schema keyword.
        format:
          type: string
        width:
          type: integer

    # A single hyperlink entry.
    Link:
      type: object
      properties:
        href:
          type: string
          default: ""
        text:
          type: string
        title:
          type: string
        base_domain:
          type: string

    # Media entries grouped by kind.
    Media:
      type: object
      properties:
        images:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
        videos:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
        audios:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []

    # Links split into internal and external lists.
    Links:
      type: object
      properties:
        internal:
          type: array
          items:
            $ref: '#/components/schemas/Link'
          default: []
        external:
          type: array
          items:
            $ref: '#/components/schemas/Link'
          default: []

    # Markdown output variants; the three required fields are always present.
    MarkdownGenerationResult:
      type: object
      required:
        - raw_markdown
        - markdown_with_citations
        - references_markdown
      properties:
        raw_markdown:
          type: string
        markdown_with_citations:
          type: string
        references_markdown:
          type: string
        fit_markdown:
          type: string
        fit_html:
          type: string

    # Per-task dispatch bookkeeping (memory figures and timestamps).
    DispatchResult:
      type: object
      required:
        - task_id
        - memory_usage
        - peak_memory
        - start_time
        - end_time
      properties:
        task_id:
          type: string
        memory_usage:
          type: number
          format: float
        peak_memory:
          type: number
          format: float
        start_time:
          type: string
          format: date-time
        end_time:
          type: string
          format: date-time
        error_message:
          type: string
          default: ""

    TaskStatus:
      type: string
      enum: [pending, processing, completed, failed]

    CrawlerType:
      type: string
      enum: [basic, llm, cosine, json_css]

    # Extraction strategy selector plus free-form strategy parameters.
    ExtractionConfig:
      type: object
      properties:
        type:
          $ref: '#/components/schemas/CrawlerType'
        params:
          type: object
          additionalProperties: true
          default: {}

    ChunkingStrategy:
      type: object
      properties:
        type:
          type: string
        params:
          type: object
          additionalProperties: true
          default: {}

    ContentFilter:
      type: object
      properties:
        type:
          type: string
          default: bm25
        params:
          type: object
          additionalProperties: true
          default: {}

    # Request body accepted by the crawl endpoints; only `urls` is required.
    CrawlRequest:
      type: object
      required:
        - urls
      properties:
        # Either a single URL or a list of URLs.
        urls:
          oneOf:
            - type: string
              format: uri
            - type: array
              items:
                type: string
                format: uri
        word_count_threshold:
          type: integer
          default: 100
        extraction_config:
          $ref: '#/components/schemas/ExtractionConfig'
        chunking_strategy:
          $ref: '#/components/schemas/ChunkingStrategy'
        content_filter:
          $ref: '#/components/schemas/ContentFilter'
        js_code:
          type: array
          items:
            type: string
        wait_for:
          type: string
        css_selector:
          type: string
        screenshot:
          type: boolean
          default: false
        magic:
          type: boolean
          default: false
        extra:
          type: object
          additionalProperties: true
        session_id:
          type: string
        cache_mode:
          type: string
          # The enum previously listed only `enabled` and `disabled`, which made the
          # request examples in this document that send "bypass" fail validation.
          # Extended (backward-compatible) to the Crawl4AI CacheMode values.
          enum: [enabled, disabled, read_only, write_only, bypass]
          default: enabled
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        ttl:
          type: integer
          default: 3600
        crawler_params:
          type: object
          additionalProperties: true
          default: {}

    # Result of crawling one URL; `url`, `html` and `success` are always present.
    CrawlResult:
      type: object
      required:
        - url
        - html
        - success
      properties:
        url:
          type: string
        html:
          type: string
        success:
          type: boolean
        cleaned_html:
          type: string
        # NOTE(review): `media` and `links` are typed as loose maps of object arrays.
        # The Media / Links schemas above look like the intended shapes; kept loose
        # here so existing payloads stay valid. Confirm before tightening.
        media:
          type: object
          additionalProperties:
            type: array
            items:
              type: object
          default: {}
        links:
          type: object
          additionalProperties:
            type: array
            items:
              type: object
          default: {}
        downloaded_files:
          type: array
          items:
            type: string
        screenshot:
          type: string
        # NOTE(review): `format: binary` cannot be carried verbatim in a JSON body;
        # OpenAPI uses `format: byte` for base64 text. Confirm how the server
        # encodes PDF bytes before changing this.
        pdf:
          type: string
          format: binary
        # Either plain markdown text or the structured result object.
        markdown:
          oneOf:
            - type: string
            - $ref: '#/components/schemas/MarkdownGenerationResult'
        markdown_v2:
          $ref: '#/components/schemas/MarkdownGenerationResult'
        fit_markdown:
          type: string
        fit_html:
          type: string
        extracted_content:
          type: string
        metadata:
          type: object
        error_message:
          type: string
        session_id:
          type: string
        response_headers:
          type: object
        status_code:
          type: integer
        ssl_certificate:
          type: object
        dispatch_result:
          $ref: '#/components/schemas/DispatchResult'
        redirected_url:
          type: string

    # Task envelope: `result` holds a single CrawlResult, `results` a list of them.
    TaskResponse:
      type: object
      required:
        - status
        - created_at
      properties:
        status:
          $ref: '#/components/schemas/TaskStatus'
        created_at:
          type: number
          format: float
        result:
          $ref: '#/components/schemas/CrawlResult'
        results:
          type: array
          items:
            $ref: '#/components/schemas/CrawlResult'
        error:
          type: string

    HealthCheckResponse:
      type: object
      properties:
        status:
          type: string
        available_slots:
          type: integer
        memory_usage:
          type: number
        cpu_usage:
          type: number

  securitySchemes:
    # HTTP bearer-token authentication (Authorization: Bearer <token>).
    BearerAuth:
      type: http
      scheme: bearer
# HTTP operations. Every operation except `/` and `/health` requires BearerAuth.
paths:
  /:
    get:
      summary: Root endpoint
      responses:
        '200':
          description: Service status or redirect to documentation
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string

  /crawl:
    post:
      summary: Submit a crawling task
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
            # NOTE(review): several examples send fields that CrawlRequest does not
            # declare (`bypass_cache`, `verbose`, `js_only`, `markdown_generator`,
            # `exclude_external_images`). They validate only because the schema does
            # not forbid additional properties; confirm the server honours them.
            examples:
              basic:
                summary: Basic crawl with screenshot
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
              multipleUrls:
                summary: Crawl multiple URLs
                value:
                  urls:
                    - "https://example.com"
                    - "https://python.org"
                    - "https://github.com"
                    - "https://stackoverflow.com"
                    - "https://news.ycombinator.com"
                  word_count_threshold: 100
                  bypass_cache: true
                  verbose: true
              asyncPagination:
                summary: Paginated crawl with dynamic content
                value:
                  urls: ["https://github.com/microsoft/TypeScript/commits/main"]
                  session_id: "typescript_commits_session"
                  css_selector: "li.Box-sc-g0xbh4-0"
                  js_code: ["(() => { const button = document.querySelector('a[data-testid=\"pagination-next-button\"]'); if (button) button.click(); })();"]
                  wait_for: "() => { const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); if (commits.length === 0) return false; const firstCommit = commits[0].textContent.trim(); return firstCommit !== window.firstCommit; }"
                  js_only: true
                  cache_mode: "bypass"
                  extraction_config:
                    type: "json_css"
                    params:
                      schema:
                        name: "Commit Extractor"
                        baseSelector: "li.Box-sc-g0xbh4-0"
                        fields:
                          - name: "title"
                            selector: "h4.markdown-title"
                            type: "text"
                            transform: "strip"
              markdownPlus:
                summary: Enhanced markdown generation with content filtering
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  word_count_threshold: 0
                  # NOTE(review): nesting below was reconstructed from a flattened
                  # source; the threshold settings are assumed to belong to
                  # `content_filter`. Confirm against the server's expected shape.
                  markdown_generator:
                    type: "default"
                    params:
                      content_filter:
                        type: "pruning"
                        threshold: 0.48
                        threshold_type: "fixed"
                        min_word_threshold: 0
                  cache_mode: "bypass"
              cssSelector:
                summary: Filter content using CSS selector
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  css_selector: "article"
                  screenshot: true
              jsExecution:
                summary: Execute JavaScript before extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
                  js_code: ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
              cosineStrategy:
                summary: Use cosine similarity extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  extraction_config:
                    type: "cosine"
                    params:
                      semantic_filter: "inflation rent prices"
              llmStrategy:
                summary: LLM-based extraction with translation
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  extraction_config:
                    type: "llm"
                    params:
                      provider: "groq/llama3-8b-8192"
                      instruction: "I am interested in only financial news, and translate them in French."
              sessionBasedCrawl:
                summary: Session-based crawling with pagination
                value:
                  urls: ["https://example.com/paged-content"]
                  session_id: "page_navigation_session"
                  js_code: ["document.querySelector('.next-page-button').click();"]
                  css_selector: ".content-section"
              mediaHandling:
                summary: Media handling and image extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
                  exclude_external_images: false
      responses:
        '200':
          description: Task created successfully
          content:
            application/json:
              schema:
                type: object
                properties:
                  task_id:
                    type: string
                    format: uuid

  /task/{task_id}:
    get:
      summary: Get task status and results
      security:
        - BearerAuth: []
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
            format: uuid
      responses:
        '200':
          description: Task information
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
        '404':
          description: Task not found

  /crawl_sync:
    post:
      summary: Synchronous crawling endpoint
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
      responses:
        '200':
          description: Crawling results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
              examples:
                multipleUrlsResponse:
                  summary: Response for multiple URLs crawl
                  # The example now satisfies the referenced schemas: it carries the
                  # required `created_at` (TaskResponse) and `html` (CrawlResult), and
                  # link/media entries are objects, as the CrawlResult schema requires.
                  value:
                    status: "completed"
                    created_at: 1737629160.0
                    results:
                      - url: "https://example.com"
                        html: "<html>...</html>"
                        success: true
                        markdown: "Example Domain..."
                        metadata:
                          title: "Example Domain"
                        links:
                          internal: []
                          external: []
                        media:
                          images: []
                      - url: "https://python.org"
                        html: "<html>...</html>"
                        success: true
                        markdown: "Welcome to Python..."
                        metadata:
                          title: "Welcome to Python.org"
                        links:
                          internal:
                            - href: "https://python.org/about/"
                              text: "About"
                            - href: "https://python.org/downloads/"
                              text: "Downloads"
                          external:
                            - href: "https://docs.example.org/"
                              text: "External docs"
                            - href: "https://blog.example.org/"
                              text: "External blog"
                        media:
                          images:
                            - src: "https://python.org/static/img/logo-1.png"
                              alt: "Logo"
                              type: "image"
                            - src: "https://python.org/static/img/logo-2.png"
                              alt: "Banner"
                              type: "image"
        '408':
          description: Task timed out
        '500':
          description: Internal server error

  /crawl_direct:
    post:
      summary: Direct crawling endpoint without task queue
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
      responses:
        '200':
          description: Crawling results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
        '500':
          description: Internal server error

  /health:
    get:
      summary: Service health check
      responses:
        '200':
          description: Health check response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthCheckResponse'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment