Skip to content

Instantly share code, notes, and snippets.

@ezynda3
Created January 23, 2025 10:46
Show Gist options
  • Save ezynda3/3d95afaaacfbb87ec19b937da731c646 to your computer and use it in GitHub Desktop.
Crawl4AI OpenAPI Spec
---
# Crawl4AI REST API — OpenAPI 3.0.3 specification header.
openapi: 3.0.3
info:
  title: Crawl4AI API
  version: 1.0.0
  description: API for web crawling and content extraction
servers:
  # Served relative to wherever the API is mounted.
  - url: /
components:
  schemas:
    # LLM token accounting attached to extraction results.
    TokenUsage:
      type: object
      properties:
        completion_tokens:
          type: integer
          default: 0
        prompt_tokens:
          type: integer
          default: 0
        total_tokens:
          type: integer
          default: 0
        completion_tokens_details:
          type: object
        prompt_tokens_details:
          type: object
    # A single extracted media asset (image, video, or audio).
    MediaItem:
      type: object
      properties:
        src:
          type: string
          default: ""
        alt:
          type: string
        desc:
          type: string
        score:
          type: integer
        type:
          type: string
          default: "image"
        group_id:
          type: integer
        format:
          type: string
        width:
          type: integer
    # A hyperlink discovered on the crawled page.
    Link:
      type: object
      properties:
        href:
          type: string
          default: ""
        text:
          type: string
        title:
          type: string
        base_domain:
          type: string
    # Media assets grouped by kind.
    Media:
      type: object
      properties:
        images:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
        videos:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
        audios:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
    # Links grouped by whether they stay on the crawled domain.
    Links:
      type: object
      properties:
        internal:
          type: array
          items:
            $ref: '#/components/schemas/Link'
          default: []
        external:
          type: array
          items:
            $ref: '#/components/schemas/Link'
          default: []
    # Markdown renditions produced by the markdown generator.
    MarkdownGenerationResult:
      type: object
      required:
        - raw_markdown
        - markdown_with_citations
        - references_markdown
      properties:
        raw_markdown:
          type: string
        markdown_with_citations:
          type: string
        references_markdown:
          type: string
        fit_markdown:
          type: string
        fit_html:
          type: string
    # Per-task resource accounting from the crawl dispatcher.
    DispatchResult:
      type: object
      required:
        - task_id
        - memory_usage
        - peak_memory
        - start_time
        - end_time
      properties:
        task_id:
          type: string
        memory_usage:
          type: number
          format: float
        peak_memory:
          type: number
          format: float
        start_time:
          type: string
          format: date-time
        end_time:
          type: string
          format: date-time
        error_message:
          type: string
          default: ""
    # Lifecycle state of an asynchronous crawl task.
    TaskStatus:
      type: string
      enum: [pending, processing, completed, failed]
    # Extraction strategy selector.
    CrawlerType:
      type: string
      enum: [basic, llm, cosine, json_css]
    # Strategy-specific extraction configuration.
    ExtractionConfig:
      type: object
      properties:
        type:
          $ref: '#/components/schemas/CrawlerType'
        params:
          type: object
          additionalProperties: true
          default: {}
    # Strategy-specific chunking configuration.
    ChunkingStrategy:
      type: object
      properties:
        type:
          type: string
        params:
          type: object
          additionalProperties: true
          default: {}
    # Content-relevance filter configuration (BM25 by default).
    ContentFilter:
      type: object
      properties:
        type:
          type: string
          default: bm25
        params:
          type: object
          additionalProperties: true
          default: {}
    # Request body shared by all crawl endpoints.
    CrawlRequest:
      type: object
      required:
        - urls
      properties:
        # Accepts either a single URL or a batch of URLs.
        urls:
          oneOf:
            - type: string
              format: uri
            - type: array
              items:
                type: string
                format: uri
        word_count_threshold:
          type: integer
          default: 100
        extraction_config:
          $ref: '#/components/schemas/ExtractionConfig'
        chunking_strategy:
          $ref: '#/components/schemas/ChunkingStrategy'
        content_filter:
          $ref: '#/components/schemas/ContentFilter'
        # JavaScript snippets executed in the page before extraction.
        js_code:
          type: array
          items:
            type: string
        wait_for:
          type: string
        css_selector:
          type: string
        screenshot:
          type: boolean
          default: false
        magic:
          type: boolean
          default: false
        extra:
          type: object
          additionalProperties: true
        session_id:
          type: string
        cache_mode:
          type: string
          enum: [enabled, disabled]
          default: enabled
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        # Result time-to-live in seconds.
        ttl:
          type: integer
          default: 3600
        crawler_params:
          type: object
          additionalProperties: true
          default: {}
    # The result of crawling a single URL.
    CrawlResult:
      type: object
      required:
        - url
        - html
        - success
      properties:
        url:
          type: string
        html:
          type: string
        success:
          type: boolean
        cleaned_html:
          type: string
        media:
          type: object
          additionalProperties:
            type: array
            items:
              type: object
          default: {}
        links:
          type: object
          additionalProperties:
            type: array
            items:
              type: object
          default: {}
        downloaded_files:
          type: array
          items:
            type: string
        screenshot:
          type: string
        pdf:
          type: string
          format: binary
        # Plain string for legacy clients, structured result otherwise.
        markdown:
          oneOf:
            - type: string
            - $ref: '#/components/schemas/MarkdownGenerationResult'
        markdown_v2:
          $ref: '#/components/schemas/MarkdownGenerationResult'
        fit_markdown:
          type: string
        fit_html:
          type: string
        extracted_content:
          type: string
        metadata:
          type: object
        error_message:
          type: string
        session_id:
          type: string
        response_headers:
          type: object
        status_code:
          type: integer
        ssl_certificate:
          type: object
        dispatch_result:
          $ref: '#/components/schemas/DispatchResult'
        redirected_url:
          type: string
    # Envelope returned when polling a task.
    TaskResponse:
      type: object
      required:
        - status
        - created_at
      properties:
        status:
          $ref: '#/components/schemas/TaskStatus'
        # Unix timestamp (seconds, fractional).
        created_at:
          type: number
          format: float
        result:
          $ref: '#/components/schemas/CrawlResult'
        results:
          type: array
          items:
            $ref: '#/components/schemas/CrawlResult'
        error:
          type: string
    HealthCheckResponse:
      type: object
      properties:
        status:
          type: string
        available_slots:
          type: integer
        memory_usage:
          type: number
        cpu_usage:
          type: number
  securitySchemes:
    BearerAuth:
      type: http
      scheme: bearer
paths:
  /:
    get:
      summary: Root endpoint
      responses:
        '200':
          description: Service status or redirect to documentation
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
  /crawl:
    post:
      summary: Submit a crawling task
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
            examples:
              basic:
                summary: Basic crawl with screenshot
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
              multipleUrls:
                summary: Crawl multiple URLs
                value:
                  urls:
                    - "https://example.com"
                    - "https://python.org"
                    - "https://github.com"
                    - "https://stackoverflow.com"
                    - "https://news.ycombinator.com"
                  word_count_threshold: 100
                  bypass_cache: true
                  verbose: true
              asyncPagination:
                summary: Paginated crawl with dynamic content
                value:
                  urls: ["https://github.com/microsoft/TypeScript/commits/main"]
                  session_id: "typescript_commits_session"
                  css_selector: "li.Box-sc-g0xbh4-0"
                  js_code: ["(() => { const button = document.querySelector('a[data-testid=\"pagination-next-button\"]'); if (button) button.click(); })();"]
                  wait_for: "() => { const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); if (commits.length === 0) return false; const firstCommit = commits[0].textContent.trim(); return firstCommit !== window.firstCommit; }"
                  js_only: true
                  cache_mode: "bypass"
                  extraction_config:
                    type: "json_css"
                    params:
                      schema:
                        name: "Commit Extractor"
                        baseSelector: "li.Box-sc-g0xbh4-0"
                        fields:
                          - name: "title"
                            selector: "h4.markdown-title"
                            type: "text"
                            transform: "strip"
              markdownPlus:
                summary: Enhanced markdown generation with content filtering
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  word_count_threshold: 0
                  markdown_generator:
                    type: "default"
                    params:
                      content_filter:
                        type: "pruning"
                        threshold: 0.48
                        threshold_type: "fixed"
                        min_word_threshold: 0
                  cache_mode: "bypass"
              cssSelector:
                summary: Filter content using CSS selector
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  css_selector: "article"
                  screenshot: true
              jsExecution:
                summary: Execute JavaScript before extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
                  js_code: ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
              cosineStrategy:
                summary: Use cosine similarity extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  extraction_config:
                    type: "cosine"
                    params:
                      semantic_filter: "inflation rent prices"
              llmStrategy:
                summary: LLM-based extraction with translation
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  extraction_config:
                    type: "llm"
                    params:
                      provider: "groq/llama3-8b-8192"
                      instruction: "I am interested in only financial news, and translate them in French."
              sessionBasedCrawl:
                summary: Session-based crawling with pagination
                value:
                  urls: ["https://example.com/paged-content"]
                  session_id: "page_navigation_session"
                  js_code: ["document.querySelector('.next-page-button').click();"]
                  css_selector: ".content-section"
              mediaHandling:
                summary: Media handling and image extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
                  exclude_external_images: false
      responses:
        '200':
          description: Task created successfully
          content:
            application/json:
              schema:
                type: object
                properties:
                  task_id:
                    type: string
                    format: uuid
  /task/{task_id}:
    get:
      summary: Get task status and results
      security:
        - BearerAuth: []
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
            format: uuid
      responses:
        '200':
          description: Task information
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
        '404':
          description: Task not found
  /crawl_sync:
    post:
      summary: Synchronous crawling endpoint
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
      responses:
        '200':
          description: Crawling results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
              examples:
                multipleUrlsResponse:
                  summary: Response for multiple URLs crawl
                  value:
                    status: "completed"
                    results:
                      - url: "https://example.com"
                        success: true
                        markdown: "Example Domain..."
                        metadata:
                          title: "Example Domain"
                        links:
                          internal: []
                          external: []
                        media:
                          images: []
                      - url: "https://python.org"
                        success: true
                        markdown: "Welcome to Python..."
                        metadata:
                          title: "Welcome to Python.org"
                        links:
                          internal: ["link1", "link2"]
                          external: ["ext1", "ext2"]
                        media:
                          images: ["img1", "img2"]
        '408':
          description: Task timed out
        '500':
          description: Internal server error
  /crawl_direct:
    post:
      summary: Direct crawling endpoint without task queue
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
      responses:
        '200':
          description: Crawling results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
        '500':
          description: Internal server error
  /health:
    get:
      summary: Service health check
      responses:
        '200':
          description: Health check response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthCheckResponse'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment