Skip to content

Instantly share code, notes, and snippets.

@ezynda3
Created January 23, 2025 10:46
Show Gist options
  • Save ezynda3/3d95afaaacfbb87ec19b937da731c646 to your computer and use it in GitHub Desktop.
Crawl4AI OpenAPI Spec
---
# Crawl4AI REST API — OpenAPI 3.0.3 specification header.
openapi: 3.0.3
info:
  title: Crawl4AI API
  version: 1.0.0
  description: API for web crawling and content extraction
servers:
  # Served relative to wherever the API is mounted.
  - url: /
components:
  schemas:
    # LLM token accounting attached to extraction results.
    TokenUsage:
      type: object
      properties:
        completion_tokens:
          type: integer
          default: 0
        prompt_tokens:
          type: integer
          default: 0
        total_tokens:
          type: integer
          default: 0
        completion_tokens_details:
          type: object
        prompt_tokens_details:
          type: object
    # A single extracted media asset (image, video, or audio).
    MediaItem:
      type: object
      properties:
        src:
          type: string
          default: ""
        alt:
          type: string
        desc:
          type: string
        score:
          type: integer
        type:
          type: string
          default: "image"
        group_id:
          type: integer
        format:
          type: string
        width:
          type: integer
    # A hyperlink discovered on the crawled page.
    Link:
      type: object
      properties:
        href:
          type: string
          default: ""
        text:
          type: string
        title:
          type: string
        base_domain:
          type: string
    # Media assets grouped by kind.
    Media:
      type: object
      properties:
        images:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
        videos:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
        audios:
          type: array
          items:
            $ref: '#/components/schemas/MediaItem'
          default: []
    # Links grouped by whether they stay on the crawled domain.
    Links:
      type: object
      properties:
        internal:
          type: array
          items:
            $ref: '#/components/schemas/Link'
          default: []
        external:
          type: array
          items:
            $ref: '#/components/schemas/Link'
          default: []
    # Markdown renditions produced by the markdown generator.
    MarkdownGenerationResult:
      type: object
      required:
        - raw_markdown
        - markdown_with_citations
        - references_markdown
      properties:
        raw_markdown:
          type: string
        markdown_with_citations:
          type: string
        references_markdown:
          type: string
        fit_markdown:
          type: string
        fit_html:
          type: string
    # Per-task resource accounting from the crawl dispatcher.
    DispatchResult:
      type: object
      required:
        - task_id
        - memory_usage
        - peak_memory
        - start_time
        - end_time
      properties:
        task_id:
          type: string
        memory_usage:
          type: number
          format: float
        peak_memory:
          type: number
          format: float
        start_time:
          type: string
          format: date-time
        end_time:
          type: string
          format: date-time
        error_message:
          type: string
          default: ""
    # Lifecycle state of an asynchronous crawl task.
    TaskStatus:
      type: string
      enum: [pending, processing, completed, failed]
    # Extraction strategy selector.
    CrawlerType:
      type: string
      enum: [basic, llm, cosine, json_css]
    # Strategy-specific extraction configuration.
    ExtractionConfig:
      type: object
      properties:
        type:
          $ref: '#/components/schemas/CrawlerType'
        params:
          type: object
          additionalProperties: true
          default: {}
    # Strategy-specific chunking configuration.
    ChunkingStrategy:
      type: object
      properties:
        type:
          type: string
        params:
          type: object
          additionalProperties: true
          default: {}
    # Content-relevance filter configuration (BM25 by default).
    ContentFilter:
      type: object
      properties:
        type:
          type: string
          default: bm25
        params:
          type: object
          additionalProperties: true
          default: {}
    # Request body shared by all crawl endpoints.
    CrawlRequest:
      type: object
      required:
        - urls
      properties:
        # Accepts either a single URL or a batch of URLs.
        urls:
          oneOf:
            - type: string
              format: uri
            - type: array
              items:
                type: string
                format: uri
        word_count_threshold:
          type: integer
          default: 100
        extraction_config:
          $ref: '#/components/schemas/ExtractionConfig'
        chunking_strategy:
          $ref: '#/components/schemas/ChunkingStrategy'
        content_filter:
          $ref: '#/components/schemas/ContentFilter'
        # JavaScript snippets executed in the page before extraction.
        js_code:
          type: array
          items:
            type: string
        wait_for:
          type: string
        css_selector:
          type: string
        screenshot:
          type: boolean
          default: false
        magic:
          type: boolean
          default: false
        extra:
          type: object
          additionalProperties: true
        session_id:
          type: string
        cache_mode:
          type: string
          enum: [enabled, disabled]
          default: enabled
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        # Result time-to-live in seconds.
        ttl:
          type: integer
          default: 3600
        crawler_params:
          type: object
          additionalProperties: true
          default: {}
    # The result of crawling a single URL.
    CrawlResult:
      type: object
      required:
        - url
        - html
        - success
      properties:
        url:
          type: string
        html:
          type: string
        success:
          type: boolean
        cleaned_html:
          type: string
        media:
          type: object
          additionalProperties:
            type: array
            items:
              type: object
          default: {}
        links:
          type: object
          additionalProperties:
            type: array
            items:
              type: object
          default: {}
        downloaded_files:
          type: array
          items:
            type: string
        screenshot:
          type: string
        pdf:
          type: string
          format: binary
        # Plain string for legacy clients, structured result otherwise.
        markdown:
          oneOf:
            - type: string
            - $ref: '#/components/schemas/MarkdownGenerationResult'
        markdown_v2:
          $ref: '#/components/schemas/MarkdownGenerationResult'
        fit_markdown:
          type: string
        fit_html:
          type: string
        extracted_content:
          type: string
        metadata:
          type: object
        error_message:
          type: string
        session_id:
          type: string
        response_headers:
          type: object
        status_code:
          type: integer
        ssl_certificate:
          type: object
        dispatch_result:
          $ref: '#/components/schemas/DispatchResult'
        redirected_url:
          type: string
    # Envelope returned when polling a task.
    TaskResponse:
      type: object
      required:
        - status
        - created_at
      properties:
        status:
          $ref: '#/components/schemas/TaskStatus'
        # Unix timestamp (seconds, fractional).
        created_at:
          type: number
          format: float
        result:
          $ref: '#/components/schemas/CrawlResult'
        results:
          type: array
          items:
            $ref: '#/components/schemas/CrawlResult'
        error:
          type: string
    HealthCheckResponse:
      type: object
      properties:
        status:
          type: string
        available_slots:
          type: integer
        memory_usage:
          type: number
        cpu_usage:
          type: number
  securitySchemes:
    BearerAuth:
      type: http
      scheme: bearer
paths:
  /:
    get:
      summary: Root endpoint
      responses:
        '200':
          description: Service status or redirect to documentation
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
  /crawl:
    post:
      summary: Submit a crawling task
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
            examples:
              basic:
                summary: Basic crawl with screenshot
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
              multipleUrls:
                summary: Crawl multiple URLs
                value:
                  urls:
                    - "https://example.com"
                    - "https://python.org"
                    - "https://github.com"
                    - "https://stackoverflow.com"
                    - "https://news.ycombinator.com"
                  word_count_threshold: 100
                  bypass_cache: true
                  verbose: true
              asyncPagination:
                summary: Paginated crawl with dynamic content
                value:
                  urls: ["https://github.com/microsoft/TypeScript/commits/main"]
                  session_id: "typescript_commits_session"
                  css_selector: "li.Box-sc-g0xbh4-0"
                  js_code: ["(() => { const button = document.querySelector('a[data-testid=\"pagination-next-button\"]'); if (button) button.click(); })();"]
                  wait_for: "() => { const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); if (commits.length === 0) return false; const firstCommit = commits[0].textContent.trim(); return firstCommit !== window.firstCommit; }"
                  js_only: true
                  cache_mode: "bypass"
                  extraction_config:
                    type: "json_css"
                    params:
                      schema:
                        name: "Commit Extractor"
                        baseSelector: "li.Box-sc-g0xbh4-0"
                        fields:
                          - name: "title"
                            selector: "h4.markdown-title"
                            type: "text"
                            transform: "strip"
              markdownPlus:
                summary: Enhanced markdown generation with content filtering
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  word_count_threshold: 0
                  markdown_generator:
                    type: "default"
                    params:
                      content_filter:
                        type: "pruning"
                        threshold: 0.48
                        threshold_type: "fixed"
                        min_word_threshold: 0
                  cache_mode: "bypass"
              cssSelector:
                summary: Filter content using CSS selector
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  css_selector: "article"
                  screenshot: true
              jsExecution:
                summary: Execute JavaScript before extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
                  js_code: ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
              cosineStrategy:
                summary: Use cosine similarity extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  extraction_config:
                    type: "cosine"
                    params:
                      semantic_filter: "inflation rent prices"
              llmStrategy:
                summary: LLM-based extraction with translation
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  extraction_config:
                    type: "llm"
                    params:
                      provider: "groq/llama3-8b-8192"
                      instruction: "I am interested in only financial news, and translate them in French."
              sessionBasedCrawl:
                summary: Session-based crawling with pagination
                value:
                  urls: ["https://example.com/paged-content"]
                  session_id: "page_navigation_session"
                  js_code: ["document.querySelector('.next-page-button').click();"]
                  css_selector: ".content-section"
              mediaHandling:
                summary: Media handling and image extraction
                value:
                  urls: ["https://www.nbcnews.com/business"]
                  screenshot: true
                  exclude_external_images: false
      responses:
        '200':
          description: Task created successfully
          content:
            application/json:
              schema:
                type: object
                properties:
                  task_id:
                    type: string
                    format: uuid
  /task/{task_id}:
    get:
      summary: Get task status and results
      security:
        - BearerAuth: []
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
            format: uuid
      responses:
        '200':
          description: Task information
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
        '404':
          description: Task not found
  /crawl_sync:
    post:
      summary: Synchronous crawling endpoint
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
      responses:
        '200':
          description: Crawling results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
              examples:
                multipleUrlsResponse:
                  summary: Response for multiple URLs crawl
                  value:
                    status: "completed"
                    results:
                      - url: "https://example.com"
                        success: true
                        markdown: "Example Domain..."
                        metadata:
                          title: "Example Domain"
                        links:
                          internal: []
                          external: []
                        media:
                          images: []
                      - url: "https://python.org"
                        success: true
                        markdown: "Welcome to Python..."
                        metadata:
                          title: "Welcome to Python.org"
                        links:
                          internal: ["link1", "link2"]
                          external: ["ext1", "ext2"]
                        media:
                          images: ["img1", "img2"]
        '408':
          description: Task timed out
        '500':
          description: Internal server error
  /crawl_direct:
    post:
      summary: Direct crawling endpoint without task queue
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
      responses:
        '200':
          description: Crawling results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskResponse'
        '500':
          description: Internal server error
  /health:
    get:
      summary: Service health check
      responses:
        '200':
          description: Health check response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthCheckResponse'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment