fmassot · April 11, 2024 13:16
diff --git a/01_ycollet_query.json b/01_ycollet_query.json
 { "query": "actor.login:Cyan4973" }
diff --git a/02_date_histogram_query.json b/02_date_histogram_query.json
 {
   "max_hits": 0,
   "query": "*",
   "aggs": {
     "events": {
       "date_histogram": {
         "fixed_interval": "1d",
         "field": "created_at"
       }
     }
   }
 }
diff --git a/03_date_histogram_ycollet_query.json b/03_date_histogram_ycollet_query.json
 {
   "max_hits": 0,
   "query": "actor.login:Cyan4973",
   "aggs": {
     "events": {
       "date_histogram": {
         "fixed_interval": "1d",
         "field": "created_at"
       }
     }
   }
 }
diff --git a/05_repositories_mentioning_quickwit.json b/05_repositories_mentioning_quickwit.json
 {
   "max_hits": 0,
   "query": "(payload.description:quickwit OR payload.comment.body:quickwit OR payload.issue.body:quickwit)",
   "aggs": {
     "top_repositories": {
       "terms": {
         "field": "repo.name",
         "size": 100,
         "order": {
           "_count": "desc"
         }
       }
     }
   }
 }
diff --git a/gh-archive-index-config.yaml b/gh-archive-index-config.yaml
 #
 # Quickwit index configuration for the gh-archive (GitHub Archive) dataset.
 #
 version: 0.6

 index_id: gh-archive-6

 doc_mapping:
  # Document-wide settings. `timestamp_field` names the datetime field
  # declared in `field_mappings` below.
  mode: dynamic
  timestamp_field: created_at
  store_source: false
  tag_fields: []

  # Explicitly mapped fields. Every sequence, nested ones included, uses the
  # same indented "- item" style.
  field_mappings:
    - name: id
      type: text

    - name: type
      type: text
      fast: true

    # `actor.login` is the field the per-user queries filter on.
    - name: actor
      type: object
      field_mappings:
        - name: login
          type: text
          fast: true

    # `repo.name` is the field the "top repositories" terms aggregation
    # groups by.
    - name: repo
      type: object
      field_mappings:
        - name: url
          type: text
        - name: name
          type: text
          fast: true

    # Free-text parts of the event payload. `description`, `comment.body`
    # and `issue.body` are the fields searched by the "mentioning quickwit"
    # query.
    - name: payload
      type: object
      field_mappings:
        - name: description
          type: text
        - name: pull_request
          type: object
          field_mappings:
            - name: title
              type: text
            - name: body
              type: text
        - name: release
          type: object
          field_mappings:
            - name: body
              type: text
        - name: review
          type: object
          field_mappings:
            - name: body
              type: text
        - name: comment
          type: object
          field_mappings:
            - name: body
              type: text
            - name: diff_hunk
              type: text
        - name: issue
          type: object
          field_mappings:
            - name: title
              type: text
            - name: body
              type: text

    # Event timestamp: the index `timestamp_field` and the field the
    # date_histogram aggregations bucket on.
    - name: created_at
      type: datetime
      fast: true
      indexed: true
      stored: true
      precision: seconds
      input_formats:
        - rfc3339
        - unix_timestamp
      output_format: rfc3339

    - name: org
      type: object
      field_mappings:
        - name: login
          type: text
          fast: true
        - name: avatar_url
          type: text
        - name: url
          type: text
          stored: true
        - name: gravatar_id
          type: text
 # Indexing-side tuning for the gh-archive index.
 indexing_settings:
  commit_timeout_secs: 60
  split_num_docs_target: 10000000

  resources:
    heap_size: 2GB

  # NOTE(review): `max_merge_ops: 0` presumably means splits are never
  # merged - confirm against the merge-policy documentation.
  merge_policy:
    type: limit_merge
    merge_factor: 10
    max_merge_factor: 10
    max_merge_ops: 0
    maturation_period: '2days'
diff --git a/gh-archive-source-config.yaml b/gh-archive-source-config.yaml
 # HTTP source that pulls the gh-archive dump files.
 version: 0.6
 source_type: http
 source_id: gh-archive-source
 desired_num_pipelines: 1
 params:
  # Brace ranges expand over year, month, day and hour.
  # NOTE(review): the upper bounds (13, 32, 24) are one past the last valid
  # month/day/hour; presumably the range end is exclusive - confirm against
  # the http source implementation.
  uri_pattern: 'https://data.gharchive.org/20{15..24}-{01..13}-{01..32}-{0..24}.json.gz'
diff --git a/quickwit-helm-chart-config.yaml b/quickwit-helm-chart-config.yaml
 # Container image used by the chart.
 image:
  repository: quickwit/quickwit
  # Overrides the image tag whose default is the chart appVersion.
  tag: qw-http-source
  pullPolicy: Always

 # Naming and registry-credential overrides; all left empty.
 imagePullSecrets: []
 nameOverride: ''
 fullnameOverride: ''

 # Additional global env.
 # Environment variable values are strings, so every value is quoted; an
 # unquoted `1` or `true` would be loaded as an integer / boolean instead.
 environment:
  QW_DISABLE_TELEMETRY: "1"
  # NOTE(review): this endpoint targets the `quickwit-prod` namespace -
  # confirm that is the intended OTLP collector for this deployment.
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://quickwit-indexer.quickwit-prod.svc.cluster.local:7281"
  QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER: "true"
  NO_COLOR: "true"
  RUST_LOG: "quickwit=info"

 # Searcher deployment.
 # `resources` and `nodeSelector` are nested under `searcher`, the same way
 # the indexer, metastore and janitor sections nest theirs. At the top-level
 # column they were siblings of `searcher`, not searcher settings.
 # NOTE(review): if a chart-wide nodeSelector was intended, move it back to
 # the top level - confirm.
 searcher:
  replicaCount: 4

  resources:
    limits:
      memory: 18Gi
      cpu: 8
    requests:
      memory: 2Gi
      cpu: 7

  nodeSelector:
    node.kubernetes.io/instance-type: c5n.2xlarge
    
 # Indexer deployment: a single replica with a persistent volume.
 indexer:
  replicaCount: 1

  nodeSelector:
    node.kubernetes.io/instance-type: c5a.xlarge

  resources:
    requests:
      memory: 6Gi
    limits:
      memory: 8Gi

  persistentVolume:
    enabled: true
    storage: '250Gi'

  # Extra env for indexer
  extraEnv: {}
    
 # Metastore deployment.
 metastore:
  replicaCount: 1

  resources:
    requests:
      memory: 256Mi
    limits:
      memory: 1Gi

  # Extra env for metastore, given as `KEY: VALUE` entries.
  extraEnv: {}
      
 # Control plane deployment.
 control_plane:
  resources:
    requests:
      memory: 25Mi
    limits:
      memory: 256Mi

  # Extra env for the control plane, given as `KEY: VALUE` entries.
  extraEnv: {}
      
 # Janitor deployment.
 janitor:
  # Enable Janitor service
  enabled: true

  resources:
    requests:
      memory: 100Mi
    limits:
      memory: 2Gi

  # Extra env for the janitor, given as `KEY: VALUE` entries.
  extraEnv: {}
      
 # Quickwit node configuration.
 config:
  # The metastore and the default index root share one S3 prefix.
  metastore_uri: s3://gharchive/indexes
  default_index_root_uri: s3://gharchive/indexes
  postgres: {}

  # Indexer settings
  indexer:
    split_store_max_num_bytes: 200G
    split_store_max_num_splits: 200
    enable_otlp_endpoint: false

  ingest_api:
    max_queue_memory_usage: 1GB
    max_queue_disk_usage: 2GB

  # Searcher settings
  searcher:
    # Author's reference sizes: 6G for 1 searcher, 9GB for 2 searchers,
    # 10.75GB for 4 searchers, 12GB for 8 searchers and more.
    # NOTE(review): 0GB presumably disables this cache - confirm.
    fast_field_cache_capacity: 0GB
    # Author's reference sizes: 6.5GB for 1 searcher, 3.5GB for 2 searchers,
    # 1.75GB for 4 searchers, 0.9GB otherwise.
    split_footer_cache_capacity: 0GB
    partial_request_cache_capacity: 0
    max_num_concurrent_split_streams: 100
    aggregation_memory_limit: 5G
	{
	  "query": "*",
	  "max_hits": 0,
	  "aggs": {
	    "events": {
	      "date_histogram": {
	        "field": "created_at",
	        "fixed_interval": "1d"
	      }
	    }
	  }
	}
	{
	  "query": "actor.login:Cyan4973",
	  "max_hits": 0,
	  "aggs": {
	    "events": {
	      "date_histogram": {
	        "field": "created_at",
	        "fixed_interval": "1d"
	      }
	    }
	  }
	}
	{
	  "query": "(payload.description:quickwit OR payload.comment.body:quickwit OR payload.issue.body:quickwit)",
	  "max_hits": 0,
	  "aggs": {
	    "top_repositories": {
	      "terms": {
	        "size": 100,
	        "field": "repo.name",
	        "order": {
	          "_count": "desc"
	        }
	      }
	    }
	  }
	}
	#
	# Index config file for gh-archive dataset.
	#
	# Nesting restored: in the flattened text every key sat at the same level,
	# so `type` and `field_mappings` repeated and the mapping tree was lost.
	version: 0.6

	index_id: gh-archive-6

	doc_mapping:
	  field_mappings:
	    - name: id
	      type: text
	    - name: type
	      type: text
	      fast: true
	    - name: actor
	      type: object
	      field_mappings:
	        - name: login
	          type: text
	          fast: true
	    - name: repo
	      type: object
	      field_mappings:
	        - name: url
	          type: text
	        - name: name
	          type: text
	          fast: true
	    - name: payload
	      type: object
	      field_mappings:
	        - name: description
	          type: text
	        - name: pull_request
	          type: object
	          field_mappings:
	            - name: title
	              type: text
	            - name: body
	              type: text
	        - name: release
	          type: object
	          field_mappings:
	            - name: body
	              type: text
	        - name: review
	          type: object
	          field_mappings:
	            - name: body
	              type: text
	        - name: comment
	          type: object
	          field_mappings:
	            - name: body
	              type: text
	            - name: diff_hunk
	              type: text
	        - name: issue
	          type: object
	          field_mappings:
	            - name: title
	              type: text
	            - name: body
	              type: text
	    # Event timestamp; also named by `timestamp_field` below.
	    - name: created_at
	      type: datetime
	      fast: true
	      indexed: true
	      input_formats:
	        - rfc3339
	        - unix_timestamp
	      output_format: rfc3339
	      precision: seconds
	      stored: true
	    - name: org
	      type: object
	      field_mappings:
	        - name: login
	          type: text
	          fast: true
	        - name: avatar_url
	          type: text
	        - name: url
	          stored: true
	          type: text
	        - name: gravatar_id
	          type: text
	  tag_fields: []
	  store_source: false
	  timestamp_field: created_at
	  mode: dynamic
	indexing_settings:
	  commit_timeout_secs: 60
	  split_num_docs_target: 10000000
	  merge_policy:
	    type: limit_merge
	    max_merge_ops: 0
	    merge_factor: 10
	    max_merge_factor: 10
	    maturation_period: "2days"
	  resources:
	    heap_size: 2GB
	# Source config (separate document from the index config that precedes it).
	# Nesting restored: `uri_pattern` belongs under `params`.
	version: 0.6
	source_id: gh-archive-source
	source_type: http
	desired_num_pipelines: 1
	params:
	  uri_pattern: "https://data.gharchive.org/20{15..24}-{01..13}-{01..32}-{0..24}.json.gz"
	# Helm chart values (separate document from the source config above).
	# Nesting restored: in the flattened text all keys sat at one level, so
	# `resources`, `limits`, `requests` and `memory` repeated many times.
	image:
	  repository: quickwit/quickwit
	  pullPolicy: Always
	  # Overrides the image tag whose default is the chart appVersion.
	  tag: qw-http-source

	imagePullSecrets: []
	nameOverride: ""
	fullnameOverride: ""

	# Additional global env
	environment:
	  QW_DISABLE_TELEMETRY: 1
	  OTEL_EXPORTER_OTLP_ENDPOINT: http://quickwit-indexer.quickwit-prod.svc.cluster.local:7281
	  QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER: true
	  NO_COLOR: true
	  RUST_LOG: quickwit=info

	# NOTE(review): `resources` and `nodeSelector` are placed under `searcher`
	# here, matching how the indexer/metastore/janitor sections are laid out;
	# the flattened text gives no indentation to confirm this - verify.
	searcher:
	  replicaCount: 4

	  resources:
	    limits:
	      memory: 18Gi
	      cpu: 8
	    requests:
	      memory: 2Gi
	      cpu: 7

	  nodeSelector:
	    node.kubernetes.io/instance-type: c5n.2xlarge

	indexer:
	  replicaCount: 1

	  # Extra env for indexer
	  extraEnv: {}

	  resources:
	    limits:
	      memory: 8Gi
	    requests:
	      memory: 6Gi

	  persistentVolume:
	    enabled: true
	    storage: "250Gi"

	  nodeSelector:
	    node.kubernetes.io/instance-type: c5a.xlarge

	metastore:
	  replicaCount: 1

	  # Extra env for metastore
	  extraEnv: {}
	    # KEY: VALUE

	  resources:
	    limits:
	      memory: 1Gi
	    requests:
	      memory: 256Mi

	control_plane:
	  # Extra env for the control plane
	  extraEnv: {}
	    # KEY: VALUE

	  resources:
	    limits:
	      memory: 256Mi
	    requests:
	      memory: 25Mi

	janitor:
	  # Enable Janitor service
	  enabled: true

	  # Extra env for the janitor
	  extraEnv: {}
	    # KEY: VALUE

	  resources:
	    limits:
	      memory: 2Gi
	    requests:
	      memory: 100Mi

	# Quickwit configuration
	config:
	  metastore_uri: s3://gharchive/indexes
	  postgres: {}
	  default_index_root_uri: s3://gharchive/indexes

	  # Indexer settings
	  indexer:
	    split_store_max_num_bytes: 200G
	    split_store_max_num_splits: 200
	    enable_otlp_endpoint: false

	  ingest_api:
	    max_queue_memory_usage: 1GB
	    max_queue_disk_usage: 2GB

	  # Searcher settings
	  searcher:
	    fast_field_cache_capacity: 0GB  # 6G for 1 searcher, 9GB for 2 searchers, 10.75GB for 4 searchers, 12GB for 8 searchers and more
	    split_footer_cache_capacity: 0GB  # 6.5GB for 1 searcher, 3.5GB for 2 searchers, 1.75GB for 4 searchers, 0.9GB otherwise
	    max_num_concurrent_split_streams: 100
	    partial_request_cache_capacity: 0
	    aggregation_memory_limit: 5G