cdxker · January 3, 2025 22:36
diff --git a/0__scrape-okteto-docs.sh b/0__scrape-okteto-docs.sh
 # Sends a PUT to ${TRIEVE_URL}/api/dataset carrying crawl options for
 # https://www.okteto.com/docs. Replace the three placeholders before running.
 export DATASET_ID="<your-trieve-dataset>"
 export ORGANIZATION_ID="<your-trieve-organization>"
 export TRIEVE_API_KEY="<your-trieve-api-key>"
 export TRIEVE_URL="https://api.trieve.ai"

 # JSON request body. The single-quoted literal is closed only around
 # "${DATASET_ID}" so the shell can splice the dataset id into the JSON;
 # no other quote may terminate the literal early.
 PAYLOAD='{
    "crawl_options": {
        "allow_external_links" : false,
        "boost_titles": true,
        "exclude_paths": [],
        "exclude_tags": [
            "#__docusaurus > div.announcementBar_mb4j",
            "#__docusaurus > nav",
            "#__docusaurus_skipToContent_fallback > div > div > aside > div > div > nav",
            "#__docusaurus_skipToContent_fallback > div > div > main > div > div > div.col.col--3 > div",
            ".TiersList"
        ],
        "include_paths" : ["/docs/*"],
        "include_tags" : ["article"],
        "interval" : "daily",
        "limit" : 1000,
        "site_url" : "https://www.okteto.com/docs",
        "scrape_options" : null
    },
    "dataset_id": "'"${DATASET_ID}"'"
 }'

 # Make the API call; -w prints the HTTP status code on its own line after
 # the response body. The headers use the variables exported above.
 curl -X PUT "${TRIEVE_URL}/api/dataset" \
  -H "Content-Type: application/json" \
  -H "Authorization: ${TRIEVE_API_KEY}" \
  -H "TR-Organization: ${ORGANIZATION_ID}" \
  -H "TR-Dataset: ${DATASET_ID}" \
  -d "${PAYLOAD}" \
  -w "\n%{http_code}"
diff --git a/1__example_action.yaml b/1__example_action.yaml
 # Workflow that PUTs crawl options for https://www.okteto.com/docs to the
 # Trieve dataset endpoint. Runs on every push and on manual dispatch.
 # Needs the repository secrets TRIEVE_ORGANIZATION_ID, TRIEVE_DATASET_ID
 # and TRIEVE_API_KEY.
 name: "Index Trieve Search Component"

 on:
  push:
  workflow_dispatch:

 jobs:
  create-search-index:
    runs-on: ubuntu-24.04
    steps:
      - name: update-curl
        shell: bash
        run: |
          # -y keeps apt-get from waiting on a confirmation prompt in CI
          sudo apt-get update && sudo apt-get install -y curl

      - name: "Ingest Trieve Search Index"
        shell: bash
        env:
          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
        run: |
          # Base URL of the Trieve API
          TRIEVE_URL="https://api.trieve.ai"

          # Prints the secrets / env entries this step needs.
          all_envs() {
            echo "
          These github repo secrets must to be set:
          TRIEVE_ORGANIZATION_ID="\<your-trieve-organization-id\>"
          TRIEVE_DATASET_ID="\<your-trieve-dataset-id\>"
          TRIEVE_API_KEY="\<your-trieve-api-key\>"

          alternatively you can modify the actions env's as such
          ORGANIZATION_ID="\<your-trieve-organization-id\>"
          DATASET_ID="\<your-trieve-dataset-id\>"
          API_KEY="\<your-trieve-api-key\>"
            "
          }

          # Fail fast, with setup hints, when a required value is empty.
          # ${!required} is bash indirect expansion (value of the named var).
          for required in ORGANIZATION_ID DATASET_ID API_KEY; do
            if [ -z "${!required}" ]; then
              echo "$required is not found"
              all_envs
              exit 1
            fi
          done

          # JSON request body; the single-quoted literal is closed only
          # around "${DATASET_ID}" so the dataset id is interpolated.
          PAYLOAD='{
              "crawl_options": {
                  "allow_external_links" : false,
                  "boost_titles": true,
                  "exclude_paths": [],
                  "exclude_tags": [
                      "#__docusaurus > div.announcementBar_mb4j",
                      "#__docusaurus > nav",
                      "#__docusaurus_skipToContent_fallback > div > div > aside > div > div > nav",
                      "#__docusaurus_skipToContent_fallback > div > div > main > div > div > div.col.col--3 > div",
                      ".TiersList"
                  ],
                  "include_paths" : ["/docs/*"],
                  "include_tags" : ["article"],
                  "interval" : "daily",
                  "limit" : 1000,
                  "site_url" : "https://www.okteto.com/docs",
                  "scrape_options" : null
              },
              "dataset_id": "'"${DATASET_ID}"'"
          }'

          echo "Payload $PAYLOAD"

          # Make the API call; -w appends the HTTP status code as the last
          # line of the captured output. -sS hides the progress meter but
          # still reports transport errors.
          response=$(curl -sS -X PUT "${TRIEVE_URL}/api/dataset" \
              -H "Content-Type: application/json" \
              -H "Authorization: ${API_KEY}" \
              -H "TR-Organization: ${ORGANIZATION_ID}" \
              -H "TR-Dataset: ${DATASET_ID}" \
              -d "${PAYLOAD}" \
              -w "\n%{http_code}")

          # Extract the response code from the last line
          http_code=$(echo "$response" | tail -n1)
          # Extract the response body (everything except the last line)
          response_body=$(echo "$response" | sed '$d')

          # Only HTTP 200 counts as success
          if [ "$http_code" -eq 200 ]; then
              echo "Crawling finished Successfully"
              exit 0
          else
              echo "Error: Received HTTP status code $http_code"
              echo "Response: $response_body"
              exit 1
          fi
	# Sends a PUT to ${TRIEVE_URL}/api/dataset carrying crawl options for
	# https://www.okteto.com/docs. Replace the three placeholders before running.
	export DATASET_ID="<your-trieve-dataset>"
	export ORGANIZATION_ID="<your-trieve-organization>"
	export TRIEVE_API_KEY="<your-trieve-api-key>"
	export TRIEVE_URL="https://api.trieve.ai"

	# JSON request body. The single-quoted literal is closed only around
	# "${DATASET_ID}" so the shell can splice the dataset id into the JSON;
	# no other quote may terminate the literal early.
	PAYLOAD='{
	  "crawl_options": {
	    "allow_external_links" : false,
	    "boost_titles": true,
	    "exclude_paths": [],
	    "exclude_tags": [
	      "#__docusaurus > div.announcementBar_mb4j",
	      "#__docusaurus > nav",
	      "#__docusaurus_skipToContent_fallback > div > div > aside > div > div > nav",
	      "#__docusaurus_skipToContent_fallback > div > div > main > div > div > div.col.col--3 > div",
	      ".TiersList"
	    ],
	    "include_paths" : ["/docs/*"],
	    "include_tags" : ["article"],
	    "interval" : "daily",
	    "limit" : 1000,
	    "site_url" : "https://www.okteto.com/docs",
	    "scrape_options" : null
	  },
	  "dataset_id": "'"${DATASET_ID}"'"
	}'

	# Make the API call; -w prints the HTTP status code on its own line after
	# the response body. The headers use the variables exported above.
	curl -X PUT "${TRIEVE_URL}/api/dataset" \
	  -H "Content-Type: application/json" \
	  -H "Authorization: ${TRIEVE_API_KEY}" \
	  -H "TR-Organization: ${ORGANIZATION_ID}" \
	  -H "TR-Dataset: ${DATASET_ID}" \
	  -d "${PAYLOAD}" \
	  -w "\n%{http_code}"
	# Workflow that PUTs crawl options for https://www.okteto.com/docs to the
	# Trieve dataset endpoint. Runs on every push and on manual dispatch.
	# NOTE(review): every line of this copy carries a leading tab from the
	# page rendering; strip it before using the text as a real workflow file.
	name: "Index Trieve Search Component"

	on:
	  push:
	  workflow_dispatch:

	jobs:
	  create-search-index:
	    runs-on: ubuntu-24.04
	    steps:
	      - name: update-curl
	        shell: bash
	        run: |
	          # -y keeps apt-get from waiting on a confirmation prompt in CI
	          sudo apt-get update && sudo apt-get install -y curl

	      - name: "Ingest Trieve Search Index"
	        shell: bash
	        env:
	          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
	          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
	          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
	        run: |
	          # Base URL of the Trieve API
	          TRIEVE_URL="https://api.trieve.ai"

	          # Prints the secrets / env entries this step needs.
	          all_envs() {
	          echo "
	          These github repo secrets must to be set:
	          TRIEVE_ORGANIZATION_ID="\<your-trieve-organization-id\>"
	          TRIEVE_DATASET_ID="\<your-trieve-dataset-id\>"
	          TRIEVE_API_KEY="\<your-trieve-api-key\>"

	          alternatively you can modify the actions env's as such
	          ORGANIZATION_ID="\<your-trieve-organization-id\>"
	          DATASET_ID="\<your-trieve-dataset-id\>"
	          API_KEY="\<your-trieve-api-key\>"
	          "
	          }

	          # Fail fast, with setup hints, when a required value is empty.
	          # ${!required} is bash indirect expansion (value of the named var).
	          for required in ORGANIZATION_ID DATASET_ID API_KEY; do
	            if [ -z "${!required}" ]; then
	              echo "$required is not found"
	              all_envs
	              exit 1
	            fi
	          done

	          # JSON request body; the single-quoted literal is closed only
	          # around "${DATASET_ID}" so the dataset id is interpolated.
	          PAYLOAD='{
	            "crawl_options": {
	              "allow_external_links" : false,
	              "boost_titles": true,
	              "exclude_paths": [],
	              "exclude_tags": [
	                "#__docusaurus > div.announcementBar_mb4j",
	                "#__docusaurus > nav",
	                "#__docusaurus_skipToContent_fallback > div > div > aside > div > div > nav",
	                "#__docusaurus_skipToContent_fallback > div > div > main > div > div > div.col.col--3 > div",
	                ".TiersList"
	              ],
	              "include_paths" : ["/docs/*"],
	              "include_tags" : ["article"],
	              "interval" : "daily",
	              "limit" : 1000,
	              "site_url" : "https://www.okteto.com/docs",
	              "scrape_options" : null
	            },
	            "dataset_id": "'"${DATASET_ID}"'"
	          }'

	          echo "Payload $PAYLOAD"

	          # Make the API call; -w appends the HTTP status code as the last
	          # line of the captured output. -sS hides the progress meter but
	          # still reports transport errors.
	          response=$(curl -sS -X PUT "${TRIEVE_URL}/api/dataset" \
	            -H "Content-Type: application/json" \
	            -H "Authorization: ${API_KEY}" \
	            -H "TR-Organization: ${ORGANIZATION_ID}" \
	            -H "TR-Dataset: ${DATASET_ID}" \
	            -d "${PAYLOAD}" \
	            -w "\n%{http_code}")

	          # Extract the response code from the last line
	          http_code=$(echo "$response" | tail -n1)
	          # Extract the response body (everything except the last line)
	          response_body=$(echo "$response" | sed '$d')

	          # Only HTTP 200 counts as success
	          if [ "$http_code" -eq 200 ]; then
	            echo "Crawling finished Successfully"
	            exit 0
	          else
	            echo "Error: Received HTTP status code $http_code"
	            echo "Response: $response_body"
	            exit 1
	          fi