#!/bin/sh -f
# SPDX-FileCopyrightText: 2021 Petr Pucil <[email protected]>
#
# SPDX-License-Identifier: MIT
# A shell script for collecting Kaitai Struct YAML (.ksy) file paths in GitHub repositories.
#
# Requires <https://stedolan.github.io/jq> and <https://curl.se> to work.
#
# Inspired by https://github.com/Alhadis/Harvester, but this script:
# * is much simpler and more portable,
# * uses the official GitHub REST API returning machine-readable JSON data (instead of
#   requesting HTML pages and parsing their contents),
# * does not require running a fully-fledged web browser (which also restricts
#   cross-origin HTTP requests, so you have to navigate to github.com manually) where you
#   need to be logged in to your GitHub account,
# * reduces the number of individual HTTP requests (it receives up to 100 results at once -
#   GitHub sometimes sends fewer, though I have no idea why; Harvester can only get 10
#   results at a time),
# * saves bandwidth (receives results in JSON, not HTML),
# * can be conveniently run from the command line in any POSIX-compliant shell.
auth_token=<login>:<token> # e.g. octocat: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;
# go to <https://github.com/settings/tokens> to get a real one
# (no OAuth scopes are needed, public access is enough)
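# Before starting a long run, you can sanity-check the token and inspect the
# current search-API quota (a hedged example, not part of the original script;
# /rate_limit is a standard GitHub REST endpoint and needs no scopes):
#
#   curl -sS -u "$auth_token" -H "Accept: application/vnd.github.v3+json" \
#     https://api.github.com/rate_limit | jq '.resources.search'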
search_query='-org:kaitai-io extension:ksy'
output_file=github-collected-ksy-paths.ndjson
# per_page=100
# Pick a random page size between 40 and 100, presumably so that successive
# passes paginate the result window differently. Note that $RANDOM is a
# bash/ksh/zsh extension, not POSIX; in a strictly POSIX shell it typically
# expands to nothing and per_page falls back to 40.
per_page=$((RANDOM % 61 + 40))
max_page=$(( 1000 / per_page ))
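# Worked example of the bounds above: per_page ranges over 40..100, so
# max_page = 1000 / per_page ranges from 25 down to 10, which keeps
# per_page * max_page <= 1000 - the GitHub Search API only exposes the
# first 1000 results of any query, hence the cap.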
total_count=$(curl \
    -G 'https://api.github.com/search/code' \
    -sS \
    -u "$auth_token" \
    -H "Accept: application/vnd.github.v3+json" \
    --data-urlencode q="${search_query}" \
    --data-urlencode per_page=1 \
    | jq '.total_count') # only .total_count is needed, so ask for 1 result per page
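# For orientation, the search response that .total_count is read from looks
# roughly like this (abridged, values illustrative; field names per the
# GitHub REST v3 docs):
#
#   {
#     "total_count": 1234,
#     "incomplete_results": false,
#     "items": [
#       { "html_url": "https://github.com/<owner>/<repo>/blob/<ref>/file.ksy",
#         "repository": { "html_url": "https://github.com/<owner>/<repo>" } }
#     ]
#   }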
echo "total count: $total_count" | |
echo "per page: $per_page" | |
echo "max page: $max_page" | |
if [ -f "$output_file" ]; then | |
sort -u -o "$output_file" "$output_file" | |
else | |
touch "$output_file" | |
fi | |
tmp_f=$(mktemp)
# https://unix.stackexchange.com/a/520041
# Clean up the temp file on exit or interruption; 'trap - EXIT' resets the
# handler so the cleanup does not run a second time on the final 'exit'.
trap 'rm -vf "$tmp_f"; trap - EXIT; exit' EXIT INT HUP
page=1
num_written_in_run=0
lines_before=$(wc -l < "$output_file")
# The positional parameters hold extra curl arguments selecting the current
# search ordering ("strategy"); start with none, i.e. GitHub's default order.
set --
while true; do
    # if [ "$lines_before" -ge "$total_count" ]; then
    #     echo "total count ($total_count) reached, output file has $lines_before lines"
    #     break
    # fi
    echo "page $page"
    output=$(curl \
        -G 'https://api.github.com/search/code' \
        -sS \
        -u "$auth_token" \
        -H "Accept: application/vnd.github.v3+json" \
        --data-urlencode q="${search_query}" \
        "$@" \
        --data-urlencode per_page="${per_page}" \
        --data-urlencode page="${page}")
    if ! printf %s "$output" \
        | jq --compact-output '.items | map({html_url, repo: .repository.html_url}) | .[]' \
        > "$tmp_f"; then
        # jq usually fails here because GitHub answered with a rate-limit
        # error object (which has no .items) instead of search results,
        # so back off and retry
        echo "jq failed to parse JSON, waiting 65 seconds"
        echo "$output"
        sleep 65
        continue
    fi
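    # Each line in $tmp_f is now one NDJSON record, e.g. (illustrative values):
    #   {"html_url":"https://github.com/<owner>/<repo>/blob/<ref>/spec.ksy","repo":"https://github.com/<owner>/<repo>"}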
    lines_recv=$(wc -l < "$tmp_f")
    if [ "$lines_recv" -ne 0 ]; then
        echo "$lines_recv items received ($per_page requested)"
        LC_ALL=C sort -o "$tmp_f" "$tmp_f"
        LC_ALL=C sort -u -m -o "$output_file" "$output_file" "$tmp_f"
        lines_after=$(wc -l < "$output_file")
        echo "$(( lines_after - lines_before )) items written ($lines_after total)"
        num_written_in_run=$(( num_written_in_run + (lines_after - lines_before) ))
        lines_before=$lines_after
    fi
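    # Note on the merge above: since both files are already sorted, 'sort -u -m'
    # only merges and deduplicates without re-sorting. It is equivalent in effect
    # to (but cheaper than) a full re-sort:
    #   LC_ALL=C sort -u -o "$output_file" "$output_file" "$tmp_f"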
    if [ $page -ge $max_page ]; then
        page=1
        echo "reached max page, wrapping around to page 1, waiting 50 seconds"
        sleep 50
        # If the whole pass produced fewer new items than there are pages,
        # cycle to the next search ordering to surface a different slice of
        # the results: default "best match" (no extra args) -> oldest indexed
        # first (sort=indexed&order=asc) -> newest indexed first (sort=indexed)
        # -> back to default.
        if [ $num_written_in_run -lt $max_page ]; then
            if [ $# -lt 2 ]; then
                set -- --data-urlencode order=asc --data-urlencode sort=indexed
            else
                shift 2
            fi
            echo "switching strategy to '$*'"
        fi
        num_written_in_run=0
        per_page=$((RANDOM % 61 + 40))
        max_page=$(( 1000 / per_page ))
        echo "per page: $per_page"
        echo "max page: $max_page"
    else
        page=$((page+1))
        sleep 30
    fi
done
# if [ "$OS" = "Windows_NT" ]; then | |
# dos2unix "$output_file" || : | |
# fi |
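# Usage note (not part of the original header; the file name below is just a
# suggestion): the script loops until interrupted, e.g. with Ctrl+C. Results
# accumulate across runs, because each batch is merged into $output_file with
# duplicates removed rather than overwriting it:
#
#   sh collect-ksy-paths.sh
#   wc -l github-collected-ksy-paths.ndjson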