#!/bin/sh -f
# SPDX-FileCopyrightText: 2021 Petr Pucil <[email protected]>
#
# SPDX-License-Identifier: MIT
# A shell script for collecting Kaitai Struct YAML (.ksy) file paths in GitHub repositories.
#
# Requires <https://stedolan.github.io/jq> and <https://curl.se> to work.
#
# Inspired by https://github.com/Alhadis/Harvester, but this script:
# * is much simpler and more portable,
# * uses the official GitHub REST API returning machine-readable JSON data (instead of
#   requesting HTML pages and parsing their contents),
# * does not require running a fully-fledged web browser (which also restricts
#   cross-origin HTTP requests, so you have to navigate to github.com manually) where you
#   need to be logged in to your GitHub account,
# * reduces the number of individual HTTP requests (it receives up to 100 results at once -
#   GitHub sometimes sends fewer, though I have no idea why; Harvester can only get 10
#   results at a time),
# * saves bandwidth (receives results in JSON, not HTML),
# * can be conveniently run from the command line in any POSIX-compliant shell.
auth_token=<login>:<token> # e.g. octocat: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;
# go to <https://github.com/settings/tokens> to get a real one
# (no OAuth scopes are needed, public access is enough)
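# Before starting a long run, you can sanity-check the token and inspect the
# current search-API quota (a hedged example, not part of the original script;
# /rate_limit is a standard GitHub REST endpoint and needs no scopes):
#
#   curl -sS -u "$auth_token" -H "Accept: application/vnd.github.v3+json" \
#     https://api.github.com/rate_limit | jq '.resources.search'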
search_query='-org:kaitai-io extension:ksy'
output_file=github-collected-ksy-paths.ndjson
# per_page=100
# Pick a random page size between 40 and 100, presumably so that successive
# passes paginate the result window differently. Note that $RANDOM is a
# bash/ksh/zsh extension, not POSIX; in a strictly POSIX shell it typically
# expands to nothing and per_page falls back to 40.
per_page=$((RANDOM % 61 + 40))
max_page=$(( 1000 / per_page ))
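# Worked example of the bounds above: per_page ranges over 40..100, so
# max_page = 1000 / per_page ranges from 25 down to 10, which keeps
# per_page * max_page <= 1000 - the GitHub Search API only exposes the
# first 1000 results of any query, hence the cap.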
total_count=$(curl \
    -G 'https://api.github.com/search/code' \
    -sS \
    -u "$auth_token" \
    -H "Accept: application/vnd.github.v3+json" \
    --data-urlencode q="${search_query}" \
    --data-urlencode per_page=1 \
    | jq '.total_count') # only .total_count is needed, so ask for 1 result per page
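# For orientation, the search response that .total_count is read from looks
# roughly like this (abridged, values illustrative; field names per the
# GitHub REST v3 docs):
#
#   {
#     "total_count": 1234,
#     "incomplete_results": false,
#     "items": [
#       { "html_url": "https://github.com/<owner>/<repo>/blob/<ref>/file.ksy",
#         "repository": { "html_url": "https://github.com/<owner>/<repo>" } }
#     ]
#   }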
echo "total count: $total_count" | |
echo "per page: $per_page" | |
echo "max page: $max_page" | |
if [ -f "$output_file" ]; then | |
sort -u -o "$output_file" "$output_file" | |
else | |
touch "$output_file" | |
fi | |
tmp_f=$(mktemp)
# https://unix.stackexchange.com/a/520041
# Clean up the temp file on exit or interruption; 'trap - EXIT' resets the
# handler so the cleanup does not run a second time on the final 'exit'.
trap 'rm -vf "$tmp_f"; trap - EXIT; exit' EXIT INT HUP
page=1
num_written_in_run=0
lines_before=$(wc -l < "$output_file")
# The positional parameters hold extra curl arguments selecting the current
# search ordering ("strategy"); start with none, i.e. GitHub's default order.
set --
while true; do
    # if [ "$lines_before" -ge "$total_count" ]; then
    #     echo "total count ($total_count) reached, output file has $lines_before lines"
    #     break
    # fi
    echo "page $page"
    output=$(curl \
        -G 'https://api.github.com/search/code' \
        -sS \
        -u "$auth_token" \
        -H "Accept: application/vnd.github.v3+json" \
        --data-urlencode q="${search_query}" \
        "$@" \
        --data-urlencode per_page="${per_page}" \
        --data-urlencode page="${page}")
    if ! printf %s "$output" \
        | jq --compact-output '.items | map({html_url, repo: .repository.html_url}) | .[]' \
        > "$tmp_f"; then
        # jq usually fails here because GitHub answered with a rate-limit
        # error object (which has no .items) instead of search results,
        # so back off and retry
        echo "jq failed to parse JSON, waiting 65 seconds"
        echo "$output"
        sleep 65
        continue
    fi
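    # Each line in $tmp_f is now one NDJSON record, e.g. (illustrative values):
    #   {"html_url":"https://github.com/<owner>/<repo>/blob/<ref>/spec.ksy","repo":"https://github.com/<owner>/<repo>"}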
    lines_recv=$(wc -l < "$tmp_f")
    if [ "$lines_recv" -ne 0 ]; then
        echo "$lines_recv items received ($per_page requested)"
        LC_ALL=C sort -o "$tmp_f" "$tmp_f"
        LC_ALL=C sort -u -m -o "$output_file" "$output_file" "$tmp_f"
        lines_after=$(wc -l < "$output_file")
        echo "$(( lines_after - lines_before )) items written ($lines_after total)"
        num_written_in_run=$(( num_written_in_run + (lines_after - lines_before) ))
        lines_before=$lines_after
    fi
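    # Note on the merge above: since both files are already sorted, 'sort -u -m'
    # only merges and deduplicates without re-sorting. It is equivalent in effect
    # to (but cheaper than) a full re-sort:
    #   LC_ALL=C sort -u -o "$output_file" "$output_file" "$tmp_f"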
    if [ $page -ge $max_page ]; then
        page=1
        echo "reached max page, wrapping around to page 1, waiting 50 seconds"
        sleep 50
        # If the whole pass produced fewer new items than there are pages,
        # cycle to the next search ordering to surface a different slice of
        # the results: default "best match" (no extra args) -> oldest indexed
        # first (sort=indexed&order=asc) -> newest indexed first (sort=indexed)
        # -> back to default.
        if [ $num_written_in_run -lt $max_page ]; then
            if [ $# -lt 2 ]; then
                set -- --data-urlencode order=asc --data-urlencode sort=indexed
            else
                shift 2
            fi
            echo "switching strategy to '$*'"
        fi
        num_written_in_run=0
        per_page=$((RANDOM % 61 + 40))
        max_page=$(( 1000 / per_page ))
        echo "per page: $per_page"
        echo "max page: $max_page"
    else
        page=$((page+1))
        sleep 30
    fi
done
# if [ "$OS" = "Windows_NT" ]; then | |
# dos2unix "$output_file" || : | |
# fi |
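# Usage note (not part of the original header; the file name below is just a
# suggestion): the script loops until interrupted, e.g. with Ctrl+C. Results
# accumulate across runs, because each batch is merged into $output_file with
# duplicates removed rather than overwriting it:
#
#   sh collect-ksy-paths.sh
#   wc -l github-collected-ksy-paths.ndjson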