Last active
July 10, 2019 17:20
-
-
Save yvanzo/52f9f9038ad3c60631ada7b7c0d26d92 to your computer and use it in GitHub Desktop.
Script to check the status of the search indexes when using musicbrainz-docker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Compare, for each MusicBrainz Solr core, the number of documents that exist
# in PostgreSQL with the number of documents indexed in Solr.
# Takes no arguments.

# Abort on unhandled command failure (-e) and on use of unset variables (-u).
set -e -u

# Usage text; $0 is expanded when the heredoc is read.
HELP=$(cat <<EOH
Usage: $0
For each of MusicBrainz Solr cores/collections,
compare the count of existing documents in PostgreSQL,
with the count of indexed documents in Solr.
EOH
)

# The script accepts no arguments at all; report misuse on stderr.
if (( $# != 0 )); then
  echo "$0: wrong number of arguments" >&2
  echo "$HELP" >&2
  exit 1
fi
# Run from the parent of the directory holding this script, so that
# docker-compose is invoked from there.
MB_DOCKER_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)
cd "$MB_DOCKER_ROOT"

# Exit with an error on stderr unless `docker-compose ps` reports the given
# service as 'Up'.
#   $1 - docker-compose service name
#   $2 - kind of documents that cannot be counted without it (for the message)
require_running_service() {
  local service=$1
  local doc_kind=$2
  if ! docker-compose ps "$service" | grep -qw 'Up'; then
    echo "$0: cannot count $doc_kind documents: '$service' is not a running docker-compose service" >&2
    exit 1
  fi
}

require_running_service db existing
require_running_service indexer indexed
# Count existing documents by core in PostgreSQL
#
# queries maps each Solr core name to the SQL query that counts the
# corresponding documents in the database.
declare -A queries

# One 'annotation' document per annotated entity, summed over all entity types.
# UNION ALL is required here: a plain UNION removes duplicate rows, so two
# entity types having the same number of annotated entities would be summed
# only once. The heredoc delimiter is quoted so the SQL is taken literally.
queries['annotation']="$(cat <<'EOSQL'
SELECT SUM(annotation_count) FROM (
            SELECT COUNT(DISTINCT area) annotation_count FROM area_annotation AS one
  UNION ALL SELECT COUNT(DISTINCT artist) annotation_count FROM artist_annotation
  UNION ALL SELECT COUNT(DISTINCT event) annotation_count FROM event_annotation
  UNION ALL SELECT COUNT(DISTINCT instrument) annotation_count FROM instrument_annotation
  UNION ALL SELECT COUNT(DISTINCT label) annotation_count FROM label_annotation
  UNION ALL SELECT COUNT(DISTINCT place) annotation_count FROM place_annotation
  UNION ALL SELECT COUNT(DISTINCT recording) annotation_count FROM recording_annotation
  UNION ALL SELECT COUNT(DISTINCT release) annotation_count FROM release_annotation
  UNION ALL SELECT COUNT(DISTINCT release_group) annotation_count FROM release_group_annotation
  UNION ALL SELECT COUNT(DISTINCT series) annotation_count FROM series_annotation
  UNION ALL SELECT COUNT(DISTINCT work) annotation_count FROM work_annotation
) AS total
EOSQL
)"

# The 'cdstub' core is counted from the release_raw table.
queries['cdstub']="SELECT COUNT(id) FROM release_raw"

# Every other core is counted from the table of the same name, except that the
# core name uses '-' where the table name uses '_' (release_group -> release-group).
for table in area artist editor event instrument label place \
  recording release release_group series tag url work; do
  queries["${table//_/-}"]="SELECT COUNT(id) FROM $table"
done
# counts maps each core name to the number of documents found in PostgreSQL.
declare -A counts
POSTGRES_USER=musicbrainz
POSTGRES_DATABASE=musicbrainz_db
for core in "${!queries[@]}"; do
  # COPY ... TO STDOUT prints the bare value, without psql's table decoration.
  # Carriage returns are stripped from the output (presumably added by the
  # terminal that `docker-compose exec` allocates).
  count=$(docker-compose exec db \
    psql -U "$POSTGRES_USER" -d "$POSTGRES_DATABASE" \
    -c "COPY(${queries[$core]}) TO STDOUT" | tr -d '\r')
  # The pipeline's exit status is that of `tr`, so a failing docker-compose or
  # psql goes unnoticed by `set -e`; check the result explicitly instead of
  # letting a non-numeric value break the sort and comparison further down.
  if [[ ! $count =~ ^[0-9]+$ ]]; then
    echo "$0: cannot count existing documents for '$core': unexpected output: '$count'" >&2
    exit 1
  fi
  counts["$core"]=$count
done
# Sort cores by ascending number of documents

# Print the keys of the global `counts` array, one per line, ordered by
# ascending count.
cores_by_ascending_count() {
  local core
  for core in "${!counts[@]}"; do
    printf '%s %s\n' "$core" "${counts[$core]}"
  done | sort -n -k2 | cut -d' ' -f1
}

# mapfile fills the array line by line, without relying on unquoted word
# splitting (and the pathname expansion that comes with it).
declare -a ascending_cores=()
mapfile -t ascending_cores < <(cores_by_ascending_count)
# Count indexed documents by core in Solr
#
# indexed_docs maps each core name to the numDocs value reported by Solr.
# The cores status is fetched from inside the 'indexer' service and reduced
# by a small Python script to one "<core> <numDocs>" line per core.
declare -A indexed_docs
while read -r core docs; do
  # Ignore blank lines.
  [[ -n $core ]] || continue
  indexed_docs["$core"]=$docs
done < <(docker-compose exec indexer bash -c "
  wget -q -O - http://search:8983/v2/cores | python2 -c '
import sys, json
json_status = json.load(sys.stdin)[\"status\"]
for core in json_status:
    # This form of print gives the same output under Python 2 and Python 3.
    print(\"%s %s\" % (core, json_status[core][\"index\"][\"numDocs\"]))
'" | tr -d '\r')
# Carriage returns are stripped above, as the output of `docker-compose exec`
# may carry them (presumably because of the terminal it allocates).
# Compare number of indexed docs with number of existing docs, by core

# Print one report row "<core> <status> <indexed> /<existing>" for a core.
# Status is "OK" when both counts are integers and equal, "--" otherwise.
# A count that is missing is shown as "?" so that the row keeps four columns
# (and so that a core unknown to Solr does not abort the script under set -u).
#   $1 - core name, used as key into the global `counts` and `indexed_docs`
print_core_status() {
  local core=$1
  local existing=${counts[$core]:-}
  local indexed=${indexed_docs[$core]:-}
  local status='--'
  if [[ $existing =~ ^[0-9]+$ && $indexed =~ ^[0-9]+$ ]] \
    && (( 10#$existing == 10#$indexed )); then
    status='OK'
  fi
  printf '%s %s %s /%s\n' "$core" "$status" "${indexed:-?}" "${existing:-?}"
}

for core in "${ascending_cores[@]}"; do
  print_core_status "$core"
done | (
  # Use the table options when this `column` accepts --version; otherwise
  # fall back to plain `column -t` with a header line prepended by hand.
  # The probe must not consume the rows waiting on stdin.
  if column --version </dev/null &>/dev/null; then
    column --table --table-columns CORE,STATUS,INDEX,DB --table-right INDEX
  else
    (echo CORE STATUS INDEX DB; cat) | column -t
  fi
)
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CORE STATUS INDEX DB
cdstub OK 0 /0
event OK 16 /16
release OK 32 /32
release-group OK 32 /32
series OK 34 /34
label OK 74 /74
place OK 82 /82
url OK 283 /283
instrument OK 945 /945
annotation OK 1197 /1197
work OK 1862 /1862
editor OK 1898 /1898
area OK 2015 /2015
tag OK 2119 /2119
artist OK 8172 /8172
recording OK 38617 /38617
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment