yvanzo · July 10, 2019 17:20
diff --git a/check-search-indexes.sh b/check-search-indexes.sh
 #!/bin/bash
 #
 # For each MusicBrainz Solr core, compare the number of documents that exist
 # in PostgreSQL with the number of documents indexed in Solr.
 # Takes no arguments.

 # Abort on the first failing command (-e) and on any unset variable (-u).
 set -e -u

 # Usage text, built one line per printf argument so that its content does
 # not depend on how the surrounding lines are indented.
 HELP=$(printf '%s\n' \
   "Usage: $0" \
   '' \
   'For each of MusicBrainz Solr cores/collections,' \
   'compare the count of existing documents in PostgreSQL,' \
   'with the count of indexed documents in Solr.')

 # Any argument is a usage error. Diagnostics go to stderr so that stdout
 # only ever carries the report itself.
 if [ $# -ne 0 ]; then
   echo "$0: wrong number of arguments" >&2
   echo "$HELP" >&2
   exit 1
 fi

 # Absolute path of the parent of the directory holding this script; the
 # docker-compose commands are run from there.
 MB_DOCKER_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)

 cd "$MB_DOCKER_ROOT"

 #######################################
 # Abort unless 'docker-compose ps' reports a service as 'Up'.
 # Arguments: $1 - docker-compose service name
 #            $2 - what cannot be done without it (used in the message)
 # Outputs:   an error message on stderr when the service is not up
 # Returns:   0 when the service is up; otherwise exits the script with 1
 #######################################
 require_running_service() {
   local service=$1 purpose=$2
   if ! docker-compose ps "$service" | grep -qw 'Up'; then
     echo "$0: cannot $purpose: '$service' is not a running docker-compose service" >&2
     exit 1
   fi
 }

 require_running_service db 'count existing documents'
 require_running_service indexer 'count indexed documents'

 # Count existing documents by core in PostgreSQL

 # SQL query giving the number of existing documents, keyed by core name.
 declare -A queries

 # The annotation count is the sum, over every *_annotation table, of the
 # number of distinct entities that table references.
 annotation_counts=()
 for entity in area artist event instrument label place \
   recording release release_group series work
 do
   annotation_counts+=("SELECT COUNT(DISTINCT $entity) AS annotation_count FROM ${entity}_annotation")
 done

 # UNION ALL, not UNION: each subquery yields a single row holding a count,
 # and plain UNION drops duplicate rows, so two tables that happen to have
 # the same count would be added only once.
 annotation_union=$(printf ' UNION ALL %s' "${annotation_counts[@]}")
 queries['annotation']="SELECT SUM(annotation_count) FROM (${annotation_union# UNION ALL }) AS total"

 queries['cdstub']="SELECT COUNT(id) FROM release_raw"

 # Every other core is compared with the row count of one table; the core
 # name is the table name with '_' replaced by '-'.
 for table in area artist editor event instrument label place \
   recording release release_group series tag url work
 do
   queries["${table//_/-}"]="SELECT COUNT(id) FROM $table"
 done

 # Number of existing documents reported by PostgreSQL, keyed by core name.
 declare -A counts

 POSTGRES_USER=musicbrainz
 POSTGRES_DATABASE=musicbrainz_db
 for core in "${!queries[@]}"
 do
   # -T disables pseudo-TTY allocation, so the command also runs when stdin
   # is not a terminal. Wrapping the query in COPY ... TO STDOUT makes psql
   # print the bare value. Carriage returns are still stripped in case the
   # transport adds any.
   counts["$core"]=$(docker-compose exec -T db \
     psql -U "$POSTGRES_USER" -d "$POSTGRES_DATABASE" \
       -c "COPY(${queries[$core]}) TO STDOUT" | tr -d '\r')

   # The pipeline's exit status is tr's, so a failing psql would go
   # unnoticed and leave an empty or garbage count behind: check the value.
   if [[ ! ${counts[$core]} =~ ^[0-9]+$ ]]; then
     echo "$0: cannot count existing documents for '$core': unexpected psql output '${counts[$core]}'" >&2
     exit 1
   fi
 done

 # Sort cores by ascending number of documents

 # Print one "<core> <count>" line per core, order the lines numerically on
 # the count, then keep only the core name. mapfile stores each resulting
 # line as one array element, so names are neither word-split nor expanded
 # as filename patterns on the way in.
 mapfile -t ascending_cores < <(
   for core in "${!counts[@]}"
   do
     printf '%s %s\n' "$core" "${counts[$core]}"
   done | sort -n -k2 | sed 's/ .*$//'
 )

 # Count indexed documents by core in Solr

 # Number of documents reported by Solr, keyed by core name.
 declare -A indexed_docs

 # Python 2 program run in the indexer container: reads the JSON status of
 # all cores on stdin and writes one "<core> <numDocs>" line per core.
 # It is kept free of line breaks and single quotes so that it can be
 # embedded in the quoted remote command below regardless of indentation.
 solr_cores_program='import sys, json; '
 solr_cores_program+='status = json.load(sys.stdin)["status"]; '
 solr_cores_program+='sys.stdout.write("".join("%s %s\n" % (name, status[name]["index"]["numDocs"]) for name in status))'

 # Shape of a usable line: a core name without blanks, one space, a number.
 solr_line_re='^([^[:space:]]+) ([0-9]+)$'

 # -T disables pseudo-TTY allocation, so the command also runs when stdin is
 # not a terminal. No sorting is needed: the lines only fill a lookup table.
 while IFS= read -r line
 do
   line=${line%$'\r'}
   if [[ $line =~ $solr_line_re ]]; then
     indexed_docs["${BASH_REMATCH[1]}"]=${BASH_REMATCH[2]}
   elif [[ -n $line ]]; then
     # Anything else (error text, for instance) is reported, not stored.
     echo "$0: ignoring unexpected line in Solr cores status: $line" >&2
   fi
 done < <(docker-compose exec -T indexer bash -c \
   "wget -q -O - http://search:8983/v2/cores | python2 -c '$solr_cores_program'")

 # Compare number of indexed docs with number of existing docs, by core

 # Emit one "<core> <status> <indexed> /<existing>" line per core, then lay
 # the lines out as a table.
 for core in "${ascending_cores[@]}"
 do
   # A count that is missing on either side is shown as '?' and reported as
   # a mismatch. Without the defaults, 'set -u' would abort this loop (which
   # runs in a pipeline subshell) and silently truncate the report.
   existing=${counts[$core]:-?}
   indexed=${indexed_docs[$core]:-?}
   if [[ $existing =~ ^[0-9]+$ && $indexed =~ ^[0-9]+$ ]] \
     && [ "$existing" -eq "$indexed" ]
   then
     echo "$core" "OK" "$indexed" "/$existing"
   else
     echo "$core" "--" "$indexed" "/$existing"
   fi
 done | (
   # When 'column' accepts --version, use its long options to name the
   # columns and right-align INDEX; otherwise prepend a header line and fall
   # back to plain 'column -t'.
   if column --version &>/dev/null
   then
     column --table --table-columns CORE,STATUS,INDEX,DB --table-right INDEX
   else
     (echo CORE STATUS INDEX DB; cat) | column -t
   fi
 )
diff --git a/small-sample-result.csv b/small-sample-result.csv
 CORE           STATUS  INDEX  DB
 cdstub         OK          0  /0
 event          OK         16  /16
 release        OK         32  /32
 release-group  OK         32  /32
 series         OK         34  /34
 label          OK         74  /74
 place          OK         82  /82
 url            OK        283  /283
 instrument     OK        945  /945
 annotation     OK       1197  /1197
 work           OK       1862  /1862
 editor         OK       1898  /1898
 area           OK       2015  /2015
 tag            OK       2119  /2119
 artist         OK       8172  /8172
 recording      OK      38617  /38617
	#!/bin/bash
	#
	# For each MusicBrainz Solr core, compare the number of documents that exist
	# in PostgreSQL with the number of documents indexed in Solr.
	# Takes no arguments.

	# Abort on the first failing command (-e) and on any unset variable (-u).
	set -e -u

	# Usage text, built one line per printf argument so that its content does
	# not depend on how the surrounding lines are indented.
	HELP=$(printf '%s\n' \
	  "Usage: $0" \
	  '' \
	  'For each of MusicBrainz Solr cores/collections,' \
	  'compare the count of existing documents in PostgreSQL,' \
	  'with the count of indexed documents in Solr.')

	# Any argument is a usage error. Diagnostics go to stderr so that stdout
	# only ever carries the report itself.
	if [ $# -ne 0 ]; then
	  echo "$0: wrong number of arguments" >&2
	  echo "$HELP" >&2
	  exit 1
	fi

	# Absolute path of the parent of the directory holding this script; the
	# docker-compose commands are run from there.
	MB_DOCKER_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)

	cd "$MB_DOCKER_ROOT"

	#######################################
	# Abort unless 'docker-compose ps' reports a service as 'Up'.
	# Arguments: $1 - docker-compose service name
	#            $2 - what cannot be done without it (used in the message)
	# Outputs:   an error message on stderr when the service is not up
	# Returns:   0 when the service is up; otherwise exits the script with 1
	#######################################
	require_running_service() {
	  local service=$1 purpose=$2
	  if ! docker-compose ps "$service" | grep -qw 'Up'; then
	    echo "$0: cannot $purpose: '$service' is not a running docker-compose service" >&2
	    exit 1
	  fi
	}

	require_running_service db 'count existing documents'
	require_running_service indexer 'count indexed documents'

	# Count existing documents by core in PostgreSQL

	# SQL query giving the number of existing documents, keyed by core name.
	declare -A queries

	# The annotation count is the sum, over every *_annotation table, of the
	# number of distinct entities that table references.
	annotation_counts=()
	for entity in area artist event instrument label place \
	  recording release release_group series work
	do
	  annotation_counts+=("SELECT COUNT(DISTINCT $entity) AS annotation_count FROM ${entity}_annotation")
	done

	# UNION ALL, not UNION: each subquery yields a single row holding a count,
	# and plain UNION drops duplicate rows, so two tables that happen to have
	# the same count would be added only once.
	annotation_union=$(printf ' UNION ALL %s' "${annotation_counts[@]}")
	queries['annotation']="SELECT SUM(annotation_count) FROM (${annotation_union# UNION ALL }) AS total"

	queries['cdstub']="SELECT COUNT(id) FROM release_raw"

	# Every other core is compared with the row count of one table; the core
	# name is the table name with '_' replaced by '-'.
	for table in area artist editor event instrument label place \
	  recording release release_group series tag url work
	do
	  queries["${table//_/-}"]="SELECT COUNT(id) FROM $table"
	done

	# Number of existing documents reported by PostgreSQL, keyed by core name.
	declare -A counts

	POSTGRES_USER=musicbrainz
	POSTGRES_DATABASE=musicbrainz_db
	for core in "${!queries[@]}"
	do
	  # -T disables pseudo-TTY allocation, so the command also runs when stdin
	  # is not a terminal. Wrapping the query in COPY ... TO STDOUT makes psql
	  # print the bare value. Carriage returns are still stripped in case the
	  # transport adds any.
	  counts["$core"]=$(docker-compose exec -T db \
	    psql -U "$POSTGRES_USER" -d "$POSTGRES_DATABASE" \
	      -c "COPY(${queries[$core]}) TO STDOUT" | tr -d '\r')

	  # The pipeline's exit status is tr's, so a failing psql would go
	  # unnoticed and leave an empty or garbage count behind: check the value.
	  if [[ ! ${counts[$core]} =~ ^[0-9]+$ ]]; then
	    echo "$0: cannot count existing documents for '$core': unexpected psql output '${counts[$core]}'" >&2
	    exit 1
	  fi
	done

	# Sort cores by ascending number of documents

	# Print one "<core> <count>" line per core, order the lines numerically on
	# the count, then keep only the core name. mapfile stores each resulting
	# line as one array element, so names are neither word-split nor expanded
	# as filename patterns on the way in.
	mapfile -t ascending_cores < <(
	  for core in "${!counts[@]}"
	  do
	    printf '%s %s\n' "$core" "${counts[$core]}"
	  done | sort -n -k2 | sed 's/ .*$//'
	)

	# Count indexed documents by core in Solr

	# Number of documents reported by Solr, keyed by core name.
	declare -A indexed_docs

	# Python 2 program run in the indexer container: reads the JSON status of
	# all cores on stdin and writes one "<core> <numDocs>" line per core.
	# It is kept free of line breaks and single quotes so that it can be
	# embedded in the quoted remote command below regardless of indentation.
	solr_cores_program='import sys, json; '
	solr_cores_program+='status = json.load(sys.stdin)["status"]; '
	solr_cores_program+='sys.stdout.write("".join("%s %s\n" % (name, status[name]["index"]["numDocs"]) for name in status))'

	# Shape of a usable line: a core name without blanks, one space, a number.
	solr_line_re='^([^[:space:]]+) ([0-9]+)$'

	# -T disables pseudo-TTY allocation, so the command also runs when stdin is
	# not a terminal. No sorting is needed: the lines only fill a lookup table.
	while IFS= read -r line
	do
	  line=${line%$'\r'}
	  if [[ $line =~ $solr_line_re ]]; then
	    indexed_docs["${BASH_REMATCH[1]}"]=${BASH_REMATCH[2]}
	  elif [[ -n $line ]]; then
	    # Anything else (error text, for instance) is reported, not stored.
	    echo "$0: ignoring unexpected line in Solr cores status: $line" >&2
	  fi
	done < <(docker-compose exec -T indexer bash -c \
	  "wget -q -O - http://search:8983/v2/cores | python2 -c '$solr_cores_program'")

	# Compare number of indexed docs with number of existing docs, by core

	# Emit one "<core> <status> <indexed> /<existing>" line per core, then lay
	# the lines out as a table.
	for core in "${ascending_cores[@]}"
	do
	  # A count that is missing on either side is shown as '?' and reported as
	  # a mismatch. Without the defaults, 'set -u' would abort this loop (which
	  # runs in a pipeline subshell) and silently truncate the report.
	  existing=${counts[$core]:-?}
	  indexed=${indexed_docs[$core]:-?}
	  if [[ $existing =~ ^[0-9]+$ && $indexed =~ ^[0-9]+$ ]] \
	    && [ "$existing" -eq "$indexed" ]
	  then
	    echo "$core" "OK" "$indexed" "/$existing"
	  else
	    echo "$core" "--" "$indexed" "/$existing"
	  fi
	done | (
	  # When 'column' accepts --version, use its long options to name the
	  # columns and right-align INDEX; otherwise prepend a header line and fall
	  # back to plain 'column -t'.
	  if column --version &>/dev/null
	  then
	    column --table --table-columns CORE,STATUS,INDEX,DB --table-right INDEX
	  else
	    (echo CORE STATUS INDEX DB; cat) | column -t
	  fi
	)
	CORE STATUS INDEX DB
	cdstub OK 0 /0
	event OK 16 /16
	release OK 32 /32
	release-group OK 32 /32
	series OK 34 /34
	label OK 74 /74
	place OK 82 /82
	url OK 283 /283
	instrument OK 945 /945
	annotation OK 1197 /1197
	work OK 1862 /1862
	editor OK 1898 /1898
	area OK 2015 /2015
	tag OK 2119 /2119
	artist OK 8172 /8172
	recording OK 38617 /38617