Test Case:
export SKIP_VTADMIN=1
pushd ${VTROOT}/examples/local
sed -i "" '42i\
--enable_buffer \\
' ../common/scripts/vtgate-up.sh
sed -i "" -e "s/semi_sync/none/g" 101_initial_cluster.sh
./101_initial_cluster.sh
replicauid=$(($(vtctlclient -- ListAllTablets --keyspace commerce --tablet_type replica | awk '{print $1}' | cut -d- -f2)+0))
primaryuid=$(($(vtctlclient -- ListAllTablets --keyspace commerce --tablet_type primary | awk '{print $1}' | cut -d- -f2)+0))
echo "Starting primary tablet UID: ${primaryuid}"
echo "Starting replicat tablet UID: ${replicauid}"
echo "Starting inserts at $(date)..."
rm /tmp/query-results.txt
while true; do date >> /tmp/query-results.txt; mysql commerce -e "insert into customer (email) values ('${RANDOM}[email protected]')" >> /tmp/query-results.txt 2>&1; done &
echo "Original tablet status:"
vtctldclient GetTablets
echo "Pre TER buffering stats:"
curl -s localhost:15001/debug/vars | grep -E "^\"Buffer"
echo "Doing TER at $(date)..."
command mysql -u root --socket=${VTDATAROOT}/vt_0000000${replicauid}/mysql.sock --binary-as-hex=false -e "set global super_read_only=OFF; set global read_only=OFF"
command mysql -u root --socket=${VTDATAROOT}/vt_0000000${primaryuid}/mysql.sock --binary-as-hex=false -e "set global super_read_only=ON; set global read_only=ON"
vtctldclient TabletExternallyReparented zone1-${replicauid}
echo "Completed TER at $(date)..."
echo "New tablet status:"
vtctldclient GetTablets
echo "Sleeping for 40 seconds..."
sleep 40
echo "Stopping inserts..."
kill %1
echo "Rows created:"
mysql commerce -e "select count(*) from customer"
echo "Post TER buffering stats:"
curl -s localhost:15001/debug/vars | grep -E "^\"Buffer"
echo "Errors seen in the log file:"
grep -Ei "error|fail" /tmp/query-results.txt
Results (no meaningful difference between v16 and v18/main):
Starting primary tablet UID: 100
Starting replicat tablet UID: 101
Starting inserts at Thu Aug 24 21:00:54 EDT 2023...
[1] 14281
Original tablet status:
zone1-0000000100 commerce 0 primary localhost:15100 localhost:17100 [] 2023-08-25T01:00:28Z
zone1-0000000101 commerce 0 replica localhost:15101 localhost:17101 [] <null>
zone1-0000000102 commerce 0 rdonly localhost:15102 localhost:17102 [] <null>
Pre TER buffering stats:
"BufferFailoverDurationSumMs": {"commerce.0": 0},
"BufferLastFailoverDurationMs": {},
"BufferLastRequestsDryRunMax": {},
"BufferLastRequestsInFlightMax": {},
"BufferRequestsBuffered": {"commerce.0": 0},
"BufferRequestsBufferedDryRun": {"commerce.0": 0},
"BufferRequestsDrained": {"commerce.0": 0},
"BufferRequestsEvicted": {"commerce.0.ContextDone": 0, "commerce.0.BufferFull": 0, "commerce.0.WindowExceeded": 0},
"BufferRequestsSkipped": {"commerce.0.BufferFull": 0, "commerce.0.Disabled": 0, "commerce.0.Shutdown": 0, "commerce.0.LastReparentTooRecent": 0, "commerce.0.LastFailoverTooRecent": 0},
"BufferSize": 1000,
"BufferStarts": {"commerce.0": 0},
"BufferStops": {"commerce.0.ReshardingComplete": 0, "commerce.0.NewPrimarySeen": 0, "commerce.0.MaxDurationExceeded": 0, "commerce.0.Shutdown": 0},
"BufferUtilizationDryRunSum": {"commerce.0": 0},
"BufferUtilizationSum": {"commerce.0": 0},
Doing TER at Thu Aug 24 21:00:54 EDT 2023...
{
"keyspace": "commerce",
"shard": "0",
"new_primary": {
"cell": "zone1",
"uid": 101
},
"old_primary": {
"cell": "zone1",
"uid": 100
}
}
Completed TER at Thu Aug 24 21:00:54 EDT 2023...
New tablet status:
zone1-0000000100 commerce 0 replica localhost:15100 localhost:17100 [] <null>
zone1-0000000101 commerce 0 primary localhost:15101 localhost:17101 [] 2023-08-25T01:00:54Z
zone1-0000000102 commerce 0 rdonly localhost:15102 localhost:17102 [] <null>
Sleeping for 40 seconds...
Stopping inserts...
Rows created:
[1] 14281 terminated while true; do; date >> /tmp/query-results.txt; mysql commerce -e >> 2>&1;
+----------+
| count(*) |
+----------+
| 9048 |
+----------+
Post TER buffering stats:
"BufferFailoverDurationSumMs": {"commerce.0": 47},
"BufferLastFailoverDurationMs": {"commerce.0": 47},
"BufferLastRequestsDryRunMax": {"commerce.0": 0},
"BufferLastRequestsInFlightMax": {"commerce.0": 2},
"BufferRequestsBuffered": {"commerce.0": 2},
"BufferRequestsBufferedDryRun": {"commerce.0": 0},
"BufferRequestsDrained": {"commerce.0": 2},
"BufferRequestsEvicted": {"commerce.0.ContextDone": 0, "commerce.0.BufferFull": 0, "commerce.0.WindowExceeded": 0},
"BufferRequestsSkipped": {"commerce.0.Shutdown": 0, "commerce.0.LastReparentTooRecent": 0, "commerce.0.LastFailoverTooRecent": 0, "commerce.0.BufferFull": 0, "commerce.0.Disabled": 0},
"BufferSize": 1000,
"BufferStarts": {"commerce.0": 1},
"BufferStops": {"commerce.0.ReshardingComplete": 0, "commerce.0.NewPrimarySeen": 1, "commerce.0.MaxDurationExceeded": 0, "commerce.0.Shutdown": 0},
"BufferUtilizationDryRunSum": {"commerce.0": 0},
"BufferUtilizationSum": {"commerce.0": 0},
Errors seen in the log file:
vtgate logs:
❯ cat ${VTDATAROOT}/tmp/vtgate.INFO
Log file created at: 2023/08/24 21:00:29
Running on machine: pslord
Binary: Built with gc go1.20.6 for darwin/arm64
Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
I0824 21:00:29.121545 11257 servenv.go:131] Version: 16.0.5-SNAPSHOT (Git revision c934b2b49e155fdf72a2e1ee2aa674df8400437d branch 'release-16.0') built on Thu Aug 24 20:08:09 EDT 2023 by [email protected] using go1.20.6 darwin/arm64
I0824 21:00:29.130811 11257 healthcheck.go:312] loading tablets for cells: zone1
I0824 21:00:29.130817 11257 healthcheck.go:333] Setting up healthcheck for cell: zone1
I0824 21:00:29.130852 11257 flags.go:180] vtgate buffer enabled. PRIMARY requests will be buffered during detected failovers.
I0824 21:00:29.130881 11257 keyspace_events.go:84] started watching keyspace events in "zone1"
I0824 21:00:29.130905 11257 tabletgateway.go:186] Gateway waiting for serving tablets of types [PRIMARY REPLICA] ...
I0824 21:00:29.134399 11257 keyspace_events.go:385] created dedicated watcher for keyspace zone1/commerce
I0824 21:00:29.135315 11257 healthcheck.go:375] Adding tablet to healthcheck: alias:{cell:"zone1" uid:102} hostname:"localhost" port_map:{key:"grpc" value:16102} port_map:{key:"vt" value:15102} keyspace:"commerce" shard:"0" type:RDONLY mysql_hostname:"localhost" mysql_port:17102 db_server_version:"8.0.31" default_conn_collation:255
I0824 21:00:29.135354 11257 healthcheck.go:375] Adding tablet to healthcheck: alias:{cell:"zone1" uid:101} hostname:"localhost" port_map:{key:"grpc" value:16101} port_map:{key:"vt" value:15101} keyspace:"commerce" shard:"0" type:REPLICA mysql_hostname:"localhost" mysql_port:17101 db_server_version:"8.0.31" default_conn_collation:255
I0824 21:00:29.135362 11257 healthcheck.go:375] Adding tablet to healthcheck: alias:{cell:"zone1" uid:100} hostname:"localhost" port_map:{key:"grpc" value:16100} port_map:{key:"vt" value:15100} keyspace:"commerce" shard:"0" type:PRIMARY mysql_hostname:"localhost" mysql_port:17100 primary_term_start_time:{seconds:1692925228 nanoseconds:331501000} db_server_version:"8.0.31" default_conn_collation:255
I0824 21:00:29.138808 11257 tablet_health_check.go:111] HealthCheckUpdate(Serving State): tablet: zone1-101 (localhost) serving false => true for commerce/0 (REPLICA) reason: healthCheck update
I0824 21:00:29.138808 11257 tablet_health_check.go:111] HealthCheckUpdate(Serving State): tablet: zone1-100 (localhost) serving false => true for commerce/0 (PRIMARY) reason: healthCheck update
I0824 21:00:29.138813 11257 tablet_health_check.go:111] HealthCheckUpdate(Serving State): tablet: zone1-102 (localhost) serving false => true for commerce/0 (RDONLY) reason: healthCheck update
I0824 21:00:29.138825 11257 keyspace_events.go:272] keyspace event resolved: commerce/commerce is now consistent (serving: true)
I0824 21:00:29.138852 11257 shard_buffer.go:483] disruption in shard commerce/0 resolved (serving: true)
I0824 21:00:29.237741 11257 tabletgateway.go:194] Waiting for tablets completed
I0824 21:00:29.237771 11257 vtgate.go:161] Transaction mode: 'MULTI'
I0824 21:00:29.741141 11257 tracker.go:125] finished loading schema for keyspace commerce. Found 9 columns in total across the tables
I0824 21:00:29.741920 11257 vschema_manager.go:103] Received vschema update
I0824 21:00:29.742203 11257 vschema_manager.go:103] Received vschema update
I0824 21:00:29.742476 11257 streamlog.go:208] Streaming logs from VTGate at /debug/querylog.
I0824 21:00:29.743554 11257 grpc_server.go:209] Setting grpc max message size to 16777216
I0824 21:00:29.743559 11257 grpc_server.go:384] Building interceptors with 0 unary interceptors and 0 stream interceptors
I0824 21:00:29.743723 11257 service_map.go:66] Registering vtgateservice for grpc, disable it with -grpc-vtgateservice service_map parameter
I0824 21:00:29.744130 11257 tracker.go:158] Starting schema tracking
I0824 21:00:29.744210 11257 auth_server_clientcert.go:46] Not configuring AuthServerClientCert because mysql_server_ssl_ca is empty
I0824 21:00:29.744326 11257 auth_server_ldap.go:68] Not configuring AuthServerLdap because mysql_ldap_auth_config_file and mysql_ldap_auth_config_string are empty
I0824 21:00:29.744340 11257 auth_server_static.go:99] Not configuring AuthServerStatic, as mysql_auth_server_static_file and mysql_auth_server_static_string are empty
I0824 21:00:29.744351 11257 auth_server_vault.go:84] Not configuring AuthServerVault, as --mysql_auth_vault_addr is empty.
W0824 21:00:29.744484 11257 plugin_mysql_server.go:530] Found existent socket when trying to create new unix mysql listener: /tmp/mysql.sock, attempting to clean up
I0824 21:00:29.744739 11257 statsd.go:81] statsdAddress is empty
I0824 21:00:29.745794 11257 grpc_server.go:286] Listening for gRPC calls on port 15991
I0824 21:00:29.745822 11257 unix_socket.go:37] Not listening on socket file
I0824 21:00:29.768431 11257 vschema_manager.go:103] Received vschema update
I0824 21:00:54.207689 11257 shard_buffer.go:282] Starting buffering for shard: commerce/0 (window: 10s, size: 1000, max failover duration: 20s) (A failover was detected by this seen error: vttablet: rpc error: code = Code(17) desc = The MySQL server is running with the --read-only option so it cannot execute this statement (errno 1290) (sqlstate HY000) (CallerID: userData1).)
E0824 21:00:54.254369 11257 healthcheck.go:547] Adding 1 to PrimaryPromoted counter for target: keyspace:"commerce" shard:"0" tablet_type:REPLICA, tablet: zone1-0000000101, tabletType: PRIMARY
I0824 21:00:54.254890 11257 keyspace_events.go:272] keyspace event resolved: commerce/commerce is now consistent (serving: true)
I0824 21:00:54.254901 11257 shard_buffer.go:483] disruption in shard commerce/0 resolved (serving: true)
I0824 21:00:54.254910 11257 shard_buffer.go:572] Stopping buffering for shard: commerce/0 after: 0.0 seconds due to: a primary promotion has been detected. Draining 2 buffered requests now.
I0824 21:00:54.262834 11257 shard_buffer.go:597] Draining finished for shard: commerce/0 Took: 7.899625ms for: 2 requests.
I0824 21:00:54.274248 11257 tablet_health_check.go:111] HealthCheckUpdate(Serving State): tablet: zone1-100 (localhost) serving true => false for commerce/0 (REPLICA) reason: healthCheck update error: vttablet error: no replication status (errno 100) (sqlstate HY000)
I0824 21:00:59.275036 11257 tablet_health_check.go:111] HealthCheckUpdate(Serving State): tablet: zone1-100 (localhost) serving false => true for commerce/0 (REPLICA) reason: healthCheck update
I thought that it might be due to the tablets involved being in different cells, so starting with this diff:
diff --git a/examples/common/scripts/vtgate-up.sh b/examples/common/scripts/vtgate-up.sh
index 03b85869e5..671a748c67 100755
--- a/examples/common/scripts/vtgate-up.sh
+++ b/examples/common/scripts/vtgate-up.sh
@@ -35,11 +35,13 @@ vtgate \
--mysql_server_port $mysql_server_port \
--mysql_server_socket_path $mysql_server_socket_path \
--cell $cell \
- --cells_to_watch $cell \
+ --cells_to_watch 'zone1,zone2' \
--tablet_types_to_wait PRIMARY,REPLICA \
--service_map 'grpc-vtgateservice' \
--pid_file $VTDATAROOT/tmp/vtgate.pid \
+ --enable_buffer \
--mysql_auth_server_impl none \
+ --min_number_serving_vttablets=1 \
> $VTDATAROOT/tmp/vtgate.out 2>&1 &
# Block waiting for vtgate to be listening
diff --git a/examples/local/101_initial_cluster.sh b/examples/local/101_initial_cluster.sh
index ffa24d7f98..41562e2d2f 100755
--- a/examples/local/101_initial_cluster.sh
+++ b/examples/local/101_initial_cluster.sh
@@ -33,14 +33,20 @@ fi
# start vtctld
CELL=zone1 ../common/scripts/vtctld-up.sh
+vtctldclient AddCellInfo --root /vitess/zone2 --server-address "${ETCD_SERVER}" zone2
+
# start vttablets for keyspace commerce
for i in 100 101 102; do
- CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-up.sh
- CELL=zone1 KEYSPACE=commerce TABLET_UID=$i ../common/scripts/vttablet-up.sh
+ tcell=zone1
+ if [[ ${i}%2 -ne 0 ]]; then
+ tcell=zone2
+ fi
+ CELL=${tcell} TABLET_UID=$i ../common/scripts/mysqlctl-up.sh
+ CELL=${tcell} KEYSPACE=commerce TABLET_UID=$i ../common/scripts/vttablet-up.sh
done
# set the correct durability policy for the keyspace
-vtctldclient --server localhost:15999 SetKeyspaceDurabilityPolicy --durability-policy=semi_sync commerce || fail "Failed to set keyspace durability policy on the commerce keyspace"
+vtctldclient --server localhost:15999 SetKeyspaceDurabilityPolicy --durability-policy=none commerce || fail "Failed to set keyspace durability policy on the commerce keyspace"
# start vtorc
../common/scripts/vtorc-up.sh
The results were essentially unchanged:
Starting primary tablet UID: 100
Starting replicat tablet UID: 101
Starting inserts at Thu Aug 24 21:51:38 EDT 2023...
[1] 24131
Original tablet status:
zone2-0000000101 commerce 0 replica localhost:15101 localhost:17101 [] <null>
zone1-0000000100 commerce 0 primary localhost:15100 localhost:17100 [] 2023-08-25T01:50:42Z
zone1-0000000102 commerce 0 rdonly localhost:15102 localhost:17102 [] <null>
Pre TER buffering stats:
"BufferFailoverDurationSumMs": {"commerce.0": 0},
"BufferLastFailoverDurationMs": {},
"BufferLastRequestsDryRunMax": {},
"BufferLastRequestsInFlightMax": {},
"BufferRequestsBuffered": {"commerce.0": 0},
"BufferRequestsBufferedDryRun": {"commerce.0": 0},
"BufferRequestsDrained": {"commerce.0": 0},
"BufferRequestsEvicted": {"commerce.0.ContextDone": 0, "commerce.0.BufferFull": 0, "commerce.0.WindowExceeded": 0},
"BufferRequestsSkipped": {"commerce.0.BufferFull": 0, "commerce.0.Disabled": 0, "commerce.0.Shutdown": 0, "commerce.0.LastReparentTooRecent": 0, "commerce.0.LastFailoverTooRecent": 0},
"BufferSize": 1000,
"BufferStarts": {"commerce.0": 0},
"BufferStops": {"commerce.0.MaxDurationExceeded": 0, "commerce.0.Shutdown": 0, "commerce.0.ReshardingComplete": 0, "commerce.0.NewPrimarySeen": 0},
"BufferUtilizationDryRunSum": {"commerce.0": 0},
"BufferUtilizationSum": {"commerce.0": 0},
Doing TER at Thu Aug 24 21:51:38 EDT 2023...
{
"keyspace": "commerce",
"shard": "0",
"new_primary": {
"cell": "zone2",
"uid": 101
},
"old_primary": {
"cell": "zone1",
"uid": 100
}
}
Completed TER at Thu Aug 24 21:51:38 EDT 2023...
New tablet status:
zone2-0000000101 commerce 0 primary localhost:15101 localhost:17101 [] 2023-08-25T01:51:38Z
zone1-0000000100 commerce 0 replica localhost:15100 localhost:17100 [] <null>
zone1-0000000102 commerce 0 rdonly localhost:15102 localhost:17102 [] <null>
Sleeping for 40 seconds...
Stopping inserts...
Rows created:
[1] + 24131 terminated while true; do; date >> /tmp/query-results.txt; mysql commerce -e >> 2>&1;
+----------+
| count(*) |
+----------+
| 2617 |
+----------+
Post TER buffering stats:
"BufferFailoverDurationSumMs": {"commerce.0": 32},
"BufferLastFailoverDurationMs": {"commerce.0": 32},
"BufferLastRequestsDryRunMax": {"commerce.0": 0},
"BufferLastRequestsInFlightMax": {"commerce.0": 1},
"BufferRequestsBuffered": {"commerce.0": 1},
"BufferRequestsBufferedDryRun": {"commerce.0": 0},
"BufferRequestsDrained": {"commerce.0": 1},
"BufferRequestsEvicted": {"commerce.0.BufferFull": 0, "commerce.0.WindowExceeded": 0, "commerce.0.ContextDone": 0},
"BufferRequestsSkipped": {"commerce.0.BufferFull": 0, "commerce.0.Disabled": 0, "commerce.0.Shutdown": 0, "commerce.0.LastReparentTooRecent": 0, "commerce.0.LastFailoverTooRecent": 0},
"BufferSize": 1000,
"BufferStarts": {"commerce.0": 1},
"BufferStops": {"commerce.0.ReshardingComplete": 0, "commerce.0.NewPrimarySeen": 1, "commerce.0.MaxDurationExceeded": 0, "commerce.0.Shutdown": 0},
"BufferUtilizationDryRunSum": {"commerce.0": 0},
"BufferUtilizationSum": {"commerce.0": 0},
Errors seen in the log file: