Skip to content

Commit 980bd64

Browse files
committed
Kubernetes sandbox reliability updates.
1 parent e2f97f8 commit 980bd64

File tree

7 files changed: +37 -43 lines changed

helm/vitess/templates/_vttablet.tpl

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,9 +176,10 @@ volumes:
176176
{{- $tablet := index . 4 -}}
177177
{{- with $tablet.vttablet -}}
178178
{{- $0 := $.Values.vttablet -}}
179+
{{- $cellClean := $cell.name | replace "_" "-" -}}
179180
{{- $keyspaceClean := $keyspace.name | replace "_" "-" -}}
180181
{{- $shardClean := include "format-shard-name" $shard.name -}}
181-
{{- $setName := printf "%s-%s-%s" $keyspaceClean $shardClean $tablet.type | lower -}}
182+
{{- $setName := printf "%s-%s-%s-%s" $cellClean $keyspaceClean $shardClean $tablet.type | lower -}}
182183
{{- $uid := "$(cat $VTDATAROOT/init/tablet-uid)" }}
183184
# vttablet StatefulSet
184185
apiVersion: apps/v1beta1
@@ -193,6 +194,7 @@ spec:
193194
labels:
194195
app: vitess
195196
component: vttablet
197+
cell: {{$cellClean | quote}}
196198
keyspace: {{$keyspace.name | quote}}
197199
shard: {{$shardClean | quote}}
198200
type: {{$tablet.type | quote}}
@@ -222,6 +224,7 @@ spec:
222224
{{- $cell := index . 1 -}}
223225
{{- $keyspace := index . 2 -}}
224226
{{- $shard := index . 3 -}}
227+
{{- $shardClean := include "format-shard-name" $shard.name -}}
225228
{{- $tablet := index . 4 -}}
226229
{{- $uid := index . 5 -}}
227230
{{- with $tablet.vttablet -}}
@@ -235,7 +238,7 @@ metadata:
235238
app: vitess
236239
component: vttablet
237240
keyspace: {{$keyspace.name | quote}}
238-
shard: {{$shard.name | quote}}
241+
shard: {{$shardClean | quote}}
239242
type: {{$tablet.type | quote}}
240243
annotations:
241244
pod.beta.kubernetes.io/init-containers: '[

test/cluster/k8s_environment.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ def restart_mysql_task(self, tablet_name, task_name, is_alloc=False):
184184
time.sleep(60)
185185

186186
# Create the pod again.
187-
os.system('cat %s | kubectl create -f -' % tmpfile.name)
187+
os.system('cat %s | kubectl create --namespace=%s -f -' % (
188+
tmpfile.name, self.cluster_name))
188189
while time.time() - start_time < 120:
189190
logging.info('Waiting for pod %s to be running', vttablet_pod_name)
190191
pod = subprocess.check_output(

test/cluster/sandbox/initial_reparent.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
import json
55
import logging
66
import optparse
7+
import sys
78
import time
89
from vtproto import topodata_pb2
910
from vttest import sharding_utils
10-
import sandbox_utils
1111
import vtctl_sandbox
1212

1313

@@ -23,24 +23,29 @@ def initial_reparent(keyspace, master_cell, num_shards, namespace, timeout_s):
2323
"""Performs the first reparent."""
2424
successfully_reparented = []
2525
master_tablets = {}
26+
start_time = time.time()
27+
logging.info('Finding tablets to reparent to.')
2628
while len(master_tablets) < num_shards:
29+
if time.time() - start_time > timeout_s:
30+
logging.fatal('Timed out waiting to find a replica tablet')
31+
return 1
2732
for shard_name in sharding_utils.get_shard_names(num_shards):
28-
shard_name = sandbox_utils.fix_shard_name(shard_name)
33+
if shard_name in master_tablets:
34+
continue
2935
tablets = vtctl_sandbox.execute_vtctl_command(
30-
['ListShardTablets', '%s/%s' % (
31-
keyspace, sandbox_utils.fix_shard_name(shard_name))],
36+
['ListShardTablets', '%s/%s' % (keyspace, shard_name)],
3237
namespace=namespace)[0].split('\n')
3338
tablets = [x.split(' ') for x in tablets if x]
3439
potential_masters = [
3540
x[0] for x in tablets if x[3] == 'replica'
3641
and x[0].split('-')[0] == master_cell]
3742
if potential_masters:
3843
master_tablets[shard_name] = potential_masters[0]
44+
logging.info(
45+
'%s selected for shard %s', potential_masters[0], shard_name)
3946

40-
start_time = time.time()
4147
while time.time() - start_time < timeout_s:
4248
for shard_name in sharding_utils.get_shard_names(num_shards):
43-
shard_name = sandbox_utils.fix_shard_name(shard_name)
4449
master_tablet_id = master_tablets[shard_name]
4550
if is_master(master_tablet_id, namespace):
4651
logging.info('Tablet %s is the master of %s/%s.',
@@ -55,8 +60,9 @@ def initial_reparent(keyspace, master_cell, num_shards, namespace, timeout_s):
5560
master_tablet_id], namespace=namespace, timeout_s=5)
5661
if len(successfully_reparented) == num_shards:
5762
logging.info('Done with initial reparent.')
58-
return
63+
return 0
5964
logging.fatal('Timed out waiting for initial reparent.')
65+
return 1
6066

6167

6268
def main():
@@ -73,9 +79,9 @@ def main():
7379
logging.getLogger().setLevel(logging.INFO)
7480

7581
options, _ = parser.parse_args()
76-
initial_reparent(options.keyspace, options.master_cell,
77-
options.shard_count, options.namespace,
78-
options.timeout)
82+
sys.exit(initial_reparent(options.keyspace, options.master_cell,
83+
options.shard_count, options.namespace,
84+
options.timeout))
7985

8086

8187
if __name__ == '__main__':

test/cluster/sandbox/sandbox_utils.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,6 @@
55
import random
66

77

8-
def fix_shard_name(shard_name):
9-
"""Kubernetes doesn't allow '-' in the beginning or end of attributes.
10-
11-
Instead, replace them with an x.
12-
13-
Example: -80 becomes x80, 80- becomes 80x.
14-
15-
Args:
16-
shard_name: string, A standard shard name (like -80).
17-
18-
Returns:
19-
A fixed shard name suitable for kubernetes (string).
20-
"""
21-
if shard_name.startswith('-'):
22-
return 'x%s' % shard_name[1:]
23-
if shard_name.endswith('-'):
24-
return '%sx' % shard_name[:-1]
25-
return shard_name
26-
27-
288
def create_log_file(log_dir, filename):
299
"""Create a log file.
3010

test/cluster/sandbox/subprocess_component.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ def start(self):
2828
self.log_dir, '%s.INFO' % self.name)
2929
errorfile = sandbox_utils.create_log_file(
3030
self.log_dir, '%s.ERROR' % self.name)
31-
subprocess.call(['./%s' % self.script] + script_args, stdout=infofile,
32-
stderr=errorfile)
31+
subprocess.check_call(
32+
['./%s' % self.script] + script_args, stdout=infofile,
33+
stderr=errorfile)
3334
logging.info('Done.')
3435
except subprocess.CalledProcessError as error:
3536
raise sandbox.SandboxError(
36-
'Subprocess %s returned errorcode %d, result %s.' % (
37-
self.script, error.returncode, error.output))
37+
'Subprocess %s returned errorcode %d, find log at %s.' % (
38+
self.script, error.returncode, errorfile.name))
3839
finally:
3940
if infofile:
4041
infofile.close()

test/cluster/sandbox/vitess_kubernetes_sandbox.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from vttest import sharding_utils
1212

1313
import sandbox
14-
import sandbox_utils
1514
import sandlet
1615
import subprocess_component
1716

@@ -92,7 +91,6 @@ def _generate_helm_keyspaces(self):
9291

9392
for shard_index, shard_name in enumerate(
9493
sharding_utils.get_shard_names(ks['shard_count'])):
95-
shard_name = sandbox_utils.fix_shard_name(shard_name)
9694
shard = dict(
9795
name=shard_name,
9896
tablets=[dict(
@@ -221,7 +219,7 @@ def generate_helm_sandlet(self):
221219
'wait_for_mysql_%s' % name, self.name, 'wait_for_mysql.py',
222220
self.log_dir, namespace=self.name,
223221
cells=','.join(self.app_options.cells),
224-
tablet_count=(shard_count * (
222+
tablet_count=(shard_count * len(self.app_options.cells) * (
225223
keyspace['replica_count'] + keyspace['rdonly_count'])))
226224
wait_for_mysql_subprocess.dependencies = ['helm']
227225
initial_reparent_subprocess = subprocess_component.Subprocess(

test/cluster/sandbox/wait_for_mysql.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import optparse
66
import re
7+
import sys
78
import time
89
import vtctl_sandbox
910

@@ -30,6 +31,8 @@ def main():
3031
parser.add_option('-c', '--cells', help='Comma separated list of cells')
3132
parser.add_option('-t', '--tablet_count',
3233
help='Total number of expected tablets', type=int)
34+
parser.add_option('-w', '--wait', help='Max wait time (s)', type=int,
35+
default=300)
3336
logging.getLogger().setLevel(logging.INFO)
3437

3538
options, _ = parser.parse_args()
@@ -42,17 +45,18 @@ def main():
4245

4346
# Do this in a loop as the output of ListAllTablets may not be parseable
4447
# until all tablets have been started.
45-
while time.time() - start_time < 300 and len(tablets) < options.tablet_count:
48+
while (time.time() - start_time < options.wait and
49+
len(tablets) < options.tablet_count):
4650
tablets = get_all_tablets(options.cells, options.namespace)
4751
logging.info('Expecting %d tablets, found %d tablets',
4852
options.tablet_count, len(tablets))
4953

5054
start_time = time.time()
51-
while time.time() - start_time < 300:
55+
while time.time() - start_time < options.wait:
5256
for tablet in [t for t in tablets if t not in good_tablets]:
5357
_, success = vtctl_sandbox.execute_vtctl_command(
5458
['ExecuteFetchAsDba', tablet, 'show databases'],
55-
namespace=options.namespace)
59+
namespace=options.namespace, timeout_s=1)
5660
if success:
5761
good_tablets.append(tablet)
5862
logging.info('%d of %d tablets healthy.', len(good_tablets), len(tablets))
@@ -62,6 +66,7 @@ def main():
6266
break
6367
else:
6468
logging.warn('Timed out waiting for tablets to be ready.')
69+
sys.exit(1)
6570

6671

6772
if __name__ == '__main__':

0 commit comments

Comments
 (0)