Skip to content

Commit e7d11ff

Browse files
authored
Merge pull request vitessio#2626 from thompsonja/sandbox_updates
Kubernetes sandbox reliability updates.
2 parents 1b10471 + 635af5a commit e7d11ff

File tree

8 files changed

+71
-62
lines changed

8 files changed

+71
-62
lines changed

helm/vitess/templates/_helpers.tpl

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,17 @@
1414
{{- range . }}{{template "format-flags" .}}{{end -}}
1515
{{- end -}}
1616

17-
# Format a shard name, making sure it starts and ends with [A-Za-z0-9].
18-
{{- define "format-shard-name" -}}
17+
# Clean a label, making sure it starts and ends with [A-Za-z0-9].
18+
# This is especially important for shard names, which can start or end with
19+
# '-' (like -80 or 80-), which would be an invalid kubernetes label.
20+
{{- define "clean-label" -}}
21+
{{- $replaced_label := . | replace "_" "-"}}
1922
{{- if hasPrefix "-" . -}}
20-
x{{.}}
23+
x{{$replaced_label}}
2124
{{- else if hasSuffix "-" . -}}
22-
{{.}}x
25+
{{$replaced_label}}x
2326
{{- else -}}
24-
{{.}}
27+
{{$replaced_label}}
2528
{{- end -}}
2629
{{- end -}}
2730

helm/vitess/templates/_vttablet.tpl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -176,9 +176,10 @@ volumes:
176176
{{- $tablet := index . 4 -}}
177177
{{- with $tablet.vttablet -}}
178178
{{- $0 := $.Values.vttablet -}}
179-
{{- $keyspaceClean := $keyspace.name | replace "_" "-" -}}
180-
{{- $shardClean := include "format-shard-name" $shard.name -}}
181-
{{- $setName := printf "%s-%s-%s" $keyspaceClean $shardClean $tablet.type | lower -}}
179+
{{- $cellClean := include "clean-label" $cell.name -}}
180+
{{- $keyspaceClean := include "clean-label" $keyspace.name -}}
181+
{{- $shardClean := include "clean-label" $shard.name -}}
182+
{{- $setName := printf "%s-%s-%s-%s" $cellClean $keyspaceClean $shardClean $tablet.type | lower -}}
182183
{{- $uid := "$(cat $VTDATAROOT/init/tablet-uid)" }}
183184
# vttablet StatefulSet
184185
apiVersion: apps/v1beta1
@@ -193,6 +194,7 @@ spec:
193194
labels:
194195
app: vitess
195196
component: vttablet
197+
cell: {{$cellClean | quote}}
196198
keyspace: {{$keyspace.name | quote}}
197199
shard: {{$shardClean | quote}}
198200
type: {{$tablet.type | quote}}
@@ -222,6 +224,7 @@ spec:
222224
{{- $cell := index . 1 -}}
223225
{{- $keyspace := index . 2 -}}
224226
{{- $shard := index . 3 -}}
227+
{{- $shardClean := include "clean-label" $shard.name -}}
225228
{{- $tablet := index . 4 -}}
226229
{{- $uid := index . 5 -}}
227230
{{- with $tablet.vttablet -}}
@@ -235,7 +238,7 @@ metadata:
235238
app: vitess
236239
component: vttablet
237240
keyspace: {{$keyspace.name | quote}}
238-
shard: {{$shard.name | quote}}
241+
shard: {{$shardClean | quote}}
239242
type: {{$tablet.type | quote}}
240243
annotations:
241244
pod.beta.kubernetes.io/init-containers: '[

test/cluster/k8s_environment.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ def restart_mysql_task(self, tablet_name, task_name, is_alloc=False):
184184
time.sleep(60)
185185

186186
# Create the pod again.
187-
os.system('cat %s | kubectl create -f -' % tmpfile.name)
187+
os.system('cat %s | kubectl create --namespace=%s -f -' % (
188+
tmpfile.name, self.cluster_name))
188189
while time.time() - start_time < 120:
189190
logging.info('Waiting for pod %s to be running', vttablet_pod_name)
190191
pod = subprocess.check_output(

test/cluster/sandbox/initial_reparent.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
import json
55
import logging
66
import optparse
7+
import sys
78
import time
89
from vtproto import topodata_pb2
910
from vttest import sharding_utils
10-
import sandbox_utils
1111
import vtctl_sandbox
1212

1313

@@ -23,24 +23,30 @@ def initial_reparent(keyspace, master_cell, num_shards, namespace, timeout_s):
2323
"""Performs the first reparent."""
2424
successfully_reparented = []
2525
master_tablets = {}
26+
start_time = time.time()
27+
logging.info('Finding tablets to reparent to.')
2628
while len(master_tablets) < num_shards:
29+
if time.time() - start_time > timeout_s:
30+
logging.error('Timed out waiting to find a replica tablet')
31+
return 1
32+
2733
for shard_name in sharding_utils.get_shard_names(num_shards):
28-
shard_name = sandbox_utils.fix_shard_name(shard_name)
34+
if shard_name in master_tablets:
35+
continue
2936
tablets = vtctl_sandbox.execute_vtctl_command(
30-
['ListShardTablets', '%s/%s' % (
31-
keyspace, sandbox_utils.fix_shard_name(shard_name))],
37+
['ListShardTablets', '%s/%s' % (keyspace, shard_name)],
3238
namespace=namespace)[0].split('\n')
3339
tablets = [x.split(' ') for x in tablets if x]
3440
potential_masters = [
3541
x[0] for x in tablets if x[3] == 'replica'
3642
and x[0].split('-')[0] == master_cell]
3743
if potential_masters:
3844
master_tablets[shard_name] = potential_masters[0]
45+
logging.info(
46+
'%s selected for shard %s', potential_masters[0], shard_name)
3947

40-
start_time = time.time()
4148
while time.time() - start_time < timeout_s:
4249
for shard_name in sharding_utils.get_shard_names(num_shards):
43-
shard_name = sandbox_utils.fix_shard_name(shard_name)
4450
master_tablet_id = master_tablets[shard_name]
4551
if is_master(master_tablet_id, namespace):
4652
logging.info('Tablet %s is the master of %s/%s.',
@@ -55,8 +61,10 @@ def initial_reparent(keyspace, master_cell, num_shards, namespace, timeout_s):
5561
master_tablet_id], namespace=namespace, timeout_s=5)
5662
if len(successfully_reparented) == num_shards:
5763
logging.info('Done with initial reparent.')
58-
return
59-
logging.fatal('Timed out waiting for initial reparent.')
64+
return 0
65+
66+
logging.error('Timed out waiting for initial reparent.')
67+
return 1
6068

6169

6270
def main():
@@ -73,9 +81,9 @@ def main():
7381
logging.getLogger().setLevel(logging.INFO)
7482

7583
options, _ = parser.parse_args()
76-
initial_reparent(options.keyspace, options.master_cell,
77-
options.shard_count, options.namespace,
78-
options.timeout)
84+
sys.exit(initial_reparent(options.keyspace, options.master_cell,
85+
options.shard_count, options.namespace,
86+
options.timeout))
7987

8088

8189
if __name__ == '__main__':

test/cluster/sandbox/sandbox_utils.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,6 @@
55
import random
66

77

8-
def fix_shard_name(shard_name):
9-
"""Kubernetes doesn't allow '-' in the beginning or end of attributes.
10-
11-
Instead, replace them with an x.
12-
13-
Example: -80 becomes x80, 80- becomes 80x.
14-
15-
Args:
16-
shard_name: string, A standard shard name (like -80).
17-
18-
Returns:
19-
A fixed shard name suitable for kubernetes (string).
20-
"""
21-
if shard_name.startswith('-'):
22-
return 'x%s' % shard_name[1:]
23-
if shard_name.endswith('-'):
24-
return '%sx' % shard_name[:-1]
25-
return shard_name
26-
27-
288
def create_log_file(log_dir, filename):
299
"""Create a log file.
3010

test/cluster/sandbox/subprocess_component.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ def start(self):
2828
self.log_dir, '%s.INFO' % self.name)
2929
errorfile = sandbox_utils.create_log_file(
3030
self.log_dir, '%s.ERROR' % self.name)
31-
subprocess.call(['./%s' % self.script] + script_args, stdout=infofile,
32-
stderr=errorfile)
31+
subprocess.check_call(
32+
['./%s' % self.script] + script_args, stdout=infofile,
33+
stderr=errorfile)
3334
logging.info('Done.')
3435
except subprocess.CalledProcessError as error:
3536
raise sandbox.SandboxError(
36-
'Subprocess %s returned errorcode %d, result %s.' % (
37-
self.script, error.returncode, error.output))
37+
'Subprocess %s returned errorcode %d, find log at %s.' % (
38+
self.script, error.returncode, errorfile.name))
3839
finally:
3940
if infofile:
4041
infofile.close()

test/cluster/sandbox/vitess_kubernetes_sandbox.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from vttest import sharding_utils
1212

1313
import sandbox
14-
import sandbox_utils
1514
import sandlet
1615
import subprocess_component
1716

@@ -92,7 +91,6 @@ def _generate_helm_keyspaces(self):
9291

9392
for shard_index, shard_name in enumerate(
9493
sharding_utils.get_shard_names(ks['shard_count'])):
95-
shard_name = sandbox_utils.fix_shard_name(shard_name)
9694
shard = dict(
9795
name=shard_name,
9896
tablets=[dict(
@@ -187,12 +185,15 @@ def _generate_helm_values_config(self):
187185
keyspaces=copy.deepcopy(keyspaces),
188186
)
189187
# Each tablet's UID must be unique, so increment the uidBase for tablets
190-
# by the cell epsilon value to ensure uniqueness. This logic will go away
191-
# once StatefulSet is available.
188+
# by the cell epsilon value to ensure uniqueness. Also convert the UID to
189+
# a string, or else the parser will attempt to parse UID as a float, which
190+
# causes issues when UIDs are large. This logic will go away once
191+
# StatefulSet is available.
192192
for keyspace in cell_dict['keyspaces']:
193193
for shard in keyspace['shards']:
194194
for tablets in shard['tablets']:
195-
tablets['uidBase'] += index * self.cell_epsilon
195+
tablets['uidBase'] = str(
196+
tablets['uidBase'] + index * self.cell_epsilon)
196197
yaml_values['topology']['cells'].append(cell_dict)
197198

198199
if index == 0:
@@ -214,24 +215,31 @@ def generate_helm_sandlet(self):
214215
helm_sandlet = sandlet.Sandlet('helm')
215216
helm_sandlet.components.add_component(kubernetes_components.HelmComponent(
216217
'helm', self.name, self._generate_helm_values_config()))
218+
219+
# Add a subprocess task to wait for all mysql instances to be healthy.
220+
tablet_count = 0
221+
for keyspace in self.app_options.keyspaces:
222+
tablet_count += (keyspace['shard_count'] * len(self.app_options.cells) * (
223+
keyspace['replica_count'] + keyspace['rdonly_count']))
224+
wait_for_mysql_subprocess = subprocess_component.Subprocess(
225+
'wait_for_mysql', self.name, 'wait_for_mysql.py',
226+
self.log_dir, namespace=self.name,
227+
cells=','.join(self.app_options.cells),
228+
tablet_count=tablet_count)
229+
wait_for_mysql_subprocess.dependencies = ['helm']
230+
helm_sandlet.components.add_component(wait_for_mysql_subprocess)
231+
232+
# Add a subprocess task for each keyspace to perform the initial reparent.
217233
for keyspace in self.app_options.keyspaces:
218234
name = keyspace['name']
219235
shard_count = keyspace['shard_count']
220-
wait_for_mysql_subprocess = subprocess_component.Subprocess(
221-
'wait_for_mysql_%s' % name, self.name, 'wait_for_mysql.py',
222-
self.log_dir, namespace=self.name,
223-
cells=','.join(self.app_options.cells),
224-
tablet_count=(shard_count * (
225-
keyspace['replica_count'] + keyspace['rdonly_count'])))
226-
wait_for_mysql_subprocess.dependencies = ['helm']
227236
initial_reparent_subprocess = subprocess_component.Subprocess(
228-
'initial_reparent_%s' % name, self.name,
237+
'initial_reparent_%s_%d' % (name, shard_count), self.name,
229238
'initial_reparent.py', self.log_dir, namespace=self.name,
230239
keyspace=name, shard_count=shard_count,
231240
master_cell=self.app_options.cells[0])
232241
initial_reparent_subprocess.dependencies = [
233242
wait_for_mysql_subprocess.name]
234-
helm_sandlet.components.add_component(wait_for_mysql_subprocess)
235243
helm_sandlet.components.add_component(initial_reparent_subprocess)
236244
self.sandlets.add_component(helm_sandlet)
237245

test/cluster/sandbox/wait_for_mysql.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import optparse
66
import re
7+
import sys
78
import time
89
import vtctl_sandbox
910

@@ -30,6 +31,8 @@ def main():
3031
parser.add_option('-c', '--cells', help='Comma separated list of cells')
3132
parser.add_option('-t', '--tablet_count',
3233
help='Total number of expected tablets', type=int)
34+
parser.add_option('-w', '--wait', help='Max wait time (s)', type=int,
35+
default=300)
3336
logging.getLogger().setLevel(logging.INFO)
3437

3538
options, _ = parser.parse_args()
@@ -42,17 +45,18 @@ def main():
4245

4346
# Do this in a loop as the output of ListAllTablets may not be parseable
4447
# until all tablets have been started.
45-
while time.time() - start_time < 300 and len(tablets) < options.tablet_count:
48+
while (time.time() - start_time < options.wait and
49+
len(tablets) < options.tablet_count):
4650
tablets = get_all_tablets(options.cells, options.namespace)
4751
logging.info('Expecting %d tablets, found %d tablets',
4852
options.tablet_count, len(tablets))
4953

5054
start_time = time.time()
51-
while time.time() - start_time < 300:
55+
while time.time() - start_time < options.wait:
5256
for tablet in [t for t in tablets if t not in good_tablets]:
5357
_, success = vtctl_sandbox.execute_vtctl_command(
5458
['ExecuteFetchAsDba', tablet, 'show databases'],
55-
namespace=options.namespace)
59+
namespace=options.namespace, timeout_s=1)
5660
if success:
5761
good_tablets.append(tablet)
5862
logging.info('%d of %d tablets healthy.', len(good_tablets), len(tablets))
@@ -62,6 +66,7 @@ def main():
6266
break
6367
else:
6468
logging.warn('Timed out waiting for tablets to be ready.')
69+
sys.exit(1)
6570

6671

6772
if __name__ == '__main__':

0 commit comments

Comments
 (0)