
Commit 1225c01

cluster: fix the issue when endpoints are empty in scale-out (#1227)
1 parent 6b3d44f commit 1225c01

10 files changed

Lines changed: 112 additions & 20 deletions
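Summary of the fix: during scale-out, TiFlash's PrepareStart resolved PD endpoints from the incremental scale-out topology, which may declare no PD servers at all, so the PD client was handed an empty endpoint list (https://github.com/pingcap/tiup/issues/1226, exercised by the test change below). This commit threads the pre-scale-out "base" topology through the task context so PrepareStart can resolve PD endpoints from it, and makes tryURLs return an explicit error on an empty endpoint list instead of silently returning nothing.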


Makefile

Lines changed: 1 addition & 0 deletions
@@ -85,6 +85,7 @@ tidy:
 clean:
 	@rm -rf bin
 	@rm -rf cover
+	@rm -rf tests/*/{bin/*.test,logs,cover/*.out}

 test: failpoint-enable run-tests failpoint-disable

pkg/cluster/api/pdapi.go

Lines changed: 3 additions & 0 deletions
@@ -87,6 +87,9 @@ var (
 )

 func tryURLs(endpoints []string, f func(endpoint string) ([]byte, error)) ([]byte, error) {
+	if len(endpoints) == 0 {
+		return nil, errors.New("no endpoint available")
+	}
 	var err error
 	var bytes []byte
 	for _, endpoint := range endpoints {
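Why the guard matters: with zero endpoints the loop never runs, so the old code fell through and returned a nil slice and a nil error, giving callers no signal that PD was never even tried. A minimal standalone sketch; the loop body is an assumption reconstructed from the truncated hunk, not the verbatim original:

package main

import (
	"errors"
	"fmt"
)

func tryURLs(endpoints []string, f func(endpoint string) ([]byte, error)) ([]byte, error) {
	// the guard added by this commit: fail fast instead of returning (nil, nil)
	if len(endpoints) == 0 {
		return nil, errors.New("no endpoint available")
	}
	var err error
	var bytes []byte
	// assumed loop body: try each endpoint until one succeeds
	for _, endpoint := range endpoints {
		bytes, err = f(endpoint)
		if err == nil {
			return bytes, nil
		}
	}
	return bytes, err
}

func main() {
	// with an empty list, the old code would have printed two nil values
	b, err := tryURLs(nil, func(endpoint string) ([]byte, error) { return []byte("ok"), nil })
	fmt.Println(b, err) // [] no endpoint available
}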

pkg/cluster/ctxt/context.go

Lines changed: 5 additions & 0 deletions
@@ -28,6 +28,11 @@ const (
 	ctxKey = contextKey("TASK_CONTEXT")
 )

+const (
+	// CtxBaseTopo is the key used to store the base topology in a context.Context
+	CtxBaseTopo = contextKey("BASE_TOPO")
+)
+
 type (
 	// Executor is the executor interface for TiUP, all tasks will in the end
 	// be passed to an executor and then be actually performed.
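CtxBaseTopo reuses the package's unexported contextKey type (the same type behind ctxKey above). Declaring context keys as an unexported named type guarantees other packages cannot forge an equal key, so stored values cannot collide or be read accidentally. A self-contained sketch of the pattern, with a placeholder string standing in for the real topology value:

package main

import (
	"context"
	"fmt"
)

// contextKey mirrors the unexported key type in pkg/cluster/ctxt.
type contextKey string

// CtxBaseTopo is the key under which the base topology is stored.
const CtxBaseTopo = contextKey("BASE_TOPO")

func main() {
	ctx := context.WithValue(context.Background(), CtxBaseTopo, "base topology goes here")

	// A plain string is a different type, so it does not match the typed key:
	fmt.Println(ctx.Value("BASE_TOPO")) // <nil>
	// Only the typed key retrieves the value:
	fmt.Println(ctx.Value(CtxBaseTopo)) // base topology goes here
}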

pkg/cluster/manager/scale_out.go

Lines changed: 3 additions & 1 deletion
@@ -148,7 +148,9 @@ func (m *Manager) ScaleOut(
 		return err
 	}

-	if err := t.Execute(ctxt.New(context.Background())); err != nil {
+	ctx := ctxt.New(context.Background())
+	ctx = context.WithValue(ctx, ctxt.CtxBaseTopo, topo)
+	if err := t.Execute(ctx); err != nil {
 		if errorx.Cast(err) != nil {
 			// FIXME: Map possible task errors and give suggestions.
 			return err
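This producer side pairs with a consumer in tiflash.go further down: during scale-out the instance's own topology may declare no PD servers, so PrepareStart prefers the base topology from the context. A condensed sketch of the hand-off under simplified, hypothetical types (Topology here is just a list of PD addresses, not the real spec.Topology):

package main

import (
	"context"
	"fmt"
)

type contextKey string

const CtxBaseTopo = contextKey("BASE_TOPO")

// Topology is a stand-in for spec.Topology, reduced to PD addresses.
type Topology struct{ PDs []string }

// scaleOut attaches the pre-scale-out topology before running tasks,
// mirroring ctx = context.WithValue(ctx, ctxt.CtxBaseTopo, topo) above.
func scaleOut(base Topology, run func(context.Context) error) error {
	ctx := context.WithValue(context.Background(), CtxBaseTopo, base)
	return run(ctx)
}

// prepareStart prefers the base topology from the context (scale-out phase)
// and falls back to the instance's own topology (plain start phase).
func prepareStart(ctx context.Context, own Topology) Topology {
	if v := ctx.Value(CtxBaseTopo); v != nil {
		if base, ok := v.(Topology); ok {
			return base
		}
	}
	return own
}

func main() {
	incremental := Topology{} // scale-out topo: TiFlash only, no PD servers
	base := Topology{PDs: []string{"n3:2379", "n4:2379", "n5:2379"}}
	_ = scaleOut(base, func(ctx context.Context) error {
		// without the hand-off, this would be an empty endpoint list
		fmt.Println(prepareStart(ctx, incremental).PDs)
		return nil
	})
}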

pkg/cluster/operation/action.go

Lines changed: 1 addition & 1 deletion
@@ -470,7 +470,7 @@ func StartComponent(ctx context.Context, instances []spec.Instance, options Options
 		// of checkpoint context every time put it into a new goroutine.
 		nctx := checkpoint.NewContext(ctx)
 		errg.Go(func() error {
-			if err := ins.PrepareStart(tlsCfg); err != nil {
+			if err := ins.PrepareStart(nctx, tlsCfg); err != nil {
 				return err
 			}
 			err := startInstance(nctx, ins, options.OptTimeout)
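PrepareStart now receives nctx rather than no context at all; assuming checkpoint.NewContext derives its result from the ctx it is given (as the surrounding comment about checkpoint context suggests), values attached upstream, including ctxt.CtxBaseTopo from the scale-out path, remain readable inside instance implementations.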

pkg/cluster/spec/instance.go

Lines changed: 2 additions & 2 deletions
@@ -84,7 +84,7 @@ type Instance interface {
 	Ready(context.Context, ctxt.Executor, uint64) error
 	InitConfig(ctx context.Context, e ctxt.Executor, clusterName string, clusterVersion string, deployUser string, paths meta.DirPaths) error
 	ScaleConfig(ctx context.Context, e ctxt.Executor, topo Topology, clusterName string, clusterVersion string, deployUser string, paths meta.DirPaths) error
-	PrepareStart(tlsCfg *tls.Config) error
+	PrepareStart(ctx context.Context, tlsCfg *tls.Config) error
 	ComponentName() string
 	InstanceName() string
 	ServiceName() string
@@ -398,7 +398,7 @@ func (i *BaseInstance) SetPatched(p bool) {
 }

 // PrepareStart checks instance requirements before starting
-func (i *BaseInstance) PrepareStart(ctx context.Context, tlsCfg *tls.Config) error {
+func (i *BaseInstance) PrepareStart(ctx context.Context, tlsCfg *tls.Config) error {
	return nil
 }
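Passing ctx as the first parameter follows the standard Go convention and lets any Instance implementation consult request-scoped values such as ctxt.CtxBaseTopo; BaseInstance keeps the no-op default, so only TiFlash (below) needs real PrepareStart logic.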

pkg/cluster/spec/tiflash.go

Lines changed: 20 additions & 7 deletions
@@ -25,6 +25,7 @@ import (
 	"strings"
 	"time"

+	perrs "github.com/pingcap/errors"
 	"github.com/pingcap/tiup/pkg/cluster/api"
 	"github.com/pingcap/tiup/pkg/cluster/ctxt"
 	"github.com/pingcap/tiup/pkg/cluster/template/scripts"
@@ -526,7 +527,7 @@ func (i *TiFlashInstance) InitConfig(
 	}
 	tidbStatusStr := strings.Join(tidbStatusAddrs, ",")

-	pdStr := strings.Join(i.getEndpoints(), ",")
+	pdStr := strings.Join(i.getEndpoints(i.topo), ",")

 	cfg := scripts.NewTiFlashScript(
 		i.GetHost(),
@@ -652,25 +653,37 @@ type replicateConfig struct {
 	EnablePlacementRules string `json:"enable-placement-rules"`
 }

-func (i *TiFlashInstance) getEndpoints() []string {
+func (i *TiFlashInstance) getEndpoints(topo Topology) []string {
 	var endpoints []string
-	for _, pd := range i.topo.(*Specification).PDServers {
+	for _, pd := range topo.(*Specification).PDServers {
 		endpoints = append(endpoints, fmt.Sprintf("%s:%d", pd.Host, uint64(pd.ClientPort)))
 	}
 	return endpoints
 }

 // PrepareStart checks TiFlash requirements before starting
-func (i *TiFlashInstance) PrepareStart(tlsCfg *tls.Config) error {
-	endPoints := i.getEndpoints()
+func (i *TiFlashInstance) PrepareStart(ctx context.Context, tlsCfg *tls.Config) error {
 	// set enable-placement-rules to true via PDClient
-	pdClient := api.NewPDClient(endPoints, 10*time.Second, tlsCfg)
 	enablePlacementRules, err := json.Marshal(replicateConfig{
 		EnablePlacementRules: "true",
 	})
+	// this should not fail; if it does, exit
 	if err != nil {
-		return nil
+		return perrs.Annotate(err, "failed to marshal replicate config")
+	}
+
+	var topo Topology
+	if topoVal := ctx.Value(ctxt.CtxBaseTopo); topoVal != nil { // in scale-out phase
+		var ok bool
+		topo, ok = topoVal.(Topology)
+		if !ok {
+			return perrs.New("base topology in context is invalid")
+		}
+	} else { // in start phase
+		topo = i.topo
 	}

+	endpoints := i.getEndpoints(topo)
+	pdClient := api.NewPDClient(endpoints, 10*time.Second, tlsCfg)
 	return pdClient.UpdateReplicateConfig(bytes.NewBuffer(enablePlacementRules))
 }

tests/tiup-cluster/script/scale_tools.sh

Lines changed: 13 additions & 9 deletions
@@ -16,7 +16,7 @@ function scale_tools() {
 	if [ $test_tls = true ]; then
 		topo=./topo/full_tls.yaml
 	else
-		topo=./topo/full.yaml
+		topo=./topo/full_without_tiflash.yaml
 	fi

 	tiup-cluster $client --yes deploy $name $version $topo -i ~/.ssh/id_rsa
@@ -29,9 +29,6 @@ function scale_tools() {
 	for item in pump drainer tidb tikv pd grafana node_exporter blackbox_exporter; do
 		tiup-cluster $client exec $name -N n1 --command "grep $item /home/tidb/deploy/prometheus-9090/conf/prometheus.yml"
 	done
-	if [ $test_tls = false ]; then
-		tiup-cluster $client exec $name -N n1 --command "grep tiflash /home/tidb/deploy/prometheus-9090/conf/prometheus.yml"
-	fi

 	tiup-cluster $client list | grep "$name"

@@ -46,9 +43,9 @@ function scale_tools() {
 		total=19
 		total_add_one=20
 	else
-		total_sub_one=21
-		total=22
-		total_add_one=23
+		total_sub_one=20
+		total=21
+		total_add_one=22
 	fi

 	echo "start scale in pump"
@@ -95,6 +92,13 @@ function scale_tools() {
 	# currently tiflash is not supported in TLS enabled cluster
 	# and only TiFlash supports data-dir with multiple paths
 	if [ $test_tls = false ]; then
+		echo "start scale out tiflash (first time)"
+		topo=./topo/full_scale_in_tiflash.yaml
+		tiup-cluster $client --yes scale-out $name $topo
+		tiup-cluster $client exec $name -N n1 --command "grep tiflash /home/tidb/deploy/prometheus-9090/conf/prometheus.yml"
+		# ensure scale-out marks pd.enable-placement-rules as true. ref https://github.com/pingcap/tiup/issues/1226
+		curl n3:2379/pd/api/v1/config 2>/dev/null | grep '"enable-placement-rules": "true"'
+
 		# ensure tiflash's data dir exists
 		tiup-cluster $client exec $name -N n3 --command "ls /home/tidb/deploy/tiflash-9000/data1"
 		tiup-cluster $client exec $name -N n3 --command "ls /data/tiflash-data"
@@ -103,10 +107,10 @@ function scale_tools() {
 	tiup-cluster $client display $name | grep Tombstone
 	echo "start prune tiflash"
 	yes | tiup-cluster $client prune $name
-	wait_instance_num_reach $name $total_sub_one $native_ssh
+	wait_instance_num_reach $name $total $native_ssh
 	! tiup-cluster $client exec $name -N n3 --command "ls /home/tidb/deploy/tiflash-9000/data1"
 	! tiup-cluster $client exec $name -N n3 --command "ls /data/tiflash-data"
-	echo "start scale out tiflash"
+	echo "start scale out tiflash (second time)"
 	topo=./topo/full_scale_in_tiflash.yaml
 	tiup-cluster $client --yes scale-out $name $topo
 fi
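Note on the count changes: the non-TLS expectations drop by one (22 becomes 21, and so on) because TiFlash is no longer part of the initial deploy topology; it now joins via the first scale-out, which is exactly the empty-PD-endpoints path from issue #1226. Consequently, after pruning the Tombstone TiFlash node the cluster is back at $total instances rather than $total_sub_one.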
tests/tiup-cluster/topo/full_scale_in_tiflash.yaml

Lines changed: 1 addition & 0 deletions

@@ -1,2 +1,3 @@
 tiflash_servers:
   - host: n3
+    data_dir: "data1,/data/tiflash-data"
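The two comma-separated paths exercise TiFlash's multi-path data_dir support (as the test script's comment notes, only TiFlash supports this), and they match the paths the script asserts on: /home/tidb/deploy/tiflash-9000/data1 and /data/tiflash-data.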
tests/tiup-cluster/topo/full_without_tiflash.yaml

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+global:
+  user: tidb
+  group: pingcap
+
+server_configs:
+  tidb:
+    binlog.enable: true
+    binlog.ignore-error: false
+  tikv:
+    storage.reserve-space: 1K
+  pump:
+    storage.stop-write-at-available-space: 1 mib
+
+tidb_servers:
+  - host: n1
+  - host: n2
+
+pd_servers:
+  - host: n3
+  - host: n4
+  - host: n5
+
+# Note: if there are only 3 instances, when one is scaled in,
+# it may not become Tombstone.
+tikv_servers:
+  - host: n1
+  - host: n3
+    data_dir: "/home/tidb/my_kv_data"
+  - host: n4
+  - host: n5
+
+pump_servers:
+  - host: n3
+  - host: n4
+  - host: n5
+
+drainer_servers:
+  - host: n1
+    data_dir: /home/tidb/data/drainer-8249/data
+    commit_ts: -1
+    config:
+      syncer.db-type: "file"
+
+cdc_servers:
+  - host: n3
+  - host: n4
+  - host: n5
+
+tispark_masters:
+  - host: n3
+
+tispark_workers:
+  - host: n4
+
+monitoring_servers:
+  - host: n1
+    rule_dir: /tmp/local/prometheus
+grafana_servers:
+  - host: n1
+    dashboard_dir: /tmp/local/grafana
+alertmanager_servers:
+  - host: n1
+    config_file: /tmp/local/alertmanager/alertmanager.yml
