From 8cc9b6c28464be11a26b6be9e03bea485deb3797 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Tue, 12 Jan 2016 19:28:06 -0800 Subject: [PATCH] Add swarm container create retry option. Signed-off-by: Dong Chen --- cli/help.go | 1 + cluster/swarm/cluster.go | 18 +++++++++++++++++- .../nodemanagement/nodehealth.bats | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/cli/help.go b/cli/help.go index bf71388471..3650cf616f 100644 --- a/cli/help.go +++ b/cli/help.go @@ -44,6 +44,7 @@ Arguments: Options: {{range .Flags}}{{.}} {{end}}{{if (eq .Name "manage")}}{{printf "\t * swarm.overcommit=0.05\tovercommit to apply on resources"}} + {{printf "\t * swarm.createretry=0\tcontainer create retry count after initial failure"}} {{printf "\t * mesos.address=\taddress to bind on [$SWARM_MESOS_ADDRESS]"}} {{printf "\t * mesos.port=\tport to bind on [$SWARM_MESOS_PORT]"}} {{printf "\t * mesos.offertimeout=30s\ttimeout for offers [$SWARM_MESOS_OFFER_TIMEOUT]"}} diff --git a/cluster/swarm/cluster.go b/cluster/swarm/cluster.go index 96985b7807..19bbe4c392 100644 --- a/cluster/swarm/cluster.go +++ b/cluster/swarm/cluster.go @@ -59,6 +59,7 @@ type Cluster struct { overcommitRatio float64 engineOpts *cluster.EngineOpts + createRetry int64 TLSConfig *tls.Config } @@ -76,12 +77,20 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, discovery pendingContainers: make(map[string]*pendingContainer), overcommitRatio: 0.05, engineOpts: engineOptions, + createRetry: 0, } if val, ok := options.Float("swarm.overcommit", ""); ok { cluster.overcommitRatio = val } + if val, ok := options.Int("swarm.createretry", ""); ok { + if val < 0 { + log.Fatalf("swarm.createretry=%d is invalid", val) + } + cluster.createRetry = val + } + discoveryCh, errCh := cluster.discovery.Watch(nil) go cluster.monitorDiscovery(discoveryCh, errCh) go cluster.monitorPendingEngines() @@ -119,16 +128,23 @@ func (c *Cluster) generateUniqueID() string { func (c *Cluster) 
CreateContainer(config *cluster.ContainerConfig, name string, authConfig *dockerclient.AuthConfig) (*cluster.Container, error) { container, err := c.createContainer(config, name, false, authConfig) - // fails with image not found, then try to reschedule with image affinity if err != nil { + var retries int64 + // fails with image not found, then try to reschedule with image affinity bImageNotFoundError, _ := regexp.MatchString(`image \S* not found`, err.Error()) if bImageNotFoundError && !config.HaveNodeConstraint() { // Check if the image exists in the cluster // If exists, retry with a image affinity if c.Image(config.Image) != nil { container, err = c.createContainer(config, name, true, authConfig) + retries++ } } + + for ; retries < c.createRetry && err != nil; retries++ { + log.WithFields(log.Fields{"Name": "Swarm"}).Warnf("Failed to create container: %s, retrying", err) + container, err = c.createContainer(config, name, false, authConfig) + } } return container, err } diff --git a/test/integration/nodemanagement/nodehealth.bats b/test/integration/nodemanagement/nodehealth.bats index 7af906dafd..4e0402164e 100644 --- a/test/integration/nodemanagement/nodehealth.bats +++ b/test/integration/nodemanagement/nodehealth.bats @@ -55,3 +55,22 @@ function teardown() { [ "$status" -eq 0 ] } +@test "scheduler retry" { + # Start 2 engines and register them in the file. + start_docker 2 + # Start swarm and check it can reach both nodes + # refresh interval is 20s. 20 retries before marking it as unhealthy + swarm_manage --engine-refresh-min-interval "20s" --engine-refresh-max-interval "20s" --engine-failure-retry 20 -cluster-opt swarm.createretry=1 "${HOSTS[0]},${HOSTS[1]}" + + eval "docker_swarm info | grep -q -i 'Nodes: 2'" + + # Use memory on node-0 + docker_swarm run -e constraint:node==node-0 -m 50m busybox sh + + # Stop node-1 + docker_host stop ${DOCKER_CONTAINERS[1]} + + # Try to run a container. 
It will first be scheduled on node-1; upon failure, Swarm automatically retries it on node-0 + run docker_swarm run -m 10m busybox sh + [ "$status" -eq 0 ] +}