From 8cc9b6c28464be11a26b6be9e03bea485deb3797 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Tue, 12 Jan 2016 19:28:06 -0800 Subject: [PATCH] Add swarm container create retry option. Signed-off-by: Dong Chen --- cli/help.go | 1 + cluster/swarm/cluster.go | 18 +++++++++++++++++- .../nodemanagement/nodehealth.bats | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/cli/help.go b/cli/help.go index bf71388471..3650cf616f 100644 --- a/cli/help.go +++ b/cli/help.go @@ -44,6 +44,7 @@ Arguments: Options: {{range .Flags}}{{.}} {{end}}{{if (eq .Name "manage")}}{{printf "\t * swarm.overcommit=0.05\tovercommit to apply on resources"}} + {{printf "\t * swarm.createretry=0\tcontainer create retry count after initial failure"}} {{printf "\t * mesos.address=\taddress to bind on [$SWARM_MESOS_ADDRESS]"}} {{printf "\t * mesos.port=\tport to bind on [$SWARM_MESOS_PORT]"}} {{printf "\t * mesos.offertimeout=30s\ttimeout for offers [$SWARM_MESOS_OFFER_TIMEOUT]"}} diff --git a/cluster/swarm/cluster.go b/cluster/swarm/cluster.go index 96985b7807..19bbe4c392 100644 --- a/cluster/swarm/cluster.go +++ b/cluster/swarm/cluster.go @@ -59,6 +59,7 @@ type Cluster struct { overcommitRatio float64 engineOpts *cluster.EngineOpts + createRetry int64 TLSConfig *tls.Config } @@ -76,12 +77,20 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, discovery pendingContainers: make(map[string]*pendingContainer), overcommitRatio: 0.05, engineOpts: engineOptions, + createRetry: 0, } if val, ok := options.Float("swarm.overcommit", ""); ok { cluster.overcommitRatio = val } + if val, ok := options.Int("swarm.createretry", ""); ok { + if val < 0 { + log.Fatalf("swarm.createretry=%d is invalid", val) + } + cluster.createRetry = val + } + discoveryCh, errCh := cluster.discovery.Watch(nil) go cluster.monitorDiscovery(discoveryCh, errCh) go cluster.monitorPendingEngines() @@ -119,16 +128,23 @@ func (c *Cluster) generateUniqueID() string { func (c *Cluster) 
CreateContainer(config *cluster.ContainerConfig, name string, authConfig *dockerclient.AuthConfig) (*cluster.Container, error) { container, err := c.createContainer(config, name, false, authConfig) - // fails with image not found, then try to reschedule with image affinity if err != nil { + var retries int64 + // fails with image not found, then try to reschedule with image affinity bImageNotFoundError, _ := regexp.MatchString(`image \S* not found`, err.Error()) if bImageNotFoundError && !config.HaveNodeConstraint() { // Check if the image exists in the cluster // If exists, retry with a image affinity if c.Image(config.Image) != nil { container, err = c.createContainer(config, name, true, authConfig) + retries++ } } + + for ; retries < c.createRetry && err != nil; retries++ { + log.WithFields(log.Fields{"Name": "Swarm"}).Warnf("Failed to create container: %s, retrying", err) + container, err = c.createContainer(config, name, false, authConfig) + } } return container, err } diff --git a/test/integration/nodemanagement/nodehealth.bats b/test/integration/nodemanagement/nodehealth.bats index 7af906dafd..4e0402164e 100644 --- a/test/integration/nodemanagement/nodehealth.bats +++ b/test/integration/nodemanagement/nodehealth.bats @@ -55,3 +55,22 @@ function teardown() { [ "$status" -eq 0 ] } +@test "scheduler retry" { + # Start 2 engines and register them in the file. + start_docker 2 + # Start swarm and check it can reach both nodes + # refresh interval is 20s. 20 retries before marking it as unhealthy + swarm_manage --engine-refresh-min-interval "20s" --engine-refresh-max-interval "20s" --engine-failure-retry 20 -cluster-opt swarm.createretry=1 "${HOSTS[0]},${HOSTS[1]}" + + eval "docker_swarm info | grep -q -i 'Nodes: 2'" + + # Use memory on node-0 + docker_swarm run -e constraint:node==node-0 -m 50m busybox sh + + # Stop node-1 + docker_host stop ${DOCKER_CONTAINERS[1]} + + # Try to run a container. 
It will first be scheduled on node-1; upon failure, Swarm automatically retries it on node-0 + run docker_swarm run -m 10m busybox sh + [ "$status" -eq 0 ] +}