Skip to content
This repository has been archived by the owner on Feb 1, 2021. It is now read-only.

Commit

Permalink
Merge pull request #2436 from dongluochen/rescheduleWithOverlay
Browse files Browse the repository at this point in the history
Reschedule containers with global network
  • Loading branch information
dongluochen authored Aug 18, 2016
2 parents 25919db + c3b9a53 commit dcaaa55
Showing 1 changed file with 66 additions and 9 deletions.
75 changes: 66 additions & 9 deletions cluster/watchdog.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ package cluster

import (
"sync"
"time"

log "github.com/Sirupsen/logrus"
"github.com/docker/engine-api/types/network"
"golang.org/x/net/context"
)

// Watchdog listens to cluster events and handles container rescheduling
Expand Down Expand Up @@ -61,6 +64,7 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
defer w.Unlock()

log.Debugf("Node %s failed - rescheduling containers", e.ID)

for _, c := range e.Containers() {

// Skip containers which don't have an "on-node-failure" reschedule policy.
Expand All @@ -75,23 +79,76 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
// will abort because the name is already taken.
c.Engine.removeContainer(c)

newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
// keep track of all global networks this container is connected to
globalNetworks := make(map[string]*network.EndpointSettings)
// if the existing container has global network endpoints,
// they need to be removed with force option
// "docker network disconnect -f network containername" only takes containername
name := c.Info.Name
if len(name) == 0 || len(name) == 1 && name[0] == '/' {
log.Errorf("container %s has no name", c.ID)
continue
}
// cut the leading '/'
if name[0] == '/' {
name = name[1:]
}

if c.NetworkSettings != nil && len(c.NetworkSettings.Networks) > 0 {
// find an engine to do disconnect work
randomEngine, err := w.cluster.RANDOMENGINE()
if err != nil {
log.Errorf("Failed to find an engine to do network cleanup for container %s: %v", c.ID, err)
// add the container back, so we can retry later
c.Engine.AddContainer(c)
continue
}

clusterNetworks := w.cluster.Networks().Uniq()
for networkName, endpoint := range c.NetworkSettings.Networks {
net := clusterNetworks.Get(endpoint.NetworkID)
if net != nil && net.Scope == "global" {
// record the network; it should be reconstructed on the new container
globalNetworks[networkName] = endpoint
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
err = randomEngine.apiClient.NetworkDisconnect(ctx, networkName, name, true)
if err != nil {
// do not abort here as this endpoint might have been removed before
log.Warnf("Failed to remove network endpoint from old container %s: %v", name, err)
}
}
}
}

newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
if err != nil {
log.Errorf("Failed to reschedule container %s: %v", c.ID, err)
// add the container back, so we can retry later
c.Engine.AddContainer(c)
} else {
log.Infof("Rescheduled container %s from %s to %s as %s", c.ID, c.Engine.Name, newContainer.Engine.Name, newContainer.ID)
if c.Info.State.Running {
log.Infof("Container %s was running, starting container %s", c.ID, newContainer.ID)
if err := w.cluster.StartContainer(newContainer, nil); err != nil {
log.Errorf("Failed to start rescheduled container %s: %v", newContainer.ID, err)
}
continue
}

// Docker create command cannot create a container with multiple networks
// see https://github.com/docker/docker/issues/17750
// Add the global networks one by one
for networkName, endpoint := range globalNetworks {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
err = newContainer.Engine.apiClient.NetworkConnect(ctx, networkName, name, endpoint)
if err != nil {
log.Warnf("Failed to connect network %s to container %s: %v", networkName, name, err)
}
}
}

log.Infof("Rescheduled container %s from %s to %s as %s", c.ID, c.Engine.Name, newContainer.Engine.Name, newContainer.ID)
if c.Info.State.Running {
log.Infof("Container %s was running, starting container %s", c.ID, newContainer.ID)
if err := w.cluster.StartContainer(newContainer, nil); err != nil {
log.Errorf("Failed to start rescheduled container %s: %v", newContainer.ID, err)
}
}
}
}

// NewWatchdog creates a new watchdog
Expand Down

0 comments on commit dcaaa55

Please sign in to comment.