diff --git a/core/app.go b/core/app.go index 807d1b38..3c3675ae 100644 --- a/core/app.go +++ b/core/app.go @@ -5,7 +5,6 @@ import ( "fmt" "os" "os/exec" - "os/signal" "sync" "syscall" "time" @@ -32,20 +31,20 @@ var ( // after it is run, it can be reloaded and paused with signals. type App struct { ServiceBackend discovery.ServiceBackend - Services []*services.Service - Backends []*backends.Backend - Tasks []*tasks.Task - Telemetry *telemetry.Telemetry - PreStartCmd *exec.Cmd - PreStopCmd *exec.Cmd - PostStopCmd *exec.Cmd - Command *exec.Cmd - StopTimeout int - QuitChannels []chan bool - maintModeLock *sync.RWMutex - signalLock *sync.RWMutex - paused bool - ConfigFlag string + Services []*services.Service + Backends []*backends.Backend + Tasks []*tasks.Task + Telemetry *telemetry.Telemetry + PreStartCmd *exec.Cmd + PreStopCmd *exec.Cmd + PostStopCmd *exec.Cmd + Command *exec.Cmd + StopTimeout int + QuitChannels []chan bool + maintModeLock *sync.RWMutex + signalLock *sync.RWMutex + paused bool + ConfigFlag string } // EmptyApp creates an empty application @@ -102,6 +101,7 @@ func NewApp(configFlag string) (*App, error) { a.Backends = cfg.Backends a.Tasks = cfg.Tasks a.Telemetry = cfg.Telemetry + a.ConfigFlag = configFlag return a, nil } @@ -228,7 +228,6 @@ func (a *App) Reload() error { a.stopPolling() a.forAllServices(deregisterService) - signal.Reset() a.load(newApp) return nil @@ -246,7 +245,6 @@ func (a *App) load(newApp *App) { } a.Telemetry = newApp.Telemetry a.Tasks = newApp.Tasks - a.handleSignals() a.handlePolling() } diff --git a/integration_tests/fixtures/app/Dockerfile b/integration_tests/fixtures/app/Dockerfile index 995d48d5..21d263e0 100644 --- a/integration_tests/fixtures/app/Dockerfile +++ b/integration_tests/fixtures/app/Dockerfile @@ -9,9 +9,12 @@ RUN npm install -g json http-server COPY build/containerpilot /bin/containerpilot COPY app-with-consul.json /app-with-consul.json +COPY app-with-consul-prestart-sigusr1.json /app-with-consul-prestart-sigusr1.json +COPY app-with-consul-prestart-sighup.json /app-with-consul-prestart-sighup.json COPY app-with-etcd.json /app-with-etcd.json COPY reload-app.sh /reload-app.sh COPY reload-app-etcd.sh /reload-app-etcd.sh +COPY reload-app-prestart.sh /reload-app-prestart.sh COPY sensor.sh /sensor.sh COPY task.sh /task.sh diff --git a/integration_tests/fixtures/app/app-with-consul-prestart-sighup.json b/integration_tests/fixtures/app/app-with-consul-prestart-sighup.json new file mode 100644 index 00000000..5b767cfc --- /dev/null +++ b/integration_tests/fixtures/app/app-with-consul-prestart-sighup.json @@ -0,0 +1,45 @@ +{ + "consul": "consul:8500", + "preStart": ["/reload-app-prestart.sh", "HUP"], + "logging": { + "level": "DEBUG", + "format": "text" + }, + "services": [ + { + "name": "app", + "port": 8000, + "health": "/usr/bin/curl --fail -s -o /dev/null http://localhost:8888", + "poll": 1, + "ttl": 5, + "tags": ["application"] + } + ], + "backends": [ + { + "name": "nginx", + "poll": 7, + "onChange": "/reload-app.sh" + }, + { + "name": "app", + "poll": 5, + "onChange": "/reload-app.sh", + "tag": "application" + } + ], + "telemetry": { + "port": 9090, + "sensors": [ + { + "namespace": "containerpilot", + "subsystem": "app", + "name": "some_counter", + "help": "help text", + "type": "counter", + "poll": 1, + "check": ["/sensor.sh", "count"] + } + ] + } +} diff --git a/integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json b/integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json new file mode 100644 index 00000000..fc7d9be9 --- /dev/null +++ b/integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json @@ -0,0 +1,45 @@ +{ + "consul": "consul:8500", + "preStart": ["/reload-app-prestart.sh", "USR1"], + "logging": { + "level": "DEBUG", + "format": "text" + }, + "services": [ + { + "name": "app", + "port": 8000, + "health": "/usr/bin/curl --fail -s -o /dev/null http://localhost:8000", + "poll": 1, + "ttl": 5, + "tags": ["application"] + } + ], + "backends": [ + { + "name": "nginx", + "poll": 7, + "onChange": "/reload-app.sh" + }, + { + "name": "app", + "poll": 5, + "onChange": "/reload-app.sh", + "tag": "application" + } + ], + "telemetry": { + "port": 9090, + "sensors": [ + { + "namespace": "containerpilot", + "subsystem": "app", + "name": "some_counter", + "help": "help text", + "type": "counter", + "poll": 1, + "check": ["/sensor.sh", "count"] + } + ] + } +} diff --git a/integration_tests/fixtures/app/reload-app-prestart.sh b/integration_tests/fixtures/app/reload-app-prestart.sh new file mode 100755 index 00000000..5578db92 --- /dev/null +++ b/integration_tests/fixtures/app/reload-app-prestart.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Sends a signal to ContainerPilot during the preStart + +# wait a few seconds for the Consul container to become available +n=0 +while true +do + if [ n == 10 ]; then + echo "Timed out waiting for Consul" + exit 1; + fi + curl -Ls --fail http://consul:8500/v1/status/leader | grep 8300 && break + n=$((n+1)) + sleep 1 +done + +if [[ ${1} == "HUP" ]]; then + # Change our config to actually pass the healthcheck + sed -i s/8888/8000/ /app-with-consul-prestart-sighup.json +fi + +kill -${1} 1 diff --git a/integration_tests/fixtures/app/reload-app.sh b/integration_tests/fixtures/app/reload-app.sh index 6a809a5b..8fe4343e 100755 --- a/integration_tests/fixtures/app/reload-app.sh +++ b/integration_tests/fixtures/app/reload-app.sh @@ -1,5 +1,18 @@ #!/bin/bash +# wait a few seconds for the Consul container to become available +n=0 +while true +do + if [ n == 10 ]; then + echo "Timed out waiting for Consul" + exit 1; + fi + curl -Ls --fail http://consul:8500/v1/status/leader | grep 8300 && break + n=$((n+1)) + sleep 1 +done + # get all the healthy application servers and write the json to file curl -s consul:8500/v1/health/service/app?passing | json > /tmp/lastQuery.json diff --git a/integration_tests/fixtures/test_probe/src/docker.go b/integration_tests/fixtures/test_probe/src/docker.go index 4446bc00..ffdc8c72 100644 --- a/integration_tests/fixtures/test_probe/src/docker.go +++ b/integration_tests/fixtures/test_probe/src/docker.go @@ -13,6 +13,7 @@ const ( SigChld = DockerSignal("CHLD") SigUsr1 = DockerSignal("USR1") SigHup = DockerSignal("HUP") + SigAbrt = DockerSignal("ABRT") // for debugging via stack trace dump ) // DockerProbe is a test probe for docker diff --git a/integration_tests/fixtures/test_probe/src/main.go b/integration_tests/fixtures/test_probe/src/main.go index 33528816..a799bf09 100644 --- a/integration_tests/fixtures/test_probe/src/main.go +++ b/integration_tests/fixtures/test_probe/src/main.go @@ -14,9 +14,11 @@ var AllTests map[string]TestCommand func runTest(testName string, args []string) { // Register Tests AllTests = map[string]TestCommand{ - "test_sigterm": TestSigterm, - "test_sighup_deadlock": TestSighupDeadlock, - "test_discovery": TestDiscovery, + "test_sigterm": TestSigterm, + "test_sighup_deadlock": TestSighupDeadlock, + "test_sigusr1_prestart": TestSigUsr1Prestart, + "test_sighup_prestart": TestSigHupPrestart, + "test_discovery": TestDiscovery, } if test := AllTests[testName]; test != nil { diff --git a/integration_tests/fixtures/test_probe/src/test_sighup_prestart.go b/integration_tests/fixtures/test_probe/src/test_sighup_prestart.go new file mode 100644 index 00000000..eb1cf8e1 --- /dev/null +++ b/integration_tests/fixtures/test_probe/src/test_sighup_prestart.go @@ -0,0 +1,25 @@ +package main + +import "log" + +// The health check in the containerpilot config is intentionally +// broken. The preStart script will fix the health check and then +// SIGHUP to perform a config reload. +func TestSigHupPrestart(args []string) bool { + if len(args) != 1 { + log.Println("TestSigHupPrestart requires 1 argument") + log.Println(" - containerID: docker container to kill") + return false + } + + consul, err := NewConsulProbe() + if err != nil { + log.Println(err) + } + // Wait for 1 healthy 'app' service to be registered with consul + if err = consul.WaitForServices("app", "", 1); err != nil { + log.Printf("Expected app to be healthy after SIGHUP: %s\n", err) + return false + } + return true +} diff --git a/integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go b/integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go new file mode 100644 index 00000000..74372fd6 --- /dev/null +++ b/integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go @@ -0,0 +1,34 @@ +package main + +import "log" + +func TestSigUsr1Prestart(args []string) bool { + if len(args) != 1 { + log.Println("TestSigUsr1Prestart requires 1 argument") + log.Println(" - containerID: docker container to kill") + return false + } + + docker, err := NewDockerProbe() + if err != nil { + log.Println(err) + } + + // Prestart will SIGUSR1 us into maintenance + // Send SIGUSR1 to get us back out of maintenance + if err = docker.SendSignal(args[0], SigUsr1); err != nil { + log.Println(err) + return false + } + + // Wait for app to be healthy + consul, err := NewConsulProbe() + if err != nil { + log.Println(err) + } + if err = consul.WaitForServices("app", "", 1); err != nil { + log.Printf("Expected app to be healthy after SIGUSR1: %s\n", err) + return false + } + return true +} diff --git a/integration_tests/tests/test_sighup_deadlock/docker-compose.yml b/integration_tests/tests/test_sighup_deadlock/docker-compose.yml index 612e5600..1a218c40 100644 --- a/integration_tests/tests/test_sighup_deadlock/docker-compose.yml +++ b/integration_tests/tests/test_sighup_deadlock/docker-compose.yml @@ -8,6 +8,8 @@ app: mem_limit: 512m links: - consul:consul + volumes: + - '${CONTAINERPILOT_BIN}:/bin/containerpilot:ro' test: image: "cpfix_test_probe" diff --git a/integration_tests/tests/test_sighup_prestart/docker-compose.yml b/integration_tests/tests/test_sighup_prestart/docker-compose.yml new file mode 100644 index 00000000..32dfbe9e --- /dev/null +++ b/integration_tests/tests/test_sighup_prestart/docker-compose.yml @@ -0,0 +1,23 @@ +consul: + image: "cpfix_consul" + mem_limit: 256m + hostname: consul + +app: + image: "cpfix_app" + environment: + - CONTAINERPILOT=file:///app-with-consul-prestart-sighup.json + mem_limit: 512m + links: + - consul:consul + volumes: + - '${CONTAINERPILOT_BIN}:/bin/containerpilot:ro' + +test: + image: "cpfix_test_probe" + mem_limit: 128m + links: + - consul:consul + - app:app + volumes: + - '/var/run/docker.sock:/var/run/docker.sock' diff --git a/integration_tests/tests/test_sighup_prestart/run.sh b/integration_tests/tests/test_sighup_prestart/run.sh new file mode 100755 index 00000000..c1de0847 --- /dev/null +++ b/integration_tests/tests/test_sighup_prestart/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +docker-compose up -d consul app +APP_ID="$(docker-compose ps -q app)" +docker-compose run --no-deps test /go/bin/test_probe test_sighup_prestart $APP_ID > /dev/null 2>&1 +result=$? +TEST_ID=$(docker ps -l -f "ancestor=cpfix_test_probe" --format="{{.ID}}") +if [ $result -ne 0 ]; then + echo "==== TEST LOGS ====" + docker logs $TEST_ID + echo "==== APP LOGS ====" + docker logs $APP_ID +fi +docker rm -f $TEST_ID > /dev/null 2>&1 +exit $result diff --git a/telemetry/telemetry.go b/telemetry/telemetry.go index 695effde..80256e34 100644 --- a/telemetry/telemetry.go +++ b/telemetry/telemetry.go @@ -25,7 +25,6 @@ type Telemetry struct { Poll int mux *http.ServeMux lock sync.RWMutex - listen net.Listener addr net.TCPAddr listening bool } @@ -65,32 +64,35 @@ func NewTelemetry(raw interface{}) (*Telemetry, error) { return t, nil } +var listener net.Listener + // Serve starts serving the telemetry service func (t *Telemetry) Serve() { t.lock.Lock() defer t.lock.Unlock() - if t.listening { + + // No-op if we've created the server previously. + // TODO: golang's native implementation of http.Server.Server() cannot + // support graceful reload. We need to select an alternate implementation + // but in the meantime we need to back-out the change to reloading + // ref https://github.com/joyent/containerpilot/pull/165 + if listener != nil { return } ln, err := net.Listen(t.addr.Network(), t.addr.String()) if err != nil { - log.Errorf("Error serving telemetry on %s: %v", t.addr.String(), err) + log.Fatalf("Error serving telemetry on %s: %v", t.addr.String(), err) } - t.listen = ln + listener = ln t.listening = true go func() { - log.Debugf("telemetry: Listening on %s\n", t.addr.String()) - log.Fatal(http.Serve(t.listen, t.mux)) - log.Debugf("telemetry: Stopped listening on %s\n", t.addr.String()) + log.Infof("telemetry: Listening on %s", t.addr.String()) + log.Fatal(http.Serve(listener, t.mux)) + log.Debugf("telemetry: Stopped listening on %s", t.addr.String()) }() } // Shutdown shuts down the telemetry service func (t *Telemetry) Shutdown() { - t.lock.Lock() - defer t.lock.Unlock() - if t.listening { - t.listen.Close() - t.listening = false - } + log.Debug("telemetry: shutdown received but currently a no-op") } diff --git a/telemetry/telemetry_test.go b/telemetry/telemetry_test.go index 1c51881a..ce1341ce 100644 --- a/telemetry/telemetry_test.go +++ b/telemetry/telemetry_test.go @@ -2,15 +2,15 @@ package telemetry import ( "encoding/json" + "fmt" + "net/http" "strings" "testing" "github.com/prometheus/client_golang/prometheus" ) -func TestTelemetryParse(t *testing.T) { - - jsonFragment := []byte(`{ +var jsonFragment = []byte(`{ "port": 8000, "interfaces": ["eth0"], "sensors": [ @@ -26,6 +26,7 @@ func TestTelemetryParse(t *testing.T) { ] }`) +func TestTelemetryParse(t *testing.T) { if telem, err := NewTelemetry(decodeJSONRawTelemetry(t, jsonFragment)); err != nil { t.Fatalf("Could not parse telemetry JSON: %s", err) } else { @@ -66,3 +67,41 @@ func decodeJSONRawTelemetry(t *testing.T, testJSON json.RawMessage) interface{} } return raw } + +func TestTelemetryServerRestart(t *testing.T) { + if telem, err := NewTelemetry(decodeJSONRawTelemetry(t, jsonFragment)); err != nil { + t.Fatalf("Could not parse telemetry JSON: %s", err) + } else { + // initial server + telem.Serve() + checkServerIsListening(t, telem) + telem.Shutdown() + + // reloaded server + telem, err := NewTelemetry(decodeJSONRawTelemetry(t, jsonFragment)) + if err != nil { + t.Fatalf("Could not parse telemetry JSON: %s", err) + } + telem.Serve() + checkServerIsListening(t, telem) + } +} + +func checkServerIsListening(t *testing.T, telem *Telemetry) { + telem.lock.RLock() + defer telem.lock.RUnlock() + verifyMetricsEndpointOk(t, telem) +} + +func verifyMetricsEndpointOk(t *testing.T, telem *Telemetry) { + url := fmt.Sprintf("http://%v:%v/metrics", telem.addr.IP, telem.addr.Port) + resp, err := http.Get(url) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + t.Fatalf("Got %v status from telemetry server", resp.StatusCode) + } + +}