From 423c461f23fe10725fd4ebdb21b45410e9bceaf6 Mon Sep 17 00:00:00 2001 From: William Leese Date: Tue, 3 May 2016 10:01:01 +0200 Subject: [PATCH 1/3] Integration tests for receiving SIGHUP and SIGUSR1 during preStart --- core/app.go | 29 ++++++------ integration_tests/fixtures/app/Dockerfile | 3 ++ .../app/app-with-consul-prestart-sighup.json | 45 +++++++++++++++++++ .../app/app-with-consul-prestart-sigusr1.json | 45 +++++++++++++++++++ .../fixtures/app/reload-app-prestart.sh | 22 +++++++++ integration_tests/fixtures/app/reload-app.sh | 13 ++++++ .../fixtures/test_probe/src/main.go | 8 ++-- .../test_probe/src/test_sighup_prestart.go | 25 +++++++++++ .../test_probe/src/test_sigusr1_prestart.go | 34 ++++++++++++++ .../test_sighup_deadlock/docker-compose.yml | 2 + .../test_sighup_prestart/docker-compose.yml | 23 ++++++++++ .../tests/test_sighup_prestart/run.sh | 15 +++++++ 12 files changed, 247 insertions(+), 17 deletions(-) create mode 100644 integration_tests/fixtures/app/app-with-consul-prestart-sighup.json create mode 100644 integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json create mode 100755 integration_tests/fixtures/app/reload-app-prestart.sh create mode 100644 integration_tests/fixtures/test_probe/src/test_sighup_prestart.go create mode 100644 integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go create mode 100644 integration_tests/tests/test_sighup_prestart/docker-compose.yml create mode 100755 integration_tests/tests/test_sighup_prestart/run.sh diff --git a/core/app.go b/core/app.go index 807d1b38..eba64e9b 100644 --- a/core/app.go +++ b/core/app.go @@ -32,20 +32,20 @@ var ( // after it is run, it can be reloaded and paused with signals. type App struct { ServiceBackend discovery.ServiceBackend - Services []*services.Service - Backends []*backends.Backend - Tasks []*tasks.Task - Telemetry *telemetry.Telemetry - PreStartCmd *exec.Cmd - PreStopCmd *exec.Cmd - PostStopCmd *exec.Cmd - Command *exec.Cmd - StopTimeout int - QuitChannels []chan bool - maintModeLock *sync.RWMutex - signalLock *sync.RWMutex - paused bool - ConfigFlag string + Services []*services.Service + Backends []*backends.Backend + Tasks []*tasks.Task + Telemetry *telemetry.Telemetry + PreStartCmd *exec.Cmd + PreStopCmd *exec.Cmd + PostStopCmd *exec.Cmd + Command *exec.Cmd + StopTimeout int + QuitChannels []chan bool + maintModeLock *sync.RWMutex + signalLock *sync.RWMutex + paused bool + ConfigFlag string } // EmptyApp creates an empty application @@ -102,6 +102,7 @@ func NewApp(configFlag string) (*App, error) { a.Backends = cfg.Backends a.Tasks = cfg.Tasks a.Telemetry = cfg.Telemetry + a.ConfigFlag = configFlag return a, nil } diff --git a/integration_tests/fixtures/app/Dockerfile b/integration_tests/fixtures/app/Dockerfile index 995d48d5..21d263e0 100644 --- a/integration_tests/fixtures/app/Dockerfile +++ b/integration_tests/fixtures/app/Dockerfile @@ -9,9 +9,12 @@ RUN npm install -g json http-server COPY build/containerpilot /bin/containerpilot COPY app-with-consul.json /app-with-consul.json +COPY app-with-consul-prestart-sigusr1.json /app-with-consul-prestart-sigusr1.json +COPY app-with-consul-prestart-sighup.json /app-with-consul-prestart-sighup.json COPY app-with-etcd.json /app-with-etcd.json COPY reload-app.sh /reload-app.sh COPY reload-app-etcd.sh /reload-app-etcd.sh +COPY reload-app-prestart.sh /reload-app-prestart.sh COPY sensor.sh /sensor.sh COPY task.sh /task.sh diff --git a/integration_tests/fixtures/app/app-with-consul-prestart-sighup.json b/integration_tests/fixtures/app/app-with-consul-prestart-sighup.json new file mode 100644 index 00000000..5b767cfc --- /dev/null +++ b/integration_tests/fixtures/app/app-with-consul-prestart-sighup.json @@ -0,0 +1,45 @@ +{ + "consul": "consul:8500", + "preStart": ["/reload-app-prestart.sh", "HUP"], + "logging": { + "level": "DEBUG", + "format": "text" + }, + "services": [ + { + "name": "app", + "port": 8000, + "health": "/usr/bin/curl --fail -s -o /dev/null http://localhost:8888", + "poll": 1, + "ttl": 5, + "tags": ["application"] + } + ], + "backends": [ + { + "name": "nginx", + "poll": 7, + "onChange": "/reload-app.sh" + }, + { + "name": "app", + "poll": 5, + "onChange": "/reload-app.sh", + "tag": "application" + } + ], + "telemetry": { + "port": 9090, + "sensors": [ + { + "namespace": "containerpilot", + "subsystem": "app", + "name": "some_counter", + "help": "help text", + "type": "counter", + "poll": 1, + "check": ["/sensor.sh", "count"] + } + ] + } +} diff --git a/integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json b/integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json new file mode 100644 index 00000000..fc7d9be9 --- /dev/null +++ b/integration_tests/fixtures/app/app-with-consul-prestart-sigusr1.json @@ -0,0 +1,45 @@ +{ + "consul": "consul:8500", + "preStart": ["/reload-app-prestart.sh", "USR1"], + "logging": { + "level": "DEBUG", + "format": "text" + }, + "services": [ + { + "name": "app", + "port": 8000, + "health": "/usr/bin/curl --fail -s -o /dev/null http://localhost:8000", + "poll": 1, + "ttl": 5, + "tags": ["application"] + } + ], + "backends": [ + { + "name": "nginx", + "poll": 7, + "onChange": "/reload-app.sh" + }, + { + "name": "app", + "poll": 5, + "onChange": "/reload-app.sh", + "tag": "application" + } + ], + "telemetry": { + "port": 9090, + "sensors": [ + { + "namespace": "containerpilot", + "subsystem": "app", + "name": "some_counter", + "help": "help text", + "type": "counter", + "poll": 1, + "check": ["/sensor.sh", "count"] + } + ] + } +} diff --git a/integration_tests/fixtures/app/reload-app-prestart.sh b/integration_tests/fixtures/app/reload-app-prestart.sh new file mode 100755 index 00000000..5578db92 --- /dev/null +++ b/integration_tests/fixtures/app/reload-app-prestart.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Sends a signal to ContainerPilot during the preStart + +# wait a few seconds for the Consul container to become available +n=0 +while true +do + if [ n == 10 ]; then + echo "Timed out waiting for Consul" + exit 1; + fi + curl -Ls --fail http://consul:8500/v1/status/leader | grep 8300 && break + n=$((n+1)) + sleep 1 +done + +if [[ ${1} == "HUP" ]]; then + # Change our config to actually pass the healthcheck + sed -i s/8888/8000/ /app-with-consul-prestart-sighup.json +fi + +kill -${1} 1 diff --git a/integration_tests/fixtures/app/reload-app.sh b/integration_tests/fixtures/app/reload-app.sh index 6a809a5b..8fe4343e 100755 --- a/integration_tests/fixtures/app/reload-app.sh +++ b/integration_tests/fixtures/app/reload-app.sh @@ -1,5 +1,18 @@ #!/bin/bash +# wait a few seconds for the Consul container to become available +n=0 +while true +do + if [ n == 10 ]; then + echo "Timed out waiting for Consul" + exit 1; + fi + curl -Ls --fail http://consul:8500/v1/status/leader | grep 8300 && break + n=$((n+1)) + sleep 1 +done + # get all the healthy application servers and write the json to file curl -s consul:8500/v1/health/service/app?passing | json > /tmp/lastQuery.json diff --git a/integration_tests/fixtures/test_probe/src/main.go b/integration_tests/fixtures/test_probe/src/main.go index 33528816..a799bf09 100644 --- a/integration_tests/fixtures/test_probe/src/main.go +++ b/integration_tests/fixtures/test_probe/src/main.go @@ -14,9 +14,11 @@ var AllTests map[string]TestCommand func runTest(testName string, args []string) { // Register Tests AllTests = map[string]TestCommand{ - "test_sigterm": TestSigterm, - "test_sighup_deadlock": TestSighupDeadlock, - "test_discovery": TestDiscovery, + "test_sigterm": TestSigterm, + "test_sighup_deadlock": TestSighupDeadlock, + "test_sigusr1_prestart": TestSigUsr1Prestart, + "test_sighup_prestart": TestSigHupPrestart, + "test_discovery": TestDiscovery, } if test := AllTests[testName]; test != nil { diff --git a/integration_tests/fixtures/test_probe/src/test_sighup_prestart.go b/integration_tests/fixtures/test_probe/src/test_sighup_prestart.go new file mode 100644 index 00000000..eb1cf8e1 --- /dev/null +++ b/integration_tests/fixtures/test_probe/src/test_sighup_prestart.go @@ -0,0 +1,25 @@ +package main + +import "log" + +// The health check in the containerpilot config is intentionally +// broken. The preStart script will fix the health check and then +// SIGHUP to perform a config reload. +func TestSigHupPrestart(args []string) bool { + if len(args) != 1 { + log.Println("TestSigHupPrestart requires 1 argument") + log.Println(" - containerID: docker container to kill") + return false + } + + consul, err := NewConsulProbe() + if err != nil { + log.Println(err) + } + // Wait for 1 healthy 'app' service to be registered with consul + if err = consul.WaitForServices("app", "", 1); err != nil { + log.Printf("Expected app to be healthy after SIGHUP: %s\n", err) + return false + } + return true +} diff --git a/integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go b/integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go new file mode 100644 index 00000000..74372fd6 --- /dev/null +++ b/integration_tests/fixtures/test_probe/src/test_sigusr1_prestart.go @@ -0,0 +1,34 @@ +package main + +import "log" + +func TestSigUsr1Prestart(args []string) bool { + if len(args) != 1 { + log.Println("TestSigUsr1Prestart requires 1 argument") + log.Println(" - containerID: docker container to kill") + return false + } + + docker, err := NewDockerProbe() + if err != nil { + log.Println(err) + } + + // Prestart will SIGUSR1 us into maintenance + // Send SIGUSR1 to get us back out of maintenance + if err = docker.SendSignal(args[0], SigUsr1); err != nil { + log.Println(err) + return false + } + + // Wait for app to be healthy + consul, err := NewConsulProbe() + if err != nil { + log.Println(err) + } + if err = consul.WaitForServices("app", "", 1); err != nil { + log.Printf("Expected app to be healthy after SIGUSR1: %s\n", err) + return false + } + return true +} diff --git a/integration_tests/tests/test_sighup_deadlock/docker-compose.yml b/integration_tests/tests/test_sighup_deadlock/docker-compose.yml index 612e5600..1a218c40 100644 --- a/integration_tests/tests/test_sighup_deadlock/docker-compose.yml +++ b/integration_tests/tests/test_sighup_deadlock/docker-compose.yml @@ -8,6 +8,8 @@ app: mem_limit: 512m links: - consul:consul + volumes: + - '${CONTAINERPILOT_BIN}:/bin/containerpilot:ro' test: image: "cpfix_test_probe" diff --git a/integration_tests/tests/test_sighup_prestart/docker-compose.yml b/integration_tests/tests/test_sighup_prestart/docker-compose.yml new file mode 100644 index 00000000..32dfbe9e --- /dev/null +++ b/integration_tests/tests/test_sighup_prestart/docker-compose.yml @@ -0,0 +1,23 @@ +consul: + image: "cpfix_consul" + mem_limit: 256m + hostname: consul + +app: + image: "cpfix_app" + environment: + - CONTAINERPILOT=file:///app-with-consul-prestart-sighup.json + mem_limit: 512m + links: + - consul:consul + volumes: + - '${CONTAINERPILOT_BIN}:/bin/containerpilot:ro' + +test: + image: "cpfix_test_probe" + mem_limit: 128m + links: + - consul:consul + - app:app + volumes: + - '/var/run/docker.sock:/var/run/docker.sock' diff --git a/integration_tests/tests/test_sighup_prestart/run.sh b/integration_tests/tests/test_sighup_prestart/run.sh new file mode 100755 index 00000000..c1de0847 --- /dev/null +++ b/integration_tests/tests/test_sighup_prestart/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +docker-compose up -d consul app +APP_ID="$(docker-compose ps -q app)" +docker-compose run --no-deps test /go/bin/test_probe test_sighup_prestart $APP_ID > /dev/null 2>&1 +result=$? +TEST_ID=$(docker ps -l -f "ancestor=cpfix_test_probe" --format="{{.ID}}") +if [ $result -ne 0 ]; then + echo "==== TEST LOGS ====" + docker logs $TEST_ID + echo "==== APP LOGS ====" + docker logs $APP_ID +fi +docker rm -f $TEST_ID > /dev/null 2>&1 +exit $result From 8863602b51469643b708c6f178d6d3a783325eb9 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 27 May 2016 10:32:40 -0400 Subject: [PATCH 2/3] Back-out reload of telemetry server The native golang HTTP server cannot be gracefully reloaded. The telemetry server originally punted on reloading because this meant the only thing we couldn't change was the telemetry port. Although we made a go at it, we can't make the reload work so this commit backs out the change (but still uses the refactored code from PR #138) until we can find a better HTTP server lib to use. --- telemetry/telemetry.go | 28 ++++++++++++----------- telemetry/telemetry_test.go | 45 ++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/telemetry/telemetry.go b/telemetry/telemetry.go index 695effde..4614b86e 100644 --- a/telemetry/telemetry.go +++ b/telemetry/telemetry.go @@ -25,7 +25,6 @@ type Telemetry struct { Poll int mux *http.ServeMux lock sync.RWMutex - listen net.Listener addr net.TCPAddr listening bool } @@ -65,32 +64,35 @@ func NewTelemetry(raw interface{}) (*Telemetry, error) { return t, nil } +var listener net.Listener + // Serve starts serving the telemetry service func (t *Telemetry) Serve() { t.lock.Lock() defer t.lock.Unlock() - if t.listening { + + // No-op if we've created the server previously. + // TODO: golang's native implementation of http.Server.Server() cannot + // support graceful reload. We need to select an alternate implementation + // but in the meantime we need to back-out the change to reloading + // ref https://github.com/joyent/containerpilot/pull/165 + if listener != nil { return } ln, err := net.Listen(t.addr.Network(), t.addr.String()) if err != nil { - log.Errorf("Error serving telemetry on %s: %v", t.addr.String(), err) + log.Fatalf("Error serving telemetry on %s: %v", t.addr.String(), err) } - t.listen = ln + listener = ln t.listening = true go func() { - log.Debugf("telemetry: Listening on %s\n", t.addr.String()) - log.Fatal(http.Serve(t.listen, t.mux)) - log.Debugf("telemetry: Stopped listening on %s\n", t.addr.String()) + log.Infof("telemetry: Listening on %s", t.addr.String()) + log.Fatal(http.Serve(listener, t.mux)) + log.Infof("telemetry: Stopped listening on %s", t.addr.String()) }() } // Shutdown shuts down the telemetry service func (t *Telemetry) Shutdown() { - t.lock.Lock() - defer t.lock.Unlock() - if t.listening { - t.listen.Close() - t.listening = false - } + log.Debug("telemetry: shutdown received but currently a no-op") } diff --git a/telemetry/telemetry_test.go b/telemetry/telemetry_test.go index 1c51881a..ce1341ce 100644 --- a/telemetry/telemetry_test.go +++ b/telemetry/telemetry_test.go @@ -2,15 +2,15 @@ package telemetry import ( "encoding/json" + "fmt" + "net/http" "strings" "testing" "github.com/prometheus/client_golang/prometheus" ) -func TestTelemetryParse(t *testing.T) { - - jsonFragment := []byte(`{ +var jsonFragment = []byte(`{ "port": 8000, "interfaces": ["eth0"], "sensors": [ @@ -26,6 +26,7 @@ func TestTelemetryParse(t *testing.T) { ] }`) +func TestTelemetryParse(t *testing.T) { if telem, err := NewTelemetry(decodeJSONRawTelemetry(t, jsonFragment)); err != nil { t.Fatalf("Could not parse telemetry JSON: %s", err) } else { @@ -66,3 +67,41 @@ func decodeJSONRawTelemetry(t *testing.T, testJSON json.RawMessage) interface{} } return raw } + +func TestTelemetryServerRestart(t *testing.T) { + if telem, err := NewTelemetry(decodeJSONRawTelemetry(t, jsonFragment)); err != nil { + t.Fatalf("Could not parse telemetry JSON: %s", err) + } else { + // initial server + telem.Serve() + checkServerIsListening(t, telem) + telem.Shutdown() + + // reloaded server + telem, err := NewTelemetry(decodeJSONRawTelemetry(t, jsonFragment)) + if err != nil { + t.Fatalf("Could not parse telemetry JSON: %s", err) + } + telem.Serve() + checkServerIsListening(t, telem) + } +} + +func checkServerIsListening(t *testing.T, telem *Telemetry) { + telem.lock.RLock() + defer telem.lock.RUnlock() + verifyMetricsEndpointOk(t, telem) +} + +func verifyMetricsEndpointOk(t *testing.T, telem *Telemetry) { + url := fmt.Sprintf("http://%v:%v/metrics", telem.addr.IP, telem.addr.Port) + resp, err := http.Get(url) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + t.Fatalf("Got %v status from telemetry server", resp.StatusCode) + } + +} From 2da0999f44d795366587cae7d41a0bacb2768ad8 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 31 May 2016 09:11:36 -0400 Subject: [PATCH 3/3] Remove signal.Reset from config reload Resetting the signal handlers creates a race that we are having trouble resolving with creating deadlocks. Fortunately it turns out that it's unnecessary. When the config is reloaded, we keep the existing App instance and replace its values. This means we can safely keep the signal-handling channel around and just reuse it. --- core/app.go | 3 --- integration_tests/fixtures/test_probe/src/docker.go | 1 + telemetry/telemetry.go | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/core/app.go b/core/app.go index eba64e9b..3c3675ae 100644 --- a/core/app.go +++ b/core/app.go @@ -5,7 +5,6 @@ import ( "fmt" "os" "os/exec" - "os/signal" "sync" "syscall" "time" @@ -229,7 +228,6 @@ func (a *App) Reload() error { a.stopPolling() a.forAllServices(deregisterService) - signal.Reset() a.load(newApp) return nil @@ -247,7 +245,6 @@ func (a *App) load(newApp *App) { } a.Telemetry = newApp.Telemetry a.Tasks = newApp.Tasks - a.handleSignals() a.handlePolling() } diff --git a/integration_tests/fixtures/test_probe/src/docker.go b/integration_tests/fixtures/test_probe/src/docker.go index 4446bc00..ffdc8c72 100644 --- a/integration_tests/fixtures/test_probe/src/docker.go +++ b/integration_tests/fixtures/test_probe/src/docker.go @@ -13,6 +13,7 @@ const ( SigChld = DockerSignal("CHLD") SigUsr1 = DockerSignal("USR1") SigHup = DockerSignal("HUP") + SigAbrt = DockerSignal("ABRT") // for debugging via stack trace dump ) // DockerProbe is a test probe for docker diff --git a/telemetry/telemetry.go b/telemetry/telemetry.go index 4614b86e..80256e34 100644 --- a/telemetry/telemetry.go +++ b/telemetry/telemetry.go @@ -88,7 +88,7 @@ func (t *Telemetry) Serve() { go func() { log.Infof("telemetry: Listening on %s", t.addr.String()) log.Fatal(http.Serve(listener, t.mux)) - log.Infof("telemetry: Stopped listening on %s", t.addr.String()) + log.Debugf("telemetry: Stopped listening on %s", t.addr.String()) }() }