Skip to content

Commit

Permalink
Add instructions to remote debug the Driver pods
Browse files Browse the repository at this point in the history
This makes the driver command configurable so that Delve can be used to
execute the driver binary and adds Make targets to build the Driver
image for debugging.

Signed-off-by: mprahl <[email protected]>
  • Loading branch information
mprahl committed Jan 9, 2025
1 parent d21fca6 commit 0f6cc5b
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,6 @@ __pycache__

# kfp local execution default directory
local_outputs/

# Ignore debug Driver Dockerfile produced from `make -C backend image_driver_debug`
backend/Dockerfile.driver-debug
6 changes: 4 additions & 2 deletions backend/Dockerfile.driver
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM golang:1.21.7-alpine3.19 as builder
FROM golang:1.21.7-alpine3.19 AS builder

ARG GCFLAGS=""

WORKDIR /go/src/github.com/kubeflow/pipelines

Expand All @@ -25,7 +27,7 @@ RUN ./hack/install-go-licenses.sh

COPY . .

RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags netgo -ldflags '-extldflags "-static"' -o /bin/driver ./backend/src/v2/cmd/driver/*.go
RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags netgo -gcflags="${GCFLAGS}" -ldflags '-extldflags "-static"' -o /bin/driver ./backend/src/v2/cmd/driver/*.go

# Check licenses and comply with license terms.
# First, make sure there's no forbidden license.
Expand Down
17 changes: 17 additions & 0 deletions backend/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ image_visualization:
# Build the Driver container image from backend/Dockerfile.driver.
# The build runs from $(MOD_ROOT) with `.` as the build context.
.PHONY: image_driver
image_driver:
	cd $(MOD_ROOT) && $(CONTAINER_ENGINE) build -t $(IMG_TAG_DRIVER) -f backend/Dockerfile.driver .
# Build a debug variant of the Driver image, tagged ${IMG_TAG_DRIVER}:debug.
#
# Step 1 derives backend/Dockerfile.driver-debug from backend/Dockerfile.driver with sed:
#   - after the line matching "RUN .*go mod download", append a step that installs
#     Delve (dlv) with `go install`;
#   - after the line matching "COPY .*/bin/driver /bin/driver", append steps that copy
#     the build context to /go/src/github.com/kubeflow/pipelines, copy dlv from the
#     builder stage to /bin/dlv, and expose port 2345.
# Step 2 builds that generated Dockerfile, passing GCFLAGS="all=-N -l" as a build arg.
#
# NOTE(review): sed appends nothing when an address pattern does not match, so a
# change to either anchor line in Dockerfile.driver would silently drop the debug
# additions -- confirm the anchors whenever Dockerfile.driver is edited.
# NOTE(review): dlv is installed at @latest (unpinned); consider pinning a version
# known to build with the Go toolchain in the builder image.
.PHONY: image_driver_debug
image_driver_debug:
cd $(MOD_ROOT) && sed -e '/RUN .*go mod download/a\
RUN go install github.com/go-delve/delve/cmd/dlv@latest' \
-e '/COPY .*\/bin\/driver \/bin\/driver/a\
COPY . \/go\/src\/github.com\/kubeflow\/pipelines\
COPY --from=builder /go/bin/dlv /bin/dlv\
EXPOSE 2345' \
backend/Dockerfile.driver > backend/Dockerfile.driver-debug
cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build --build-arg GCFLAGS="all=-N -l" -t ${IMG_TAG_DRIVER}:debug -f backend/Dockerfile.driver-debug .
# Build the Launcher container image from backend/Dockerfile.launcher.
# The build runs from $(MOD_ROOT) with `.` as the build context.
.PHONY: image_launcher
image_launcher:
	cd $(MOD_ROOT) && $(CONTAINER_ENGINE) build -t $(IMG_TAG_LAUNCHER) -f backend/Dockerfile.launcher .
Expand All @@ -100,3 +110,10 @@ dev-kind-cluster:
kubectl apply -k $(CURDIR)/../manifests/kustomize/env/dev-kind
kubectl -n kubeflow wait --for condition=Available --timeout=10m deployment/mysql
kubectl -n kubeflow wait --for condition=Available --timeout=3m deployment/metadata-grpc-deployment

# Load the $(IMG_TAG_DRIVER):debug image into the Kind cluster named $(KIND_NAME).
# The image must already exist locally; this target does not build it.
.PHONY: kind-load-driver-debug
kind-load-driver-debug:
	kind --name $(KIND_NAME) load docker-image $(IMG_TAG_DRIVER):debug

# Convenience target with no recipe of its own: it only requires image_driver_debug
# and kind-load-driver-debug as prerequisites.
# NOTE(review): the build-before-load order relies on make processing prerequisites
# left to right; no explicit dependency between the two is declared, so the order is
# not guaranteed under `make -j` -- confirm before running in parallel.
.PHONY: kind-build-and-load-driver-debug
kind-build-and-load-driver-debug: image_driver_debug kind-load-driver-debug
106 changes: 106 additions & 0 deletions backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,109 @@ You can also directly connect to the MariaDB database server with:
```bash
mysql -h 127.0.0.1 -u root
```

## Remote Debug the Driver

These instructions assume you are leveraging the Kind cluster in the
[Run Locally With a Kind Cluster](#run-locally-with-a-kind-cluster) section.

### Build the Driver Image With Debug Prerequisites

Run the following to create the `backend/Dockerfile.driver-debug` file and build the container image
tagged as `kfp-driver:debug`. This container image is based on `backend/Dockerfile.driver` but installs
[Delve](https://github.com/go-delve/delve), builds the binary without compiler optimizations so the binary matches the
source code (via `GCFLAGS="all=-N -l"`), and copies the source code to the destination container for the debugger.
Any changes to the Driver code will require rebuilding this container image.

```bash
make -C backend image_driver_debug
```

Then load the container image in the Kind cluster.

```bash
make -C backend kind-load-driver-debug
```

Alternatively, you can use this Make target that does both.

```bash
make -C backend kind-build-and-load-driver-debug
```

### Run the API Server With Debug Configuration

You may use the following VS Code `launch.json` file to run the API server. It overrides the Driver
command to use Delve and the Driver image to use the debug image built previously.

```json
{
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Launch API server (Kind) (Debug Driver)",
      "type": "go",
      "request": "launch",
      "mode": "debug",
      "program": "${workspaceFolder}/backend/src/apiserver",
      "env": {
        "POD_NAMESPACE": "kubeflow",
        "DBCONFIG_MYSQLCONFIG_HOST": "localhost",
        "MINIO_SERVICE_SERVICE_HOST": "localhost",
        "MINIO_SERVICE_SERVICE_PORT": "9000",
        "METADATA_GRPC_SERVICE_SERVICE_HOST": "localhost",
        "METADATA_GRPC_SERVICE_SERVICE_PORT": "8080",
        "ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_HOST": "localhost",
        "ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_PORT": "8888",
        "V2_DRIVER_IMAGE": "kfp-driver:debug",
        "V2_DRIVER_COMMAND": "dlv exec --listen=:2345 --headless=true --api-version=2 --log /bin/driver --"
      }
    }
  ]
}
```

### Starting a Remote Debug Session

Start by launching a pipeline. This will eventually create a Driver pod that is waiting for a remote debug connection.

You can see the pods with the following command.

```bash
kubectl -n kubeflow get pods -w
```

Once you see a pod with `-driver` in the name such as `hello-world-clph9-system-dag-driver-10974850`, port forward
the Delve port in the pod to your localhost (replace `<driver pod name>` with the actual name).

```bash
kubectl -n kubeflow port-forward <driver pod name> 2345:2345
```

Set a breakpoint on the Driver code in VS Code. Then remotely connect to the Delve debug session with the following VS
Code `launch.json` file:

```json
{
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Connect to remote driver",
      "type": "go",
      "request": "attach",
      "mode": "remote",
      "remotePath": "/go/src/github.com/kubeflow/pipelines",
      "port": 2345,
      "host": "127.0.0.1"
    }
  ]
}
```

Once the Driver pod succeeds, the remote debug session will close. Then repeat the process of forwarding the port
of subsequent Driver pods and starting remote debug sessions in VS Code until the pipeline completes.

To debug a specific Driver pod, you'll need to repeatedly port forward and connect to the remote debug session
without a breakpoint so that Delve continues execution, until the Driver pod you are interested in starts up. At that
point, you can set a breakpoint, port forward, and connect to the remote debug session to debug that specific Driver
pod.
2 changes: 2 additions & 0 deletions backend/src/v2/compiler/argocompiler/argo.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ func Compile(jobArg *pipelinespec.PipelineJob, kubernetesSpecArg *pipelinespec.S
// TODO(chensun): release process and update the images.
launcherImage: GetLauncherImage(),
driverImage: GetDriverImage(),
driverCommand: GetDriverCommand(),
job: job,
spec: spec,
executors: deploy.GetExecutors(),
Expand Down Expand Up @@ -161,6 +162,7 @@ type workflowCompiler struct {
wf *wfapi.Workflow
templates map[string]*wfapi.Template
driverImage string
driverCommand []string
launcherImage string
}

Expand Down
12 changes: 11 additions & 1 deletion backend/src/v2/compiler/argocompiler/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
LauncherImageEnvVar = "V2_LAUNCHER_IMAGE"
DefaultDriverImage = "gcr.io/ml-pipeline/kfp-driver@sha256:dc8b56a2eb071f30409828a8884d621092e68385af11a6c06aa9e9fbcfbb19de"
DriverImageEnvVar = "V2_DRIVER_IMAGE"
DefaultDriverCommand = "driver"
DriverCommandEnvVar = "V2_DRIVER_COMMAND"
gcsScratchLocation = "/gcs"
gcsScratchName = "gcs-scratch"
s3ScratchLocation = "/s3"
Expand Down Expand Up @@ -91,6 +93,14 @@ func GetDriverImage() string {
return driverImage
}

// GetDriverCommand returns the command (argv) used to start the Driver
// container.
//
// The value is read from the environment variable named by
// DriverCommandEnvVar and split on whitespace, so a multi-word command such as
// "dlv exec --headless=true /bin/driver --" becomes one slice element per
// word. When the variable is unset, empty, or contains only whitespace, the
// single-element command []string{DefaultDriverCommand} is returned.
//
// No shell-style quoting is interpreted: an argument that itself contains
// whitespace cannot be expressed through this variable.
func GetDriverCommand() []string {
	// strings.Fields, unlike strings.Split(s, " "), ignores leading/trailing
	// whitespace and collapses runs of whitespace, so stray spaces in the
	// environment variable never produce empty argv elements.
	driverCommand := strings.Fields(os.Getenv(DriverCommandEnvVar))
	if len(driverCommand) == 0 {
		return []string{DefaultDriverCommand}
	}
	return driverCommand
}

func (c *workflowCompiler) containerDriverTask(name string, inputs containerDriverInputs) (*wfapi.DAGTask, *containerDriverOutputs) {
dagTask := &wfapi.DAGTask{
Name: name,
Expand Down Expand Up @@ -151,7 +161,7 @@ func (c *workflowCompiler) addContainerDriverTemplate() string {
},
Container: &k8score.Container{
Image: GetDriverImage(),
Command: []string{"driver"},
Command: GetDriverCommand(),
Args: []string{
"--type", "CONTAINER",
"--pipeline_name", c.spec.GetPipelineInfo().GetName(),
Expand Down
2 changes: 1 addition & 1 deletion backend/src/v2/compiler/argocompiler/dag.go
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ func (c *workflowCompiler) addDAGDriverTemplate() string {
},
Container: &k8score.Container{
Image: c.driverImage,
Command: []string{"driver"},
Command: c.driverCommand,
Args: []string{
"--type", inputValue(paramDriverType),
"--pipeline_name", c.spec.GetPipelineInfo().GetName(),
Expand Down

0 comments on commit 0f6cc5b

Please sign in to comment.