feat: Add CUDA connected components & track building #4015

Open · wants to merge 6 commits into main

Changes from 5 commits
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
@@ -159,7 +159,7 @@ test_exatrkx_unittests:
     - git checkout $HEAD_SHA
     - source CI/dependencies.sh
     - cd ..
-    - ctest --test-dir build -R ExaTrkX
+    - ctest --test-dir build -R "(ExaTrkX|CudaConnectedComponents)"

 test_exatrkx_python:
   stage: test
20 changes: 20 additions & 0 deletions Examples/Python/src/ExaTrkXTrackFinding.cpp
@@ -7,6 +7,7 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#include "Acts/Plugins/ExaTrkX/BoostTrackBuilding.hpp"
#include "Acts/Plugins/ExaTrkX/CudaTrackBuilding.hpp"
#include "Acts/Plugins/ExaTrkX/ExaTrkXPipeline.hpp"
#include "Acts/Plugins/ExaTrkX/OnnxEdgeClassifier.hpp"
#include "Acts/Plugins/ExaTrkX/OnnxMetricLearning.hpp"
@@ -112,6 +113,25 @@ void addExaTrkXTrackFinding(Context &ctx) {
}
#endif

#ifdef ACTS_EXATRKX_WITH_CUDA
  {
    using Alg = Acts::CudaTrackBuilding;
    using Config = Alg::Config;

    auto alg = py::class_<Alg, Acts::TrackBuildingBase, std::shared_ptr<Alg>>(
                   mex, "CudaTrackBuilding")
                   .def(py::init([](const Config &c, Logging::Level lvl) {
                          return std::make_shared<Alg>(
                              c, getDefaultLogger("TrackBuilding", lvl));
                        }),
                        "config"_a, "level"_a);

    auto c = py::class_<Config>(alg, "Config").def(py::init<>());
    ACTS_PYTHON_STRUCT_BEGIN(c, Config);
    ACTS_PYTHON_STRUCT_END();
  }
#endif

#ifdef ACTS_EXATRKX_ONNX_BACKEND
  {
    using Alg = Acts::OnnxMetricLearning;
1 change: 1 addition & 0 deletions Plugins/ExaTrkX/CMakeLists.txt
@@ -2,6 +2,7 @@ add_library(ActsPluginExaTrkX SHARED src/buildEdges.cpp src/ExaTrkXPipeline.cpp)

if(ACTS_EXATRKX_ENABLE_CUDA)
    target_compile_definitions(ActsPluginExaTrkX PUBLIC ACTS_EXATRKX_WITH_CUDA)
    target_sources(ActsPluginExaTrkX PRIVATE src/CudaTrackBuilding.cu)
endif()

if(ACTS_EXATRKX_ENABLE_ONNX)
44 changes: 44 additions & 0 deletions Plugins/ExaTrkX/include/Acts/Plugins/ExaTrkX/CudaTrackBuilding.hpp
@@ -0,0 +1,44 @@
// This file is part of the ACTS project.
//
// Copyright (C) 2016 CERN for the benefit of the ACTS project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#pragma once

#include "Acts/Plugins/ExaTrkX/Stages.hpp"
#include "Acts/Utilities/Logger.hpp"

#include <memory>

#include <torch/script.h>

namespace Acts {

class CudaTrackBuilding final : public Acts::TrackBuildingBase {
 public:
  struct Config {
    // nothing yet
  };

  CudaTrackBuilding(const Config &cfg, std::unique_ptr<const Logger> logger)
      : m_cfg(cfg),
        m_logger(std::move(logger)),
        m_device(torch::Device(torch::kCUDA)) {}

Comment on lines +26 to +29
💡 Codebase verification

⚠️ Potential issue

Hmmmm, wisdom in error handling I sense, but better approach we must take.

Follow the ways of TorchMetricLearning and TorchEdgeClassifier, you should. Device ID in configuration and ACTS_WARNING they use, yes.

-        m_device(torch::Device(torch::kCUDA)) {}
+        m_device([&]() {
+          if (cfg.deviceID >= 0) {
+            m_device = torch::Device(torch::kCUDA, cfg.deviceID);
+          } else {
+            ACTS_WARNING("GPU device " << cfg.deviceID << " not available");
+            throw std::runtime_error("Invalid CUDA device requested");
+          }
+          return m_device;
+        }()) {}
  • Add deviceID to Config struct, you must
  • Consistent with codebase patterns, this approach is
🔗 Analysis chain

Handle CUDA device initialization gracefully, young padawan must.

In constructor, catch CUDA errors you should, when device initialization fails it might. Use CUDA_CHECK macro from CudaUtils.cuh, wisdom this would be.

-        m_device(torch::Device(torch::kCUDA)) {}
+        m_device([&]() {
+          CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+          if (deviceCount == 0) {
+            throw std::runtime_error("No CUDA devices available");
+          }
+          return torch::Device(torch::kCUDA);
+        }()) {}

  std::vector<std::vector<int>> operator()(
      std::any nodes, std::any edges, std::any edge_weights,
      std::vector<int> &spacepointIDs,
      const ExecutionContext &execContext = {}) override;

  torch::Device device() const override { return m_device; };

 private:
  Config m_cfg;
  std::unique_ptr<const Acts::Logger> m_logger;
  torch::Device m_device;

  const auto &logger() const { return *m_logger; }
};

} // namespace Acts
173 changes: 173 additions & 0 deletions Plugins/ExaTrkX/include/Acts/Plugins/ExaTrkX/detail/ConnectedComponents.cuh
@@ -0,0 +1,173 @@
// This file is part of the ACTS project.
//
// Copyright (C) 2016 CERN for the benefit of the ACTS project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#pragma once

#include "Acts/Plugins/ExaTrkX/detail/CudaUtils.cuh"

#include <thrust/execution_policy.h>
#include <thrust/scan.h>

#include <cassert>

namespace Acts::detail {

template <typename T>
__device__ void swap(T &a, T &b) {
  T tmp = a;
  a = b;
  b = tmp;
}
Comment on lines +18 to +23

Member
I don't know what the type of TEdge is, but if it is big you might want to implement this using moves.

Member Author
Actually, since this is a __device__ function, is there any realistic scenario where an object used on device has a move constructor? I naively would not expect this, but I don't have a lot of experience here...
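For context on the move question above: a minimal sketch of a move-based device swap, assuming libcu++ and its <cuda/std/utility> header are available; the name swapMove is made up here and this is not part of the PR. For the trivially copyable integer labels used in this file it compiles to the same code as the copy-based swap.

#include <cuda/std/utility>

// Sketch only: a move-based swap usable in __device__ code via libcu++.
// For POD types (such as the integer edge/label types below) the moves
// degenerate to copies, so the copy-based version above is equivalent.
template <typename T>
__device__ void swapMove(T &a, T &b) {
  T tmp = cuda::std::move(a);
  a = cuda::std::move(b);
  b = cuda::std::move(tmp);
}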


/// Implementation of the FastSV algorithm as shown in
/// https://arxiv.org/abs/1910.05971
template <typename TEdge, typename TLabel>
__global__ void labelConnectedComponents(std::size_t numEdges,
                                         const TEdge *sourceEdges,
                                         const TEdge *targetEdges,
                                         std::size_t numNodes, TLabel *labels,
                                         TLabel *labelsNext) {
  // Currently this kernel works only with 1 block
  assert(gridDim.x == 1);

  for (std::size_t i = threadIdx.x; i < numNodes; i += blockDim.x) {
    labels[i] = i;
    labelsNext[i] = i;
  }

  __shared__ bool changed;

  do {
    changed = false;
    __syncthreads();

    // printf("Iteration %i\n", n);

    // Tree hooking for each edge
    for (std::size_t i = threadIdx.x; i < numEdges; i += blockDim.x) {
      auto u = sourceEdges[i];
      auto v = targetEdges[i];

      if (labels[u] == labels[labels[u]] && labels[v] < labels[u]) {
        labelsNext[labels[u]] = labels[v];
        changed = true;
        // printf("Edge (%i,%i): set labelsNext[%i] = labels[%i] = %i\n", u, v, labels[u], v, labels[v]);
      } else if (labels[v] == labels[labels[v]] && labels[u] < labels[v]) {
        labelsNext[labels[v]] = labels[u];
        changed = true;
        // printf("Edge (%i,%i): set labelsNext[%i] = labels[%i] = %i\n", u, v, labels[v], u, labels[u]);
      } else {
        // printf("Edge (%i,%i): no action\n", u, v);
      }
    }
    __syncthreads();

    for (std::size_t i = threadIdx.x; i < numNodes; i += blockDim.x) {
      labels[i] = labelsNext[i];
    }

    /*if(threadIdx.x == 0) {
      for(int i=0; i<numNodes; ++i) {
        printf("Vertex %i - label %i\n", i, labels[i]);
      }
    }*/

    // Shortcutting
    for (std::size_t i = threadIdx.x; i < numNodes; i += blockDim.x) {
      if (labels[i] != labels[labels[i]]) {
        labelsNext[i] = labels[labels[i]];
        // printf("Vertex %i: labelsNext[%i] = labels[%i] = %i\n", i, i, labels[i], labels[labels[i]]);
        changed = true;
      }
    }

    for (std::size_t i = threadIdx.x; i < numNodes; i += blockDim.x) {
      labels[i] = labelsNext[i];
    }

    /*if(threadIdx.x == 0) {
      for(int i=0; i<numNodes; ++i) {
        printf("Vertex after shortcutting %i - label %i\n", i, labels[i]);
      }
    }*/

    __syncthreads();
  } while (changed == true);
}
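As a reading aid (not part of the PR), the same hooking and shortcutting iteration written as a serial host function; the double buffering needed for thread safety in the kernel is dropped, and the name fastSvReference is made up for this sketch.

#include <cstddef>
#include <vector>

// Serial reference of the FastSV iteration above: hook the root of the
// larger label onto the smaller label, then shortcut every vertex to its
// grandparent, until nothing changes. Labels converge to the smallest
// node index of each connected component.
std::vector<int> fastSvReference(std::size_t numNodes,
                                 const std::vector<int> &src,
                                 const std::vector<int> &tgt) {
  std::vector<int> labels(numNodes);
  for (std::size_t i = 0; i < numNodes; ++i) {
    labels[i] = static_cast<int>(i);
  }

  bool changed = true;
  while (changed) {
    changed = false;

    // Tree hooking for each edge
    for (std::size_t e = 0; e < src.size(); ++e) {
      const int u = src[e];
      const int v = tgt[e];
      if (labels[u] == labels[labels[u]] && labels[v] < labels[u]) {
        labels[labels[u]] = labels[v];
        changed = true;
      } else if (labels[v] == labels[labels[v]] && labels[u] < labels[v]) {
        labels[labels[v]] = labels[u];
        changed = true;
      }
    }

    // Shortcutting
    for (std::size_t i = 0; i < numNodes; ++i) {
      if (labels[i] != labels[labels[i]]) {
        labels[i] = labels[labels[i]];
        changed = true;
      }
    }
  }
  return labels;
}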

template <typename T>
__global__ void makeLabelMask(std::size_t nLabels, const T *labels,
                              T *labelMask) {
  std::size_t i = threadIdx.x + blockDim.x * blockIdx.x;

  if (i >= nLabels) {
    return;
  }

  labelMask[labels[i]] = 1;
}

template <typename T>
__global__ void mapEdgeLabels(std::size_t nLabels, T *labels,
                              const T *mapping) {
  std::size_t i = threadIdx.x + blockDim.x * blockIdx.x;

  if (i >= nLabels) {
    return;
  }

  labels[i] = mapping[labels[i]];
}

template <typename TEdges, typename TLabel>
TLabel connectedComponentsCuda(std::size_t nEdges, const TEdges *sourceEdges,
                               const TEdges *targetEdges, std::size_t nNodes,
                               TLabel *labels, cudaStream_t stream) {
  TLabel *tmpMemory;
  ACTS_CUDA_CHECK(cudaMallocAsync(&tmpMemory, nNodes * sizeof(TLabel), stream));

  // Run in a single block, so that no inter-block synchronization is necessary
  dim3 blockDim = 1024;
  labelConnectedComponents<<<1, blockDim, 1, stream>>>(
      nEdges, sourceEdges, targetEdges, nNodes, labels, tmpMemory);
  ACTS_CUDA_CHECK(cudaGetLastError());

  // Assume we have the following components:
  // 0 3 5 3 0 0

  // Fill a mask which labels survived the connected components algorithm
  // 0 1 2 3 4 5
  // 1 0 0 1 0 1
  ACTS_CUDA_CHECK(
      cudaMemsetAsync(tmpMemory, 0, nNodes * sizeof(TLabel), stream));
  dim3 gridDim = (nNodes + blockDim.x - 1) / blockDim.x;
  makeLabelMask<<<gridDim, blockDim, 0, stream>>>(nNodes, labels, tmpMemory);
  ACTS_CUDA_CHECK(cudaGetLastError());

  // Exclusive prefix sum on the label mask
  // 0 1 2 3 4 5
  // 0 1 1 1 2 2
  thrust::exclusive_scan(thrust::device.on(stream), tmpMemory,
                         tmpMemory + nNodes, tmpMemory);

  // Remap edge labels with values in prefix sum
  // 0 -> 0, 3 -> 1, 5 -> 2
  mapEdgeLabels<<<gridDim, blockDim, 0, stream>>>(nNodes, labels, tmpMemory);
  ACTS_CUDA_CHECK(cudaGetLastError());

  TLabel nLabels;
  ACTS_CUDA_CHECK(cudaMemcpyAsync(&nLabels, &tmpMemory[nNodes - 1],
                                  sizeof(TLabel), cudaMemcpyDeviceToHost,
                                  stream));

  ACTS_CUDA_CHECK(cudaFreeAsync(tmpMemory, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  return nLabels;
}

} // namespace Acts::detail
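To make the mask, exclusive-scan and remap steps above concrete, here is a host-side walk-through of the same compaction (a sketch, not part of the PR; compactLabels is a made-up name and std::exclusive_scan requires C++17).

#include <cstddef>
#include <numeric>  // std::exclusive_scan
#include <vector>

// Host-side illustration of the compaction in connectedComponentsCuda:
// labels {0,3,5,3,0,0} -> mask {1,0,0,1,0,1} -> exclusive scan {0,1,1,1,2,2}
// -> remapped labels {0,1,2,1,0,0}, i.e. three distinct components.
std::vector<int> compactLabels(std::vector<int> labels) {
  const std::size_t n = labels.size();

  std::vector<int> mask(n, 0);
  for (std::size_t i = 0; i < n; ++i) {
    mask[labels[i]] = 1;  // makeLabelMask
  }

  std::vector<int> prefix(n, 0);
  std::exclusive_scan(mask.begin(), mask.end(), prefix.begin(), 0);

  for (std::size_t i = 0; i < n; ++i) {
    labels[i] = prefix[labels[i]];  // mapEdgeLabels
  }
  return labels;
}

Note that on this example the last element of the exclusive scan is 2 while there are three components; the scan's last element misses the highest label whenever that label survives the mask, which may be related to the unconditional numberLabels += 1 adjustment discussed further down.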
32 changes: 32 additions & 0 deletions Plugins/ExaTrkX/include/Acts/Plugins/ExaTrkX/detail/CudaUtils.cuh
@@ -0,0 +1,32 @@
// This file is part of the ACTS project.
//
// Copyright (C) 2016 CERN for the benefit of the ACTS project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#pragma once

#include <sstream>
#include <stdexcept>

#include <cuda_runtime_api.h>

namespace Acts::detail {

inline void cudaAssert(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    std::stringstream ss;
    ss << "CUDA error: " << cudaGetErrorString(code) << ", " << file << ":"
       << line;
    throw std::runtime_error(ss.str());
  }
  cudaDeviceSynchronize();
}

} // namespace Acts::detail

#define ACTS_CUDA_CHECK(ans) \
do { \
Acts::detail::cudaAssert((ans), __FILE__, __LINE__); \
} while (0)
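For illustration only (not part of the PR), a typical use of the macro, assuming this header is included; the function name and buffer size are arbitrary.

#include <cuda_runtime_api.h>

// Any call returning cudaError_t can be wrapped; on failure cudaAssert
// throws std::runtime_error carrying the file and line of the call site.
inline void exampleDeviceAllocation() {
  float *devBuf = nullptr;
  ACTS_CUDA_CHECK(cudaMalloc(&devBuf, 1024 * sizeof(float)));
  ACTS_CUDA_CHECK(cudaMemset(devBuf, 0, 1024 * sizeof(float)));
  ACTS_CUDA_CHECK(cudaFree(devBuf));
}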
71 changes: 71 additions & 0 deletions Plugins/ExaTrkX/src/CudaTrackBuilding.cu
@@ -0,0 +1,71 @@
// This file is part of the ACTS project.
//
// Copyright (C) 2016 CERN for the benefit of the ACTS project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#include "Acts/Plugins/ExaTrkX/CudaTrackBuilding.hpp"
#include "Acts/Plugins/ExaTrkX/detail/ConnectedComponents.cuh"
#include "Acts/Plugins/ExaTrkX/detail/CudaUtils.cuh"
#include "Acts/Utilities/Zip.hpp"

#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/torch.h>

namespace Acts {

std::vector<std::vector<int>> CudaTrackBuilding::operator()(
    std::any /*nodes*/, std::any edges, std::any weights,
    std::vector<int>& spacepointIDs, const ExecutionContext& execContext) {

Member
This vector can be const.

Member Author
I think there is some dumb reason that the ONNX runtime accepts only mutable pointers or so... Probably in that case it would be better to just copy the data, but I wouldn't touch it in this PR.

Member Author
Ah, but this is graph building... so indeed it could be const.

  ACTS_VERBOSE("Start CUDA track building");
  c10::cuda::CUDAStreamGuard guard(execContext.stream.value());

  const auto edgeTensor = std::any_cast<torch::Tensor>(edges).to(torch::kCUDA);
  assert(edgeTensor.size(0) == 2);

  const auto numSpacepoints = spacepointIDs.size();
  const auto numEdges = static_cast<std::size_t>(edgeTensor.size(1));

  if (numEdges == 0) {
    ACTS_WARNING("No edges remained after edge classification");
    return {};
  }

  auto stream = execContext.stream->stream();

  auto cudaSrcPtr = edgeTensor.data_ptr<std::int64_t>();
  auto cudaTgtPtr = edgeTensor.data_ptr<std::int64_t>() + numEdges;

  int* cudaLabels;
  ACTS_CUDA_CHECK(
      cudaMallocAsync(&cudaLabels, numSpacepoints * sizeof(int), stream));

  std::size_t numberLabels = detail::connectedComponentsCuda(
      numEdges, cudaSrcPtr, cudaTgtPtr, numSpacepoints, cudaLabels, stream);

  // TODO not sure why there is an issue that is not detected in the unit tests
  numberLabels += 1;
Comment on lines +49 to +50
⚠️ Potential issue

Investigate the TODO comment, you must.

An issue not detected in unit tests, the increment of 'numberLabels' suggests. Rather than adjusting manually, find and fix the root cause, we should.

Assist you, can I. Help investigate this issue or open a new GitHub issue, would you like?


  std::vector<int> trackLabels(numSpacepoints);
  ACTS_CUDA_CHECK(cudaMemcpyAsync(trackLabels.data(), cudaLabels,
                                  numSpacepoints * sizeof(int),
                                  cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaFreeAsync(cudaLabels, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_CUDA_CHECK(cudaGetLastError());

  ACTS_VERBOSE("Found " << numberLabels << " track candidates");

  std::vector<std::vector<int>> trackCandidates(numberLabels);

  for (const auto [label, id] : Acts::zip(trackLabels, spacepointIDs)) {
    trackCandidates[label].push_back(id);
  }

  return trackCandidates;
}

} // namespace Acts
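The zip-based grouping at the end of the function is equivalent to the plain index loop below (illustrative sketch; groupByLabel is a made-up name). For example, labels {0, 1, 0, 2} with spacepoint IDs {10, 11, 12, 13} yield the candidates {{10, 12}, {11}, {13}}.

#include <cstddef>
#include <vector>

// Spacepoints that received the same connected-component label end up
// in the same track candidate.
std::vector<std::vector<int>> groupByLabel(const std::vector<int> &labels,
                                           const std::vector<int> &ids,
                                           std::size_t numLabels) {
  std::vector<std::vector<int>> candidates(numLabels);
  for (std::size_t i = 0; i < labels.size(); ++i) {
    candidates[labels[i]].push_back(ids[i]);
  }
  return candidates;
}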
1 change: 1 addition & 0 deletions Tests/UnitTests/Plugins/ExaTrkX/CMakeLists.txt
@@ -4,3 +4,4 @@ add_unittest(ExaTrkXTensorConversion ExaTrkXTensorConversionTests.cpp)
add_unittest(ExaTrkXEdgeBuilding ExaTrkXEdgeBuildingTest.cpp)
add_unittest(ExaTrkXBoostTrackBuilding ExaTrkXBoostTrackBuildingTests.cpp)
add_unittest(ExaTrkXMetricHookTests ExaTrkXMetricHookTests.cpp)
add_unittest(ConnectedComponentsCuda ConnectedComponentCudaTests.cu)