diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000..e005367822
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "integration/xgboost/encryption_plugins/cuda_plugin/CGBN"]
+	path = integration/xgboost/encryption_plugins/cuda_plugin/CGBN
+	url = https://github.com/NVlabs/CGBN.git
diff --git a/integration/xgboost/encryption_plugins/CMakeLists.txt b/integration/xgboost/encryption_plugins/CMakeLists.txt
index f5d71dd61c..f7701e5742 100644
--- a/integration/xgboost/encryption_plugins/CMakeLists.txt
+++ b/integration/xgboost/encryption_plugins/CMakeLists.txt
@@ -1,41 +1,32 @@
 cmake_minimum_required(VERSION 3.19)
-project(xgb_nvflare LANGUAGES CXX C VERSION 1.0)
+set(CMAKE_C_STANDARD 17)
 set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_BUILD_TYPE Debug)
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
+cmake_policy(VERSION ${CMAKE_VERSION})
+message(STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}")
+message(STATUS "CMAKE_VERSION: ${CMAKE_VERSION}")
 
-option(GOOGLE_TEST "Build google tests" OFF)
 
-file(GLOB_RECURSE LIB_SRC "src/*.cc")
-
-add_library(nvflare SHARED ${LIB_SRC})
-set_target_properties(nvflare PROPERTIES
-  CXX_STANDARD 17
-  CXX_STANDARD_REQUIRED ON
-  POSITION_INDEPENDENT_CODE ON
-  ENABLE_EXPORTS ON
-)
-target_include_directories(nvflare PRIVATE ${xgb_nvflare_SOURCE_DIR}/src/include)
-
-if (APPLE)
-    add_link_options("LINKER:-object_path_lto,$<TARGET_PROPERTY:NAME>_lto.o")
-    add_link_options("LINKER:-cache_path_lto,${CMAKE_BINARY_DIR}/LTOCache")
-endif ()
-
-#-- Unit Tests
-if(GOOGLE_TEST)
-    find_package(GTest REQUIRED)
-    enable_testing()
-    add_executable(nvflare_test)
-    target_link_libraries(nvflare_test PRIVATE nvflare)
-
-
-    target_include_directories(nvflare_test PRIVATE ${xgb_nvflare_SOURCE_DIR}/src/include)
+# this has to be set before project()
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 70)
+endif()
 
-    add_subdirectory(${xgb_nvflare_SOURCE_DIR}/tests)
+project(xgb_plugins LANGUAGES CUDA CXX VERSION 1.0)
+option(BUILD_CUDA_PLUGIN "Build CUDA plugin" ON)
+option(BUILD_NVFLARE_PLUGIN "Build NVFlare plugin" ON)
 
-    add_test(
-        NAME TestNvflarePlugins
-        COMMAND nvflare_test
-        WORKING_DIRECTORY ${xgb_nvflare_BINARY_DIR})
+if (BUILD_CUDA_PLUGIN)
+  add_subdirectory(cuda_plugin)
+else()
+  message(STATUS "Skipping CUDA plugin")
+endif()
 
+if (BUILD_NVFLARE_PLUGIN)
+  add_subdirectory(nvflare_plugin)
+else()
+  message(STATUS "Skipping NVFLARE plugin")
 endif()
+
diff --git a/integration/xgboost/encryption_plugins/README.md b/integration/xgboost/encryption_plugins/README.md
index 57f2c4621e..be92cf4b40 100644
--- a/integration/xgboost/encryption_plugins/README.md
+++ b/integration/xgboost/encryption_plugins/README.md
@@ -1,9 +1,23 @@
-# Build Instruction
+# XGBoost plugins
 
-cd NVFlare/integration/xgboost/encryption_plugins
+
+## Install required dependencies for CUDA plugin
+If you want to build CUDA plugin, you need to install the following libraries:
+Require `libgmp-dev`, CUDA runtime >= 12.1, CUDA driver >= 12.1, NVIDIA GPU Driver >= 535
+Compute Compatibility >= 7.0
+
+## Build instructions
+
+```
 mkdir build
 cd build
 cmake ..
 make
+```
+
+## Disable build of CUDA plugin
+You can pass option to cmake to disable the build of CUDA plugin if you don't have the environment:
+```
+cmake -DBUILD_CUDA_PLUGIN=OFF ..
+```
 
-The library is libxgb_nvflare.so 
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/CGBN b/integration/xgboost/encryption_plugins/cuda_plugin/CGBN
new file mode 160000
index 0000000000..e8b9d265c7
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/CGBN
@@ -0,0 +1 @@
+Subproject commit e8b9d265c7b84077d02340b0986f3c91b2eb02fb
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/CMakeLists.txt b/integration/xgboost/encryption_plugins/cuda_plugin/CMakeLists.txt
new file mode 100644
index 0000000000..c93780d4bf
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/CMakeLists.txt
@@ -0,0 +1,52 @@
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
+find_package(gmp REQUIRED)
+
+# Set NVCC compiler
+find_program(NVCC nvcc)
+if(NOT NVCC)
+    message(FATAL_ERROR "NVCC not found! Please make sure CUDA is installed.")
+endif()
+
+file(GLOB_RECURSE LIB_SRC
+	${CMAKE_SOURCE_DIR}/shared/dam/*.cc
+	${CMAKE_SOURCE_DIR}/shared/plugins/*.cc
+	${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc
+)
+message(STATUS "LIB_SRC files: ${LIB_SRC}")
+
+file(GLOB_RECURSE CUDA_SRC
+	${CMAKE_CURRENT_SOURCE_DIR}/CGBN/include/cgbn/cgbn.h
+	${CMAKE_CURRENT_SOURCE_DIR}/src/*.h
+)
+message(STATUS "CUDA_SRC files: ${CUDA_SRC}")
+
+set(TARGET_NAME cuda_paillier)
+
+set_source_files_properties(
+	${CMAKE_CURRENT_SOURCE_DIR}/src/delegated_plugin.cc
+	${CMAKE_CURRENT_SOURCE_DIR}/src/plugin_main.cc
+	${CUDA_SRC}
+	PROPERTIES LANGUAGE CUDA
+)
+
+add_library(${TARGET_NAME} SHARED ${LIB_SRC})
+set_target_properties(${TARGET_NAME}
+	PROPERTIES
+                CUDA_RUNTIME_LIBRARY Shared
+)
+set_target_properties(${TARGET_NAME}
+        PROPERTIES
+	LINKER_LANGUAGE CUDA
+)
+
+target_include_directories(${TARGET_NAME} PRIVATE
+	${CMAKE_SOURCE_DIR}/shared/include
+	${CMAKE_CURRENT_SOURCE_DIR}/src
+	${CMAKE_CURRENT_SOURCE_DIR}/CGBN/include/cgbn
+)
+
+target_compile_features(${TARGET_NAME} PRIVATE cuda_std_17)
+target_link_libraries(${TARGET_NAME} PRIVATE gmp::gmpc)
+target_link_libraries(${TARGET_NAME} PRIVATE gmp::gmpxx)
+target_link_libraries(${TARGET_NAME} PRIVATE gmp::gmp)
+
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/Findgmp.cmake b/integration/xgboost/encryption_plugins/cuda_plugin/Findgmp.cmake
new file mode 100644
index 0000000000..452fee2bba
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/Findgmp.cmake
@@ -0,0 +1,87 @@
+####################################################
+# Looking for main header file in standard locations
+find_path(gmpc_INCLUDE_DIR gmp.h)
+find_path(gmpxx_INCLUDE_DIR gmpxx.h)
+
+############################################
+# Looking for binaries in standard locations
+find_library(gmpc_LIBRARY NAMES gmp libgmp)
+find_library(gmpxx_LIBRARY NAMES gmpxx libgmpxx)
+
+######################################################################################
+# QUESTION: IS ALL THAT NECESSARY OR find_package_handle_standard_args DOES THE JOB? #
+######################################################################################
+IF (gmpc_INCLUDE_DIR STREQUAL "gmpc_INCLUDE_DIR-NOTFOUND")
+	MESSAGE(WARNING "GMP c headers not found")
+	SET(GMP_DETECTION_ERROR TRUE)
+ELSEIF(gmpxx_INCLUDE_DIR STREQUAL "gmpxx_INCLUDE_DIR-NOTFOUND")
+	MESSAGE(WARNING "GMP c++ headers not found")
+	SET(GMP_DETECTION_ERROR TRUE)
+ELSEIF(gmpc_LIBRARY STREQUAL "gmpc_LIBRARY-NOTFOUND")
+	MESSAGE(WARNING "GMP c library not found")
+	SET(GMP_DETECTION_ERROR TRUE)
+ELSEIF(gmpxx_LIBRARY STREQUAL "gmpxx_LIBRARY-NOTFOUND")
+	MESSAGE(WARNING "GMP c++ library not found")
+	SET(GMP_DETECTION_ERROR TRUE)
+ENDIF()
+
+IF (NOT GMP_DETECTION_ERROR)
+
+	mark_as_advanced(gmpc_INCLUDE_DIR gmpc_LIBRARY gmpxx_INCLUDE_DIR gmpxx_LIBRARY)
+
+	#############################
+	# Setting find_package output
+	#	gmp_FOUND
+	# Cache variables
+	#	gmp_INCLUDE_DIR
+	#	gmp_LIBRARY
+	# CMakeLists variables
+	#	gmp_INCLUDE_DIRS
+	#	gmp_LIBRARIES
+	include( FindPackageHandleStandardArgs )
+	find_package_handle_standard_args(gmp REQUIRED_VARS
+		gmpc_LIBRARY
+		gmpxx_LIBRARY
+		gmpc_INCLUDE_DIR
+		gmpxx_INCLUDE_DIR
+		)
+		
+	IF (gmp_FOUND)
+
+		SET(gmpc_LIBRARIES ${gmpc_LIBRARY})
+		SET(gmpc_INCLUDE_DIRS ${gmpc_INCLUDE_DIR})
+		SET(gmpxx_LIBRARIES ${gmpxx_LIBRARY})
+		SET(gmpxx_INCLUDE_DIRS ${gmpxx_INCLUDE_DIR})
+
+		##################################
+		# Setting gmp::gmp
+		IF (NOT TARGET gmp::gmpc)
+			add_library(gmp::gmpc UNKNOWN IMPORTED)
+			set_target_properties(gmp::gmpc PROPERTIES
+				IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+				IMPORTED_LOCATION "${gmpc_LIBRARY}"
+				INTERFACE_INCLUDE_DIRECTORIES "${gmpc_INCLUDE_DIR}"
+				)
+		ENDIF()
+		#SET(GMPC_TARGET "gmp::gmpc")
+		IF (NOT TARGET gmp::gmpxx)
+			add_library(gmp::gmpxx UNKNOWN IMPORTED)
+			set_target_properties(gmp::gmpxx PROPERTIES
+				IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+				IMPORTED_LOCATION "${gmpxx_LIBRARY}"
+				INTERFACE_INCLUDE_DIRECTORIES "${gmpxx_INCLUDE_DIR}"
+				)
+		ENDIF()
+		#SET(GMPXX_TARGET "gmp::gmpxx")
+		IF (NOT TARGET gmp::gmp)
+			add_library(gmp::gmp INTERFACE IMPORTED)
+			#SET(GMP_TARGET "${GMPC_TARGET};${GMPXX_TARGET}")
+			set_target_properties(gmp::gmp PROPERTIES
+				LINK_INTERFACE_LIBRARIES "gmp::gmpc;gmp::gmpxx"
+				IMPORTED_LOCATION "${gmpc_LIBRARY};${gmpxx_LIBRARY}")
+		ENDIF()
+
+	ENDIF()
+	
+ENDIF()
+
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/README.md b/integration/xgboost/encryption_plugins/cuda_plugin/README.md
new file mode 100644
index 0000000000..fd3354bba1
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/README.md
@@ -0,0 +1,4 @@
+# CUDA plugin
+
+Use CUDA to do paillier encryption and addition.
+
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h
new file mode 100755
index 0000000000..539e513f47
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUDA_PLUGIN_H
+#define CUDA_PLUGIN_H
+
+#pragma once
+#include <iostream>
+#include <vector>
+#include <map>
+#include "paillier.h"
+#include "base_plugin.h"
+#include "local_plugin.h"
+#include "endec.h"
+
+#define PRECISION 1e6
+
+namespace nvflare {
+
+// Define a structured header for the buffer
+struct BufferHeader {
+  bool has_key;
+  size_t key_size;
+  size_t rand_seed_size;
+};
+
+class CUDAPlugin: public LocalPlugin {
+  private:
+    PaillierCipher<bits>* paillier_cipher_ptr_ = nullptr;
+    CgbnPair* encrypted_gh_pairs_ = nullptr;
+    Endec* endec_ptr_ = nullptr;
+
+  public:
+    explicit CUDAPlugin(std::vector<std::pair<std::string_view, std::string_view>> const &args): LocalPlugin(args) {
+      bool fix_seed = get_bool(args, "fix_seed");
+      paillier_cipher_ptr_ = new PaillierCipher<bits>(bits/2, fix_seed, debug_);
+      encrypted_gh_pairs_ = nullptr;
+    }
+
+    ~CUDAPlugin() {
+      delete paillier_cipher_ptr_;
+      if (endec_ptr_ != nullptr) {
+        delete endec_ptr_;
+        endec_ptr_ = nullptr;
+      }
+    }
+
+    void setGHPairs() {
+      if (debug_) std::cout << "setGHPairs is called" << std::endl;
+      const std::uint8_t* pointer = encrypted_gh_.data();
+
+      // Retrieve header
+      BufferHeader header;
+      std::memcpy(&header, pointer, sizeof(BufferHeader));
+      pointer += sizeof(BufferHeader);
+
+      // Get key and n (if present)
+      cgbn_mem_t<bits>* key_ptr;
+      if (header.has_key) {
+        mpz_t n;
+        mpz_init(n);
+        key_ptr = (cgbn_mem_t<bits>* )malloc(header.key_size);
+        if (!key_ptr) {
+          std::cout << "bad alloc with key_ptr" << std::endl;
+          throw std::bad_alloc();
+        }
+        memcpy(key_ptr, pointer, header.key_size);
+        store2Gmp(n, key_ptr);
+        pointer += header.key_size;
+
+        if (header.rand_seed_size != sizeof(uint64_t)) {
+          free(key_ptr);
+          mpz_clear(n);
+          std::cout << "rand_seed_size " << header.rand_seed_size << " is wrong " << std::endl;
+          throw std::runtime_error("Invalid random seed size");
+        }
+        uint64_t rand_seed;
+        memcpy(&rand_seed, pointer, header.rand_seed_size);
+        pointer += header.rand_seed_size;
+
+        if (!paillier_cipher_ptr_->has_pub_key) {
+          paillier_cipher_ptr_->set_pub_key(n, rand_seed);
+        }
+        mpz_clear(n);
+        free(key_ptr);
+      }
+
+      // Access payload
+      std::vector<std::uint8_t> payload(pointer, pointer + (encrypted_gh_.size() - (pointer - encrypted_gh_.data())));
+
+      ck(cudaMalloc((void **)&encrypted_gh_pairs_, payload.size()));
+      cudaMemcpy(encrypted_gh_pairs_, payload.data(), payload.size(), cudaMemcpyHostToDevice);
+    }
+
+    void clearGHPairs() {
+      if (debug_) std::cout << "clearGHPairs is called" << std::endl;
+      if (encrypted_gh_pairs_) {
+        cudaFree(encrypted_gh_pairs_);
+        encrypted_gh_pairs_ = nullptr;
+      }
+    }
+
+    Buffer createBuffer(
+      bool has_key_flag,
+      cgbn_mem_t<bits>* key_ptr,
+      size_t key_size,
+      uint64_t rand_seed,
+      size_t rand_seed_size,
+      cgbn_mem_t<bits>* d_ciphers_ptr,
+      size_t payload_size
+    ) {
+        if (debug_) std::cout << "createBuffer is called" << std::endl;
+        // Calculate header size and total buffer size
+        size_t header_size = sizeof(BufferHeader);
+        size_t mem_size = header_size + key_size + rand_seed_size + payload_size;
+
+        // Allocate buffer
+        void* buffer = malloc(mem_size);
+        if (!buffer) {
+          std::cout << "bad alloc with buffer" << std::endl;
+          throw std::bad_alloc();
+        }
+
+        // Construct header
+        BufferHeader header;
+        header.has_key = has_key_flag;
+        header.key_size = key_size;
+        header.rand_seed_size = rand_seed_size;
+
+        // Copy header to buffer
+        memcpy(buffer, &header, header_size);
+
+        // Copy the key (if present)
+        if (has_key_flag) {
+          memcpy((char*)buffer + header_size, key_ptr, key_size);
+          memcpy((char*)buffer + header_size + key_size, &rand_seed, rand_seed_size);
+        }
+
+        // Copy the payload
+        cudaMemcpy((char*)buffer + header_size + key_size + rand_seed_size, d_ciphers_ptr, payload_size, cudaMemcpyDeviceToHost);
+
+        Buffer result(buffer, mem_size, true);
+
+        return result;
+    }
+
+    Buffer EncryptVector(const std::vector<double>& cleartext) override {
+      if (debug_) std::cout << "Calling EncryptVector with count " << cleartext.size() << std::endl;
+      if (endec_ptr_ != nullptr) {
+        delete endec_ptr_;
+      }
+      endec_ptr_ = new Endec(PRECISION);
+
+      size_t count = cleartext.size();
+      int byte_length = bits / 8;
+      size_t mem_size = sizeof(cgbn_mem_t<bits>) * count;
+      cgbn_mem_t<bits>* h_ptr=(cgbn_mem_t<bits>* )malloc(mem_size);
+      if (debug_) std::cout << "h_ptr size is " << mem_size << " indata size is " << count * byte_length << std::endl;
+      for (size_t i = 0; i < count; ++i) {
+        mpz_t n;
+        mpz_init(n);
+
+        endec_ptr_->encode(n, cleartext[i]);
+        store2Cgbn(h_ptr + i, n);
+
+        mpz_clear(n);
+      }
+
+      cgbn_mem_t<bits>* d_plains_ptr;
+      cgbn_mem_t<bits>* d_ciphers_ptr;
+      ck(cudaMalloc((void **)&d_plains_ptr, mem_size));
+      ck(cudaMalloc((void **)&d_ciphers_ptr, mem_size));
+      cudaMemcpy(d_plains_ptr, h_ptr, mem_size, cudaMemcpyHostToDevice);
+
+      if (!paillier_cipher_ptr_->has_prv_key) {
+#ifdef TIME
+      CudaTimer cuda_timer(0);
+      float gen_time=0;
+      cuda_timer.start();
+#endif
+        if (debug_) std::cout<<"Gen KeyPair with bits: " << bits << std::endl;
+        paillier_cipher_ptr_->genKeypair();
+#ifdef TIME
+      gen_time += cuda_timer.stop();
+      std::cout<<"Gen KeyPair Time "<< gen_time <<" MS"<<std::endl;
+#endif
+      }
+
+      paillier_cipher_ptr_->encrypt<TPI,TPB>(d_plains_ptr, d_ciphers_ptr, count);
+
+      // get pub_key n
+      mpz_t n;
+      mpz_init(n);
+      size_t key_size = sizeof(cgbn_mem_t<bits>);
+      paillier_cipher_ptr_->getN(n);
+      store2Cgbn(h_ptr, n);
+      mpz_clear(n);
+
+      // get rand_seed
+      size_t rand_seed_size = sizeof(uint64_t);
+      uint64_t rand_seed = paillier_cipher_ptr_->get_rand_seed();
+
+      Buffer result = createBuffer(true, h_ptr, key_size, rand_seed, rand_seed_size, d_ciphers_ptr, mem_size);
+
+      void* buffer = malloc(mem_size);
+      cudaMemcpy(buffer, d_ciphers_ptr, mem_size, cudaMemcpyDeviceToHost);
+      cudaFree(d_plains_ptr);
+      cudaFree(d_ciphers_ptr);
+      free(h_ptr);
+
+      //Buffer result(buffer, mem_size, true);
+
+      return result;
+    }
+
+    std::vector<double> DecryptVector(const std::vector<Buffer>& ciphertext) override {
+      if (debug_) std::cout << "Calling DecryptVector" << std::endl;
+      size_t mem_size = 0;
+      for (int i = 0; i < ciphertext.size(); ++i) {
+        mem_size += ciphertext[i].buf_size;
+        if (ciphertext[i].buf_size != 2 * sizeof(cgbn_mem_t<bits>)) {
+          std::cout << "buf_size is " << ciphertext[i].buf_size << std::endl;
+          std::cout << "expected buf_size is " << 2 * sizeof(cgbn_mem_t<bits>) << std::endl;
+          std::cout << "Fatal Error" << std::endl;
+        }
+      }
+
+      size_t count = mem_size / sizeof(cgbn_mem_t<bits>);
+      cgbn_mem_t<bits>* h_ptr=(cgbn_mem_t<bits>* )malloc(mem_size);
+      if (debug_) std::cout << "h_ptr size is " << mem_size << " how many gh is " << count << std::endl;
+      
+
+      cgbn_mem_t<bits>* d_plains_ptr;
+      cgbn_mem_t<bits>* d_ciphers_ptr;
+      ck(cudaMalloc((void **)&d_plains_ptr, mem_size));
+      ck(cudaMalloc((void **)&d_ciphers_ptr, mem_size));
+      
+      size_t offset = 0;
+      for (int i = 0; i < ciphertext.size(); ++i) {
+        cudaMemcpy(d_ciphers_ptr + offset, ciphertext[i].buffer, ciphertext[i].buf_size, cudaMemcpyHostToDevice);
+        offset += ciphertext[i].buf_size / sizeof(cgbn_mem_t<bits>);
+      }
+
+      if (!paillier_cipher_ptr_->has_prv_key) {
+        std::cout << "Can't call DecryptVector if paillier does not have private key." << std::endl;
+        throw std::runtime_error("Can't call DecryptVector if paillier does not have private key.");
+      }
+
+      paillier_cipher_ptr_->decrypt<TPI,TPB>(d_ciphers_ptr, d_plains_ptr, count);
+
+
+      cudaMemcpy(h_ptr, d_plains_ptr, mem_size, cudaMemcpyDeviceToHost);
+      std::vector<double> result;
+      for (size_t i = 0; i < count; ++i) {
+        mpz_t n;
+        mpz_init(n);
+        store2Gmp(n, h_ptr + i);
+        double output_num = endec_ptr_->decode(n);
+        result.push_back(output_num);
+        mpz_clear(n);
+      }
+      cudaFree(d_plains_ptr);
+      cudaFree(d_ciphers_ptr);
+      free(h_ptr);
+      return result;
+    }
+
+    std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>>& sample_ids) override{
+      if (debug_) std::cout << "Calling AddGHPairs with sample_ids size " << sample_ids.size() << std::endl;
+      if (!encrypted_gh_pairs_) {
+        setGHPairs();
+      }
+      std::map<int, Buffer> result;
+
+      CgbnPair* d_res_ptr;
+      size_t mem_size = sizeof(CgbnPair);
+      if (mem_size != 2*sizeof(cgbn_mem_t<bits>)) {
+        std::cout << "Fatal Error" << std::endl;
+      }
+      ck(cudaMalloc((void **)&d_res_ptr, mem_size));
+      cgbn_mem_t<bits>* d_plains_ptr;
+      ck(cudaMalloc((void **)&d_plains_ptr, mem_size));
+
+      if (!paillier_cipher_ptr_->has_pub_key) {
+        std::cout << "Can't call AddGHPairs if paillier does not have public key." << std::endl;
+        throw std::runtime_error("Can't call AddGHPairs if paillier does not have public key.");
+      }
+
+      // Iterate through the map
+      for (auto& pair : sample_ids) {
+          int key = pair.first;
+          const int* sample_id = pair.second.data();
+          int count = pair.second.size();
+
+          int* sample_id_d;
+          ck(cudaMalloc((void **)&sample_id_d, sizeof(int) * count));
+          cudaMemcpy(sample_id_d, sample_id, sizeof(int) * count, cudaMemcpyHostToDevice);
+
+          paillier_cipher_ptr_->sum<TPI,TPB>(d_res_ptr, encrypted_gh_pairs_, sample_id_d, count);
+
+          void* data = malloc(mem_size);
+          cudaMemcpy(data, d_res_ptr, mem_size, cudaMemcpyDeviceToHost);
+          Buffer buffer(data, mem_size, true);
+          result[key] = buffer; // Add the Buffer object to the result map
+          cudaFree(sample_id_d);
+      }
+      cudaFree(d_res_ptr);
+      cudaFree(d_plains_ptr);
+      if (debug_) std::cout << "Finish AddGHPairs" << std::endl;
+      if (encrypted_gh_pairs_) {
+          clearGHPairs();
+      }
+      return result;
+    }
+};
+} // namespace nvflare
+
+#endif // CUDA_PLUGIN_H
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_utils.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_utils.h
new file mode 100755
index 0000000000..f0911a664e
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_utils.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUDA_UTILS_H
+#define CUDA_UTILS_H
+
+#pragma once
+
+#include <getopt.h>
+#include <iostream>
+#include <fstream>
+#include <unistd.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sstream>
+#include <vector>
+#include <math.h>
+#include <string>
+#include <gmp.h>
+#include "cgbn.h"
+#include <cstdlib> // For rand() function
+#include <ctime>   // For time() function
+
+/********** Constant Values **************/
+const static unsigned int bits=2048;
+const static unsigned int key_len=1024;
+
+//const static unsigned int bits=4096;
+//const static unsigned int key_len=2048;
+
+//const static unsigned int bits=6144;
+//const static unsigned int key_len=3072;
+
+
+const static int TPB=512;
+const static int TPI=32;
+const static int window_bits=5;
+
+/** Class **/
+struct CgbnPair {
+  cgbn_mem_t<bits> g;
+  cgbn_mem_t<bits> h;
+};
+
+/*************Error Handling**************/
+bool check(cudaError_t e, int iLine, const char *szFile) {
+    if (e != cudaSuccess) {
+        std::cout << "CUDA runtime API error " << cudaGetErrorString(e) << " at line " << iLine << " in file " << szFile << std::endl;
+        exit(0);
+        return false;
+    }
+    return true;
+}
+#define ck(call) check(call, __LINE__, __FILE__)
+
+void cgbn_check(cgbn_error_report_t *report, const char *file=NULL, int32_t line=0) {
+    // check for cgbn errors
+    if(cgbn_error_report_check(report)) {
+        printf("\n");
+        printf("CGBN error occurred: %s\n", cgbn_error_string(report));
+
+        if(report->_instance!=0xFFFFFFFF) {
+            printf("Error reported by instance %d", report->_instance);
+            if(report->_blockIdx.x!=0xFFFFFFFF || report->_threadIdx.x!=0xFFFFFFFF)
+                printf(", ");
+            if(report->_blockIdx.x!=0xFFFFFFFF)
+                printf("blockIdx=(%d, %d, %d) ", report->_blockIdx.x, report->_blockIdx.y, report->_blockIdx.z);
+            if(report->_threadIdx.x!=0xFFFFFFFF)
+                printf("threadIdx=(%d, %d, %d)", report->_threadIdx.x, report->_threadIdx.y, report->_threadIdx.z);
+            printf("\n");
+        }
+        else {
+            printf("Error reported by blockIdx=(%d %d %d)", report->_blockIdx.x, report->_blockIdx.y, report->_blockIdx.z);
+            printf("threadIdx=(%d %d %d)\n", report->_threadIdx.x, report->_threadIdx.y, report->_threadIdx.z);
+        }
+        if(file!=NULL)
+            printf("file %s, line %d\n", file, line);
+        exit(1);
+    }
+}
+#define CGBN_CHECK(report) cgbn_check(report, __FILE__, __LINE__)
+
+/*************Time Handling**************/
+class CudaTimer{
+    private:
+        cudaEvent_t event_start;
+        cudaEvent_t event_stop;
+        cudaStream_t stream;
+        float time;
+    public:
+        CudaTimer(cudaStream_t stream){
+            this->stream=stream;
+        }
+        void start(){
+            ck(cudaEventCreate(&event_start));
+            ck(cudaEventCreate(&event_stop));
+            ck(cudaEventRecord(event_start, stream)); 
+        }
+        float stop(){
+            ck(cudaEventRecord(event_stop,stream));
+            ck(cudaEventSynchronize(event_stop));
+            ck(cudaEventElapsedTime(&time, event_start, event_stop));
+            ck(cudaEventDestroy(event_start));
+            ck(cudaEventDestroy(event_stop));
+            return time;
+        }
+        ~CudaTimer(){
+        }
+};
+
+/**********GMP and CGBN functions***************/
+void getPrimeOver(mpz_t rop, int bits, uint64_t &seed_start){
+    gmp_randstate_t state;
+    gmp_randinit_default(state);
+    gmp_randseed_ui(state, seed_start);
+    seed_start++;
+    mpz_t rand_num;
+    mpz_init(rand_num);
+    mpz_urandomb(rand_num, state, bits);
+    //gmp_printf("rand_num:%Zd\n", rand_num);
+    mpz_setbit(rand_num, bits-1);
+    mpz_nextprime(rop, rand_num); 
+    mpz_clear(rand_num);
+}
+
+template<unsigned int BITS>
+void store2Cgbn(cgbn_mem_t<BITS> *address,  mpz_t z) {
+    size_t words;
+    if(mpz_sizeinbase(z, 2) > BITS) {
+        printf("mpz_sizeinbase: %lu exceeds %d\n", mpz_sizeinbase(z, 2), BITS);
+        exit(1);
+    }
+
+    mpz_export((uint32_t *)address, &words, -1, sizeof(uint32_t), 0, 0, z);
+    while(words<(BITS+31)/32)
+        ((uint32_t *)address)[words++]=0;
+}
+
+template<unsigned int BITS>
+void store2Gmp(mpz_t z, cgbn_mem_t<BITS> *address ) {
+    mpz_import(z, (BITS+31)/32, -1, sizeof(uint32_t), 0, 0, (uint32_t *)address);
+}
+
+template<unsigned int BITS>
+void initArr(cgbn_mem_t<BITS> *address, int count, int default_value = 0, bool randomize = false){
+    
+    for(int i = 0; i < count; i++){
+        int value;
+        mpz_t n;
+        mpz_init(n);
+        if (randomize) {
+            value = i; // rand();
+        } else {
+            value = default_value;
+        }
+        mpz_set_si(n, value);
+
+        store2Cgbn(address + i, n);
+
+        gmp_printf("input%d:%Zd\n", i, n);
+        mpz_clear(n); 
+    }
+}
+template<unsigned int BITS>
+void printCgbn(cgbn_mem_t<BITS> *h_ptr, int print_count){
+    for(int i = 0; i < print_count; i++){
+        mpz_t n;
+        mpz_init(n);
+        store2Gmp(n, h_ptr + i);
+        gmp_printf("printCgbn [%d]:%Zd\n",i,  n);
+        mpz_clear(n); 
+    }
+}
+
+template<unsigned int BITS>
+void printDevCgbn(cgbn_mem_t<BITS> *d_ptr, int print_count, std::string name="cipher"){
+
+    int mem_size=sizeof(cgbn_mem_t<BITS>)*print_count;
+    cgbn_mem_t<BITS>* h_plains_ptr=(cgbn_mem_t<BITS>* )malloc(mem_size);
+    cudaMemcpy(h_plains_ptr, d_ptr, mem_size, cudaMemcpyDeviceToHost); 
+
+    for(int i = 0; i < print_count; i++){
+        mpz_t n;
+        mpz_init(n);
+        store2Gmp(n, h_plains_ptr + i);
+        gmp_printf("printDevCgbn %s[%d]:%Zd\n",name,i, n);
+        mpz_clear(n); 
+    }
+
+
+    free(h_plains_ptr);
+}
+
+template<unsigned int BITS>
+void printDevGH(CgbnPair *d_ptr, int print_count, std::string name="cipher"){
+
+    int mem_size=sizeof(CgbnPair) * print_count;
+    CgbnPair* h_plains_ptr=(CgbnPair *)malloc(mem_size);
+    cudaMemcpy(h_plains_ptr, d_ptr, mem_size, cudaMemcpyDeviceToHost); 
+
+    for(int i = 0; i < print_count; i++){
+        mpz_t g, h;
+        mpz_init(g);
+        mpz_init(h);
+        CgbnPair p = *(h_plains_ptr +i);
+        store2Gmp(g, &p.g);
+        store2Gmp(h, &p.h);
+        gmp_printf("printDevCgbn %s[%d]:g %Zd, h %Zd\n",name, i, g, h);
+        mpz_clear(g); 
+        mpz_clear(h); 
+    }
+
+    free(h_plains_ptr);
+}
+
+template<unsigned int BITS>
+void compArr(cgbn_mem_t<BITS> *a, cgbn_mem_t<BITS> *b,int count){
+    int mem_size=sizeof(cgbn_mem_t<BITS>)*count;
+    cgbn_mem_t<BITS>* ha=(cgbn_mem_t<BITS>* )malloc(mem_size);
+    cudaMemcpy(ha, a, mem_size, cudaMemcpyDeviceToHost); 
+
+    cgbn_mem_t<BITS>* hb=(cgbn_mem_t<BITS>* )malloc(mem_size);
+    cudaMemcpy(hb, b, mem_size, cudaMemcpyDeviceToHost); 
+
+
+    for(int i = 0; i < count; i++){
+        int res=0;
+        mpz_t na, nb;
+        mpz_init(na);
+        store2Gmp(na, ha + i);
+
+        mpz_init(nb);
+        store2Gmp(nb, hb + i);
+
+        res=mpz_cmp(na, nb);
+        if(res!=0){
+            std::cout<<"res= "<<res<<" Incorrect at i= "<<i<<std::endl;;
+            gmp_printf("  a=%Zd \n",na);
+            gmp_printf("  b=%Zd \n",nb);
+            exit(1);
+        }
+
+        mpz_clear(na);
+        mpz_clear(nb);
+    }
+    std::cout<<"Correct!"<<std::endl;
+}
+
+bool compare_result(const std::vector<double> &a, const std::vector<double> &b, double eps=1e-6) {
+    if (a.size() != b.size()) return false;
+    for (auto i = 0; i < a.size(); ++i) {
+        if (fabs(a[i] - b[i]) >= eps) {
+            std::cout << "Fatal Error at position " << i << " " << a[i] << " " << b[i] << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+#endif // CUDA_UTILS_H
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/delegated_plugin.cc b/integration/xgboost/encryption_plugins/cuda_plugin/src/delegated_plugin.cc
new file mode 100755
index 0000000000..6b348b3cb6
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/delegated_plugin.cc
@@ -0,0 +1,32 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "delegated_plugin.h"
+#include "cuda_plugin.h"
+
+namespace nvflare {
+
+DelegatedPlugin::DelegatedPlugin(std::vector<std::pair<std::string_view, std::string_view>> const &args):
+  BasePlugin(args) {
+
+  auto name = get_string(args, "name");
+  if (name == "cuda_paillier") {
+    plugin_ = new CUDAPlugin(args);
+  } else {
+    throw std::invalid_argument{"Unknown plugin name: " + name};
+  }
+}
+
+} // namespace nvflare
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/endec.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/endec.h
new file mode 100755
index 0000000000..206746371c
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/endec.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ENDEC_H
+#define ENDEC_H
+
+#include "gmp.h"
+
+class Endec {
+  private:
+    bool debug_ = false;
+    double precision_;
+
+  public:
+    Endec(double p, bool debug = false): debug_(debug), precision_(p) {}
+
+    void encode(mpz_t& result, const double& number) {
+      int64_t temp = static_cast<int64_t>(number * precision_);
+      uint64_t output_number = static_cast<uint64_t>(temp);
+
+      mpz_set_ui(result, output_number);
+      if (debug_) printf("Encoding using (p %f): input %f, output %lu\n", precision_, number, output_number);
+
+    }
+
+    double decode(const mpz_t& number) {
+      uint64_t output_num = mpz_get_ui(number);
+      int64_t sint = static_cast<int64_t>(output_num);
+      double result = sint / precision_;
+
+      if (debug_) gmp_printf("Decoding using (p %f): input %Zd, output %f\n", precision_, number, result);
+      return result;
+    }
+};
+
+#endif // ENDEC_H
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/paillier.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/paillier.h
new file mode 100755
index 0000000000..8226f4cfa0
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/paillier.h
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PAILLIER_H
+#define PAILLIER_H
+
+#pragma once
+
+#include <random>
+#include "cuda_utils.h"
+
+
+/***********************Declare*************************/
+
+template<unsigned int TPI, unsigned int BITS>
+__global__ void gpu_encrypt(cgbn_error_report_t *report, cgbn_mem_t<BITS> *plains, cgbn_mem_t<BITS> * ciphers, int count); 
+
+template<unsigned int TPI, unsigned int BITS>
+__global__ void gpu_decrypt(cgbn_error_report_t *report, cgbn_mem_t<BITS> * plains, cgbn_mem_t<BITS> *ciphers, int count);
+
+template<unsigned int TPI, unsigned int BITS>
+__global__ void reduce_sum(cgbn_error_report_t *report, CgbnPair* result, CgbnPair* arr, int count, CgbnPair* one);
+
+template<unsigned int TPI, unsigned int BITS>
+__global__ void reduce_sum_with_index(cgbn_error_report_t *report, CgbnPair* result, CgbnPair* arr,
+int* sample_bin, int count, CgbnPair* one);
+
+
+/***********************Class**********************/
+template <unsigned int BITS>
+struct PaillierPubKey{
+    cgbn_mem_t<BITS> n;
+    cgbn_mem_t<BITS> n_1;
+    cgbn_mem_t<BITS> n_square;
+    cgbn_mem_t<BITS> limit_int;
+    cgbn_mem_t<BITS> rand_seed;
+};
+
+template <unsigned int BITS>
+struct PaillierPrvKey{
+    cgbn_mem_t<BITS> lamda;
+    cgbn_mem_t<BITS> u;
+};
+
+
+
+__constant__ PaillierPrvKey<bits> c_PriKey;
+__constant__ PaillierPubKey<bits> c_PubKey;
+
+
+template <unsigned int BITS>
+class PaillierCipher{
+    private:
+        mpz_t n_, p_, q_;
+        uint64_t _rand_seed;
+        bool fix_seed_ = false;
+
+    public:
+        int key_len;
+        bool debug_ = false;
+        bool has_pub_key = false;
+        bool has_prv_key = false;
+        PaillierPubKey<BITS> pub_key;
+        PaillierPrvKey<BITS> prv_key;
+        CgbnPair _one;            
+
+    public:
+        PaillierCipher(int key_len, bool fix_seed = false, bool debug = false){
+            this->key_len=key_len;
+            debug_ = debug;
+            fix_seed_ = fix_seed;
+            mpz_init(n_);
+            mpz_init(p_);
+            mpz_init(q_);
+
+            if (debug_) std::cout<<"Construct PaillierCipher"<<std::endl;
+        }
+        ~PaillierCipher(){
+            mpz_clear(n_);
+            mpz_clear(p_);
+            mpz_clear(q_);
+        }
+
+        void getN(mpz_t n) { mpz_set(n, n_); }
+
+        uint64_t get_rand_seed() { return _rand_seed; }
+
+        void set_pub_key(mpz_t &n, uint64_t rand) {
+            if (debug_) {
+                std::cout<<"PaillierCipher::set_pub_key" << std::endl;
+                gmp_printf("n:%Zd\n", n);
+                gmp_printf("rand:%d\n", rand);
+            }
+            mpz_set(n_, n);
+            _rand_seed = rand;
+            init_pub(n, rand);
+            has_pub_key = true;
+        }
+
+        void set_keys(mpz_t &n, uint64_t rand, mpz_t &raw_p, mpz_t &raw_q){
+            set_pub_key(n, rand);
+            mpz_set(p_, raw_p);
+            mpz_set(q_, raw_q);
+            init_prv(n, raw_p, raw_q);
+            has_prv_key = true;
+        }
+
+        void init_pub(mpz_t &n, uint64_t rand){
+            if (debug_) std::cout<<"PaillierCipher::init_pub" << std::endl;
+            mpz_t n_1, n_square, limit_int, rand_seed, one;
+            mpz_init(n_1);
+            mpz_init(n_square);
+            mpz_init(limit_int);
+            mpz_init(rand_seed);
+            mpz_init(one);
+
+            mpz_add_ui(n_1, n, 1);
+            mpz_mul(n_square, n, n);
+            mpz_div_ui(limit_int, n, 3);
+            mpz_sub_ui(limit_int, limit_int, 1);
+            mpz_sub(limit_int, n,limit_int);
+
+            mpz_set_ui(rand_seed, rand);
+            mpz_powm(rand_seed, rand_seed, n, n_square);
+            mpz_mul(one, n_1, rand_seed);
+            mpz_mod(one, one, n_square);
+
+            store2Cgbn(&pub_key.n, n);
+            store2Cgbn(&pub_key.n_1, n_1);
+            store2Cgbn(&pub_key.n_square, n_square);
+            store2Cgbn(&pub_key.limit_int, limit_int);
+            store2Cgbn(&pub_key.rand_seed, rand_seed);
+            store2Cgbn(&_one.g, rand_seed);
+            store2Cgbn(&_one.h, rand_seed);
+            ck(cudaMemcpyToSymbol(c_PubKey, &pub_key, sizeof(pub_key)));
+
+            if (debug_) {
+                gmp_printf("n_1:%Zd\n", n_1);
+                gmp_printf("n:%Zd\n", n);
+                gmp_printf("rand:%d \n", rand);
+                gmp_printf("n_square:%Zd\n", n_square);
+                gmp_printf("limit_int:%Zd\n", limit_int);
+                gmp_printf("rand_seed:%Zd \n", rand_seed);
+                gmp_printf("one:%Zd \n", one);
+            }
+            mpz_clear(n_1);
+            mpz_clear(n_square);
+            mpz_clear(limit_int);
+            mpz_clear(rand_seed);
+            mpz_clear(one);
+            if (debug_) std::cout<<"end PaillierCipher::init_pub" << std::endl;
+        }
+
+        void init_prv(mpz_t n, mpz_t raw_p, mpz_t raw_q){
+            if (debug_) std::cout<<"PaillierCipher::init_prv" <<std::endl;
+            mpz_t p, q;
+            mpz_init(p);
+            mpz_init(q);
+
+            mpz_t lamda, u;
+            mpz_init(lamda);
+            mpz_init(u);
+
+            if(mpz_cmp(raw_q, raw_p) < 0) {
+                mpz_set(p, raw_q);
+                mpz_set(q, raw_p);
+            } else {
+                mpz_set(p, raw_p);
+                mpz_set(q, raw_q);
+            }
+
+            mpz_sub_ui(p, p, 1);
+            mpz_sub_ui(q, q, 1);
+            mpz_mul(lamda, p, q);
+            store2Cgbn(&prv_key.lamda, lamda);
+
+            mpz_invert(u,lamda,n);
+            store2Cgbn(&prv_key.u, u);
+
+            if (debug_) {
+                gmp_printf("\np:%Zd\n", p);
+                gmp_printf("q:%Zd\n", q);
+                gmp_printf("\nlamda:%Zd\n", lamda);
+                gmp_printf("u:%Zd\n", u);
+            }
+
+            mpz_clear(p);
+            mpz_clear(q);
+
+            mpz_clear(lamda);
+            mpz_clear(u);
+            if (debug_) std::cout<<"end PaillierCipher::init_prv " << std::endl;
+        }
+
+        void genKeypair(){
+            if (debug_) std::cout<<"PaillierCipher::genKeypair" << std::endl;
+            //Init mpz
+            mpz_t p;
+            mpz_t q;
+            mpz_t n;
+
+            mpz_init(p);
+            mpz_init(q);
+            mpz_init(n);
+
+
+            //pick random (modulusbits/2)-bit primes p and q)
+            std::random_device rd;           // Non-deterministic random seed
+            std::mt19937 gen(rd());          // Mersenne Twister generator
+
+            // Uniform integer distribution
+            std::uniform_int_distribution<uint64_t> distribution(0, UINT64_MAX);
+
+            // Generate a random number
+            _rand_seed = distribution(gen);
+            if (fix_seed_) _rand_seed = 12345;
+
+            uint64_t seed_start = _rand_seed;
+            int n_len = 0;
+            while(n_len!=key_len){
+                getPrimeOver(p, key_len/2, seed_start);
+                mpz_set(q, p);
+                while(mpz_cmp(p, q) == 0){
+                    getPrimeOver(q, key_len/2, seed_start);
+                    mpz_mul(n, p, q);
+                    n_len = mpz_sizeinbase(n, 2);
+                }
+            }
+
+            // Set Key
+            set_keys(n, _rand_seed, p, q);
+
+
+            if (debug_) {
+                printf("Rand bits for n: %lu, key_len %d\n", mpz_sizeinbase(n, 2), key_len);
+                std::cout<<"The size of data is:" <<sizeof(prv_key)<<" "<<sizeof(pub_key)<<std::endl;
+            }
+
+            //Set device variable
+            ck(cudaMemcpyToSymbol(c_PriKey, &prv_key, sizeof(prv_key)));
+            ck(cudaMemcpyToSymbol(c_PubKey, &pub_key, sizeof(pub_key)));
+            ck(cudaDeviceSynchronize());
+            ck(cudaGetLastError());
+
+
+            //Free mpz
+            mpz_clear(p);
+            mpz_clear(q);
+            mpz_clear(n);
+            if (debug_) std::cout<<"end PaillierCipher::genKeypair" << std::endl;
+        }
+
+
+        void updateRandSeed(uint64_t rand){
+            mpz_t n, n_square, rand_seed; 
+            mpz_init(rand_seed);
+            mpz_init(n);
+            mpz_init(n_square);
+
+            mpz_set_ui(rand_seed, rand);
+
+            store2Gmp(n, &pub_key.n);
+            store2Gmp(n_square, &pub_key.n_square);
+            mpz_powm(rand_seed, rand_seed, n, n_square); 
+
+
+            store2Cgbn(&pub_key.rand_seed, rand_seed); 
+            store2Cgbn(&_one.g, rand_seed); 
+            store2Cgbn(&_one.h, rand_seed); 
+            if (debug_) gmp_printf("Updated rand_seed:%Zd \n", rand_seed);
+            ck(cudaMemcpyToSymbol(c_PubKey, &pub_key, sizeof(pub_key)));
+#ifdef DEBUG
+            ck(cudaDeviceSynchronize());
+            ck(cudaGetLastError());
+#endif
+            mpz_clear(rand_seed);
+        }
+
+
+        template<unsigned int TPI, unsigned int TPB>
+            int encrypt(cgbn_mem_t<BITS>* d_plains_ptr, cgbn_mem_t<BITS>* d_ciphers_ptr, int count){
+                int IPB=TPB/TPI;
+                cgbn_error_report_t *report;
+                ck(cgbn_error_report_alloc(&report));
+                
+#ifdef DEBUG
+                ck(cudaDeviceSynchronize());
+                ck(cudaGetLastError());
+                std::cout<< "numBlocks: "<< (count+IPB-1)/IPB << ", threadsPerBlock: "<< TPB<<std::endl;
+#endif
+#ifdef TIME
+                CudaTimer cuda_timer(0);
+                cuda_timer.start();
+#endif
+
+                gpu_encrypt<TPI, BITS><<<(count+IPB-1)/IPB, TPB>>>(report, d_plains_ptr, d_ciphers_ptr, count); 
+
+#ifdef DEBUG
+                ck(cudaDeviceSynchronize());
+                ck(cudaGetLastError());
+                CGBN_CHECK(report);
+#endif
+#ifdef TIME
+                float encrypt_time=cuda_timer.stop();
+                std::cout<<"Encrypt Time (TPI="<<TPI<<" , TBP="<<TPB<<" ): "<<encrypt_time<<" MS"<<std::endl;
+#endif
+                ck(cgbn_error_report_free(report));
+                return 0;
+            }
+
+        template<unsigned int TPI, unsigned int TPB>
+            int decrypt(cgbn_mem_t<BITS>* d_ciphers_ptr, cgbn_mem_t<BITS>* d_plains_ptr,int count){
+                int IPB=TPB/TPI;
+
+                cgbn_error_report_t *report;
+                ck(cgbn_error_report_alloc(&report));
+#ifdef TIME
+                CudaTimer cuda_timer(0);
+                cuda_timer.start();
+#endif
+                gpu_decrypt<TPI, BITS><<<(count+IPB-1)/IPB, TPB>>>(report, d_plains_ptr, d_ciphers_ptr, count);
+#ifdef DEBUG
+                ck(cudaDeviceSynchronize());
+                ck(cudaGetLastError());
+                CGBN_CHECK(report);
+                ck(cgbn_error_report_free(report));
+#endif
+
+#ifdef TIME
+                float decrypt_time=cuda_timer.stop();
+                std::cout<<"Decrypt Time (TPI="<<TPI<<" , TBP="<<TPB<<" ): "<<decrypt_time<<" MS"<<std::endl;
+#endif
+                return 0;
+            }
+
+        template<unsigned int TPI, unsigned int TPB>
+            int sum(CgbnPair* d_res_ptr, CgbnPair* d_arr_ptr, int* sample_bin, int count) {
+                int IPB=TPB/TPI;
+                int maxBlocks = 2560;
+                int numBlocks = min((count - 1) / IPB + 1, maxBlocks);
+                int mem_size = numBlocks * sizeof(CgbnPair);
+                if (count == 0) {
+                    cudaMemcpy(d_res_ptr, &_one, sizeof(CgbnPair), cudaMemcpyHostToDevice);
+                    return 0;
+                }
+
+                cgbn_error_report_t *report;
+                ck(cgbn_error_report_alloc(&report));
+#ifdef TIME
+                CudaTimer cuda_timer(0);
+                cuda_timer.start();
+#endif
+
+                CgbnPair* d_res_ptr_2;
+                ck(cudaMalloc((void **)&d_res_ptr_2, mem_size));
+                CgbnPair* d_one;
+                ck(cudaMalloc((void **)&d_one, sizeof(CgbnPair)));
+                cudaMemcpy(d_one, &_one, sizeof(CgbnPair), cudaMemcpyHostToDevice);
+                
+                typedef cgbn_context_t<TPI>         context_t;
+                typedef cgbn_env_t<context_t, BITS> env_t;
+                typedef typename env_t::cgbn_t bn_t;
+                int shmem_size = IPB * sizeof(CgbnPair);
+
+#ifdef DEBUG
+                std::cout << "before calling reduce_sum with CgbnPair and sample_bin" << std::endl;
+                std::cout << "before calling reduce_sum count: " << count << " shm_size: " << shmem_size << " numBlocks: " << numBlocks << std::endl;
+                std::cout << "before calling reduce_sum TPI: " << TPI << " TPB: " << TPB << " IPB: " << IPB << std::endl;
+#endif
+
+                reduce_sum_with_index<TPI, BITS><<<numBlocks, TPB, shmem_size>>>(report, d_res_ptr_2, d_arr_ptr, sample_bin, count, d_one);
+                ck(cudaDeviceSynchronize());
+
+#ifdef DEBUG
+                std::cout << "after calling reduce_sum" << std::endl;
+#endif
+
+                // final reduction
+                if (numBlocks != 1) {
+                    reduce_sum<TPI, BITS><<<1, TPB, shmem_size>>>(report, d_res_ptr, d_res_ptr_2, numBlocks, d_one);
+                } else {
+                    cudaMemcpy(d_res_ptr, d_res_ptr_2, mem_size, cudaMemcpyDeviceToDevice);
+                }
+                ck(cudaDeviceSynchronize());
+
+#ifdef DEBUG
+                ck(cudaGetLastError());
+                CGBN_CHECK(report);
+                ck(cgbn_error_report_free(report));
+#endif
+
+#ifdef TIME
+                float add_time=cuda_timer.stop();
+                std::cout<<"Add Time (TPI="<<TPI<<" , TBP="<<TPB<<" ): "<<add_time<<" MS"<<std::endl;
+#endif
+                cudaFree(d_res_ptr_2);
+                cudaFree(d_one);
+                return 0;
+	    }
+
+};
+
+
+/***********************Kernels*************************/
+template<unsigned int T_TPI, unsigned int T_BITS>
+__global__  
+void gpu_encrypt(cgbn_error_report_t *report, cgbn_mem_t<T_BITS> *plains, cgbn_mem_t<T_BITS> * ciphers, int count) {
+    int tid=(blockIdx.x*blockDim.x + threadIdx.x)/T_TPI;
+    if(tid>=count)
+        return;
+
+    static const uint32_t TPI=T_TPI;
+    static const uint32_t BITS=T_BITS;
+    typedef cgbn_context_t<TPI>         context_t;
+    typedef cgbn_env_t<context_t, BITS> env_t;
+    typedef typename env_t::cgbn_t bn_t;
+    typedef typename env_t::cgbn_wide_t bn_w_t;
+
+    context_t      bn_context(cgbn_report_monitor, report, tid);   // construct a context
+    env_t          bn_env(bn_context);                     // construct an environment for 1024-bit math
+
+    bn_t  t1, t2, t3;                             // define a, b, r as 1024-bit bignums
+    cgbn_load(bn_env, t1, &(c_PubKey.n));//tn_
+    cgbn_load(bn_env, t2, &(c_PubKey.limit_int));//t_tmp
+    cgbn_load(bn_env, t3, plains+tid);//t_p
+
+    int compare=cgbn_compare(bn_env, t3, t2); 
+    if( (compare>=0) &&(cgbn_compare(bn_env, t3, t1) < 0)){
+        cgbn_sub(bn_env, t2, t1, t3);
+        cgbn_mul(bn_env, t2, t1, t2);
+        cgbn_add_ui32(bn_env, t2, t2, 1);
+
+        cgbn_load(bn_env, t3, &(c_PubKey.n_square));
+        cgbn_rem(bn_env, t2, t2, t3);
+        cgbn_modular_inverse(bn_env, t2, t2, t3);
+    }else{
+        cgbn_mul(bn_env, t2, t1, t3);
+        cgbn_add_ui32(bn_env, t2, t2, 1);
+
+        cgbn_load(bn_env, t3, &(c_PubKey.n_square));
+        cgbn_rem(bn_env, t2, t2, t3);
+    }
+
+    cgbn_load(bn_env, t1, &(c_PubKey.rand_seed));
+
+    bn_w_t r;
+    cgbn_mul_wide(bn_env,r,t2, t1);
+    cgbn_rem_wide(bn_env,t2,r,t3);
+    cgbn_store(bn_env, ciphers  + tid, t2);
+}
+
+template<class env_t>
+__device__ __forceinline__ void fixed_window_powm_odd(env_t _env, 
+        typename env_t::cgbn_t &result, const typename env_t::cgbn_t &x, 
+        const typename env_t::cgbn_t &power, const typename env_t::cgbn_t &modulus) {
+    typename env_t::cgbn_t    t;
+    typename env_t::cgbn_local_t window[1<<window_bits];
+    int32_t    index, position, offset;
+    uint32_t   np0;
+
+    // conmpute x^power mod modulus, using the fixed window algorithm
+    // requires:  x<modulus,  modulus is odd
+
+    // compute x^0 (in Montgomery space, this is just 2^BITS - modulus)
+    cgbnn_egate(_env, t, modulus);
+    cgbn_store(_env, window+0, t);
+    
+    // convert x into Montgomery space, store into window table
+    np0=cgbn_bn2mont(_env, result, x, modulus);
+    cgbn_store(_env, window+1, result);
+    cgbn_set(_env, t, result);
+    
+    // compute x^2, x^3, ... x^(2^window_bits-1), store into window table
+    #pragma nounroll
+    for(index=2;index<(1<<window_bits);index++) {
+      cgbn_mont_mul(_env, result, result, t, modulus, np0);
+      cgbn_store(_env, window+index, result);
+    }
+
+    // find leading high bit
+    position=bits - cgbn_clz(_env, power);
+
+    // break the exponent into chunks, each window_bits in length
+    // load the most significant non-zero exponent chunk
+    offset=position % window_bits;
+    if(offset==0)
+      position=position-window_bits;
+    else
+      position=position-offset;
+    index=cgbn_extract_bits_ui32(_env, power, position, window_bits);
+    cgbn_load(_env, result, window+index);
+    
+    // process the remaining exponent chunks
+    while(position>0) {
+      // square the result window_bits times
+      #pragma nounroll
+      for(int sqr_count=0;sqr_count<window_bits;sqr_count++)
+        cgbn_mont_sqr(_env, result, result, modulus, np0);
+      
+      // multiply by next exponent chunk
+      position=position-window_bits;
+      index=cgbn_extract_bits_ui32(_env, power, position, window_bits);
+      cgbn_load(_env, t, window+index);
+      cgbn_mont_mul(_env, result, result, t, modulus, np0);
+    }
+    
+    // we've processed the exponent now, convert back to normal space
+    cgbn_mont2bn(_env, result, result, modulus, np0);
+  }
+
+
+template<class env_t>
+__device__ __forceinline__ void sliding_window_powm_odd(env_t _env, 
+        typename env_t::cgbn_t &result, const typename env_t::cgbn_t &x, 
+        const typename env_t::cgbn_t &power, const typename env_t::cgbn_t &modulus) {
+    typename env_t::cgbn_t     t, starts;
+    int32_t      index, position, leading;
+    uint32_t     mont_inv;
+    typename env_t::cgbn_local_t  odd_powers[1<<window_bits-1];
+
+    // conmpute x^power mod modulus, using Constant Length Non-Zero windows (CLNZ).
+    // requires:  x<modulus,  modulus is odd
+        
+    // find the leading one in the power
+    leading=bits-1-cgbn_clz(_env, power);
+    if(leading>=0) {
+      // convert x into Montgomery space, store in the odd powers table
+      mont_inv=cgbn_bn2mont(_env, result, x, modulus);
+      
+      // compute t=x^2 mod modulus
+      cgbn_mont_sqr(_env, t, result, modulus, mont_inv);
+      
+      // compute odd powers window table: x^1, x^3, x^5, ...
+      cgbn_store(_env, odd_powers, result);
+      #pragma nounroll
+      for(index=1;index<(1<<window_bits-1);index++) {
+        cgbn_mont_mul(_env, result, result, t, modulus, mont_inv);
+        cgbn_store(_env, odd_powers+index, result);
+      }
+  
+      // starts contains an array of bits indicating the start of a window
+      cgbn_set_ui32(_env, starts, 0);
+  
+      // organize p as a sequence of odd window indexes
+      position=0;
+      while(true) {
+        if(cgbn_extract_bits_ui32(_env, power, position, 1)==0)
+          position++;
+        else {
+          cgbn_insert_bits_ui32(_env, starts, starts, position, 1, 1);
+          if(position+window_bits>leading)
+            break;
+          position=position+window_bits;
+        }
+      }
+  
+      // load first window.  Note, since the window index must be odd, we have to
+      // divide it by two before indexing the window table.  Instead, we just don't
+      // load the index LSB from power
+      index=cgbn_extract_bits_ui32(_env, power, position+1, window_bits-1);
+      cgbn_load(_env, result, odd_powers+index);
+      position--;
+      
+      // Process remaining windows 
+      while(position>=0) {
+        cgbn_mont_sqr(_env, result, result, modulus, mont_inv);
+        if(cgbn_extract_bits_ui32(_env, starts, position, 1)==1) {
+          // found a window, load the index
+          index=cgbn_extract_bits_ui32(_env, power, position+1, window_bits-1);
+          cgbn_load(_env, t, odd_powers+index);
+          cgbn_mont_mul(_env, result, result, t, modulus, mont_inv);
+        }
+        position--;
+      }
+      
+      // convert result from Montgomery space
+      cgbn_mont2bn(_env, result, result, modulus, mont_inv);
+    }
+    else {
+      // p=0, thus x^p mod modulus=1
+      cgbn_set_ui32(_env, result, 1);
+    }
+  }
+
+
+template<unsigned int TPI, unsigned int BITS>
+__global__ void gpu_decrypt(cgbn_error_report_t *report, cgbn_mem_t<BITS> * plains, cgbn_mem_t<BITS> *ciphers, int count) {
+    int tid=(blockIdx.x*blockDim.x + threadIdx.x)/TPI;
+    if(tid>=count)
+        return;
+
+    cgbn_context_t<TPI>  bn_context(cgbn_report_monitor, report, tid);  
+    cgbn_env_t<cgbn_context_t<TPI>, BITS>  bn_env(bn_context);
+
+    typename cgbn_env_t<cgbn_context_t<TPI>, BITS>::cgbn_t t, p;
+    typename cgbn_env_t<cgbn_context_t<TPI>, BITS>::cgbn_t n;
+
+    cgbn_load(bn_env, t, ciphers + tid);
+    cgbn_load(bn_env, p, &(c_PriKey.lamda));
+    cgbn_load(bn_env, n, &(c_PubKey.n_square));
+
+    //cgbn_modular_power(bn_env, t, t,p, n);
+    //fixed_window_powm_odd(bn_env,t, t, p, n);
+    sliding_window_powm_odd(bn_env,t, t, p, n);
+
+
+    cgbn_load(bn_env, n, &(c_PubKey.n));
+    cgbn_sub_ui32(bn_env, t, t, 1);
+
+    cgbn_load(bn_env, p, &(c_PriKey.u));
+
+    cgbn_div(bn_env, t, t, n);
+    cgbn_mul(bn_env, t, t, p);
+    cgbn_rem(bn_env, t, t, n);
+
+    cgbn_store(bn_env, plains + tid, t);
+}
+
+
+template <unsigned int TPI, unsigned int BITS>
+__global__ void reduce_sum(cgbn_error_report_t* report, CgbnPair* result, CgbnPair* arr, int count, CgbnPair* one) {
+    typedef cgbn_context_t<TPI> context_t;
+    typedef cgbn_env_t<context_t, BITS> env_t;
+    typedef typename env_t::cgbn_t bn_t;
+    typedef typename env_t::cgbn_wide_t bn_w_t;
+
+    int id = (blockIdx.x * blockDim.x + threadIdx.x) / TPI;
+    int shm_id = threadIdx.x / TPI;
+    int IPB = blockDim.x / TPI;
+
+    context_t bn_context(cgbn_report_monitor, report, id);
+    env_t bn_env(bn_context);
+
+    extern __shared__ CgbnPair sdata3[];
+    bn_t a, b, c, tmp_g, tmp_h;
+    bn_t n_square;
+    bn_w_t r;
+
+    int total_windows = (count - 1) / (IPB * gridDim.x) + 1;
+    cgbn_load(bn_env, n_square, &c_PubKey.n_square);
+    for (unsigned int window = 0; window < total_windows; window++) {
+        int global_position = id + window * IPB * gridDim.x;
+        if (global_position >= count) {
+            // Load rand_seed into sdata3 directly for positions exceeding count
+            sdata3[shm_id] = one[0];
+        } else {
+            // Load pairs of elements from arr into sdata3
+            sdata3[shm_id] = arr[global_position];
+        }
+        __syncthreads();
+
+        // Perform reduction in shared memory
+        for (unsigned int s = IPB / 2; s > 0; s >>= 1) {
+            if (shm_id < s) {
+                // Load pairs of elements from shared memory and perform reduction
+                cgbn_load(bn_env, a, &(sdata3[shm_id].g));
+                cgbn_load(bn_env, b, &(sdata3[shm_id + s].g));
+                cgbn_mul_wide(bn_env, r, a, b);
+                cgbn_rem_wide(bn_env, c, r, n_square);
+                cgbn_store(bn_env, &(sdata3[shm_id].g), c);
+
+                cgbn_load(bn_env, a, &(sdata3[shm_id].h));
+                cgbn_load(bn_env, b, &(sdata3[shm_id + s].h));
+                cgbn_mul_wide(bn_env, r, a, b);
+                cgbn_rem_wide(bn_env, c, r, n_square);
+                cgbn_store(bn_env, &(sdata3[shm_id].h), c);
+            }
+            __syncthreads();
+        }
+
+        if (shm_id == 0) {
+            if (window == 0) {
+                // Store the result of the first window into tmp
+                cgbn_load(bn_env, tmp_g, &(sdata3[0].g));
+                cgbn_load(bn_env, tmp_h, &(sdata3[0].h));
+            } else {
+                // Add the result of subsequent windows to tmp
+                cgbn_load(bn_env, a, &(sdata3[0].g));
+                cgbn_mul_wide(bn_env, r, a, tmp_g);
+                cgbn_rem_wide(bn_env, tmp_g, r, n_square);
+
+                cgbn_load(bn_env, a, &(sdata3[0].h));
+                cgbn_mul_wide(bn_env, r, a, tmp_h);
+                cgbn_rem_wide(bn_env, tmp_h, r, n_square);
+            }
+        }
+        __syncthreads();
+    }
+
+    __syncthreads();
+
+    // Write the final result for this block to global memory
+    if (shm_id == 0) {
+        cgbn_store(bn_env, &(result[blockIdx.x].g), tmp_g);
+        cgbn_store(bn_env, &(result[blockIdx.x].h), tmp_h);
+    }
+    __syncthreads();
+}
+
+
+template <unsigned int TPI, unsigned int BITS>
+__global__ void reduce_sum_with_index(cgbn_error_report_t* report, CgbnPair* result, CgbnPair* arr, int* sample_bin, int count, CgbnPair* one) {
+                            
+    typedef cgbn_context_t<TPI> context_t;
+    typedef cgbn_env_t<context_t, BITS> env_t;
+    typedef typename env_t::cgbn_t bn_t;
+    typedef typename env_t::cgbn_wide_t bn_w_t;
+
+    int id = (blockIdx.x * blockDim.x + threadIdx.x) / TPI;
+    int shm_id = threadIdx.x / TPI;
+    int IPB = blockDim.x / TPI;
+
+    context_t bn_context(cgbn_report_monitor, report, id);
+    env_t bn_env(bn_context);
+
+    extern __shared__ CgbnPair sdata4[];
+    bn_t a, b, c, tmp_g, tmp_h;
+    bn_t n_square;
+    bn_w_t r;
+
+    int total_windows = (count - 1) / (IPB * gridDim.x) + 1;
+    cgbn_load(bn_env, n_square, &c_PubKey.n_square);
+    for (unsigned int window = 0; window < total_windows; window++) {
+        int global_position = id + window * IPB * gridDim.x;
+
+#ifdef DEBUG
+        printf("id %d shm_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d window %d total_windows %d global_position %d \n", id, shm_id, IPB, threadIdx.x, blockIdx.x, gridDim.x, window, total_windows, global_position);
+#endif
+      
+        if (global_position >= count) {
+            // Load rand_seed into sdata4 directly for positions exceeding count
+            sdata4[shm_id] = one[0];
+        } else {
+            int sample_id = sample_bin[global_position];
+            //printf("loading global position %d sample id %d", global_position, sample_id);
+            // each shm_id copy one instance from global to shared mem
+            sdata4[shm_id] = arr[sample_id];
+        }
+        __syncthreads();
+
+        // Perform reduction in shared memory
+        for (unsigned int s = IPB / 2; s > 0; s >>= 1) {
+            if (shm_id < s) {
+                // Load pairs of elements from shared memory and perform reduction
+                cgbn_load(bn_env, a, &(sdata4[shm_id].g));
+                cgbn_load(bn_env, b, &(sdata4[shm_id + s].g));
+                cgbn_mul_wide(bn_env, r, a, b);
+                cgbn_rem_wide(bn_env, c, r, n_square);
+                cgbn_store(bn_env, &(sdata4[shm_id].g), c);
+
+                cgbn_load(bn_env, a, &(sdata4[shm_id].h));
+                cgbn_load(bn_env, b, &(sdata4[shm_id + s].h));
+                cgbn_mul_wide(bn_env, r, a, b);
+                cgbn_rem_wide(bn_env, c, r, n_square);
+                cgbn_store(bn_env, &(sdata4[shm_id].h), c);
+            }
+            __syncthreads();
+        }
+
+        if (shm_id == 0) {
+            if (window == 0) {
+                // Store the result of the first window into tmp
+                cgbn_load(bn_env, tmp_g, &(sdata4[0].g));
+                cgbn_load(bn_env, tmp_h, &(sdata4[0].h));
+            } else {
+                // Add the result of subsequent windows to tmp
+                cgbn_load(bn_env, a, &(sdata4[0].g));
+                cgbn_mul_wide(bn_env, r, a, tmp_g);
+                cgbn_rem_wide(bn_env, tmp_g, r, n_square);
+
+                cgbn_load(bn_env, a, &(sdata4[0].h));
+                cgbn_mul_wide(bn_env, r, a, tmp_h);
+                cgbn_rem_wide(bn_env, tmp_h, r, n_square);
+            }
+        }
+        __syncthreads();
+    }
+
+    __syncthreads();
+
+    // Write the final result for this block to global memory
+    if (shm_id == 0) {
+        cgbn_store(bn_env, &(result[blockIdx.x].g), tmp_g);
+        cgbn_store(bn_env, &(result[blockIdx.x].h), tmp_h);
+    }
+    __syncthreads();
+}
+
+#endif // PAILLIER_H
diff --git a/integration/xgboost/encryption_plugins/.editorconfig b/integration/xgboost/encryption_plugins/nvflare_plugin/.editorconfig
similarity index 100%
rename from integration/xgboost/encryption_plugins/.editorconfig
rename to integration/xgboost/encryption_plugins/nvflare_plugin/.editorconfig
diff --git a/integration/xgboost/encryption_plugins/nvflare_plugin/CMakeLists.txt b/integration/xgboost/encryption_plugins/nvflare_plugin/CMakeLists.txt
new file mode 100644
index 0000000000..b834206f4f
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/nvflare_plugin/CMakeLists.txt
@@ -0,0 +1,49 @@
+
+option(GOOGLE_TEST "Build google tests" OFF)
+
+file(GLOB_RECURSE LIB_SRC
+	${CMAKE_SOURCE_DIR}/shared/dam/*.cc
+	${CMAKE_SOURCE_DIR}/shared/plugins/*.cc
+	${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc
+)
+message(STATUS "LIB_SRC files: ${LIB_SRC}")
+
+set(TARGET_NAME nvflare)
+add_library(${TARGET_NAME} SHARED ${LIB_SRC})
+set_target_properties(${TARGET_NAME} PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+  POSITION_INDEPENDENT_CODE ON
+  ENABLE_EXPORTS ON
+)
+target_include_directories(${TARGET_NAME} PRIVATE
+	${CMAKE_SOURCE_DIR}/shared/include
+	${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+if (APPLE)
+    add_link_options("LINKER:-object_path_lto,$<TARGET_PROPERTY:NAME>_lto.o")
+    add_link_options("LINKER:-cache_path_lto,${CMAKE_BINARY_DIR}/LTOCache")
+endif ()
+
+#-- Unit Tests
+if(GOOGLE_TEST)
+    find_package(GTest REQUIRED)
+    enable_testing()
+    add_executable(nvflare_test)
+    target_link_libraries(nvflare_test PRIVATE ${TARGET_NAME})
+
+
+    target_include_directories(nvflare_test PRIVATE
+        ${CMAKE_SOURCE_DIR}/shared/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+    )
+
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tests)
+
+    add_test(
+        NAME TestNvflarePlugins
+        COMMAND nvflare_test
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+endif()
diff --git a/integration/xgboost/encryption_plugins/nvflare_plugin/README.md b/integration/xgboost/encryption_plugins/nvflare_plugin/README.md
new file mode 100644
index 0000000000..a2c20a5461
--- /dev/null
+++ b/integration/xgboost/encryption_plugins/nvflare_plugin/README.md
@@ -0,0 +1,5 @@
+# XGBoost nvflare plugin
+
+Includes:
+  - nvflare plugin (that does not do encryption in the C++ code but rely on NVFlare python side to do it)
+  - pass-thru plugin (that just pass through to showcase how to write a plugin)
diff --git a/integration/xgboost/encryption_plugins/src/README.md b/integration/xgboost/encryption_plugins/nvflare_plugin/src/README.md
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/README.md
rename to integration/xgboost/encryption_plugins/nvflare_plugin/src/README.md
diff --git a/integration/xgboost/encryption_plugins/src/plugins/delegated_plugin.cc b/integration/xgboost/encryption_plugins/nvflare_plugin/src/delegated_plugin.cc
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/plugins/delegated_plugin.cc
rename to integration/xgboost/encryption_plugins/nvflare_plugin/src/delegated_plugin.cc
diff --git a/integration/xgboost/encryption_plugins/src/plugins/nvflare_plugin.cc b/integration/xgboost/encryption_plugins/nvflare_plugin/src/nvflare_plugin.cc
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/plugins/nvflare_plugin.cc
rename to integration/xgboost/encryption_plugins/nvflare_plugin/src/nvflare_plugin.cc
diff --git a/integration/xgboost/encryption_plugins/src/include/nvflare_plugin.h b/integration/xgboost/encryption_plugins/nvflare_plugin/src/nvflare_plugin.h
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/nvflare_plugin.h
rename to integration/xgboost/encryption_plugins/nvflare_plugin/src/nvflare_plugin.h
diff --git a/integration/xgboost/encryption_plugins/src/plugins/pass_thru_plugin.cc b/integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.cc
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/plugins/pass_thru_plugin.cc
rename to integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.cc
diff --git a/integration/xgboost/encryption_plugins/src/include/pass_thru_plugin.h b/integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.h
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/pass_thru_plugin.h
rename to integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.h
diff --git a/integration/xgboost/encryption_plugins/tests/CMakeLists.txt b/integration/xgboost/encryption_plugins/nvflare_plugin/tests/CMakeLists.txt
similarity index 100%
rename from integration/xgboost/encryption_plugins/tests/CMakeLists.txt
rename to integration/xgboost/encryption_plugins/nvflare_plugin/tests/CMakeLists.txt
diff --git a/integration/xgboost/encryption_plugins/tests/test_dam.cc b/integration/xgboost/encryption_plugins/nvflare_plugin/tests/test_dam.cc
similarity index 100%
rename from integration/xgboost/encryption_plugins/tests/test_dam.cc
rename to integration/xgboost/encryption_plugins/nvflare_plugin/tests/test_dam.cc
diff --git a/integration/xgboost/encryption_plugins/tests/test_main.cc b/integration/xgboost/encryption_plugins/nvflare_plugin/tests/test_main.cc
similarity index 100%
rename from integration/xgboost/encryption_plugins/tests/test_main.cc
rename to integration/xgboost/encryption_plugins/nvflare_plugin/tests/test_main.cc
diff --git a/integration/xgboost/encryption_plugins/tests/test_tenseal.py b/integration/xgboost/encryption_plugins/nvflare_plugin/tests/test_tenseal.py
similarity index 100%
rename from integration/xgboost/encryption_plugins/tests/test_tenseal.py
rename to integration/xgboost/encryption_plugins/nvflare_plugin/tests/test_tenseal.py
diff --git a/integration/xgboost/encryption_plugins/src/dam/README.md b/integration/xgboost/encryption_plugins/shared/dam/README.md
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/dam/README.md
rename to integration/xgboost/encryption_plugins/shared/dam/README.md
diff --git a/integration/xgboost/encryption_plugins/src/dam/dam.cc b/integration/xgboost/encryption_plugins/shared/dam/dam.cc
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/dam/dam.cc
rename to integration/xgboost/encryption_plugins/shared/dam/dam.cc
diff --git a/integration/xgboost/encryption_plugins/src/include/base_plugin.h b/integration/xgboost/encryption_plugins/shared/include/base_plugin.h
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/base_plugin.h
rename to integration/xgboost/encryption_plugins/shared/include/base_plugin.h
diff --git a/integration/xgboost/encryption_plugins/src/include/dam.h b/integration/xgboost/encryption_plugins/shared/include/dam.h
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/dam.h
rename to integration/xgboost/encryption_plugins/shared/include/dam.h
diff --git a/integration/xgboost/encryption_plugins/src/include/data_set_ids.h b/integration/xgboost/encryption_plugins/shared/include/data_set_ids.h
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/data_set_ids.h
rename to integration/xgboost/encryption_plugins/shared/include/data_set_ids.h
diff --git a/integration/xgboost/encryption_plugins/src/include/delegated_plugin.h b/integration/xgboost/encryption_plugins/shared/include/delegated_plugin.h
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/delegated_plugin.h
rename to integration/xgboost/encryption_plugins/shared/include/delegated_plugin.h
diff --git a/integration/xgboost/encryption_plugins/src/include/local_plugin.h b/integration/xgboost/encryption_plugins/shared/include/local_plugin.h
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/local_plugin.h
rename to integration/xgboost/encryption_plugins/shared/include/local_plugin.h
diff --git a/integration/xgboost/encryption_plugins/src/include/util.h b/integration/xgboost/encryption_plugins/shared/include/util.h
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/include/util.h
rename to integration/xgboost/encryption_plugins/shared/include/util.h
diff --git a/integration/xgboost/encryption_plugins/src/plugins/local_plugin.cc b/integration/xgboost/encryption_plugins/shared/plugins/local_plugin.cc
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/plugins/local_plugin.cc
rename to integration/xgboost/encryption_plugins/shared/plugins/local_plugin.cc
diff --git a/integration/xgboost/encryption_plugins/src/plugins/plugin_main.cc b/integration/xgboost/encryption_plugins/shared/plugins/plugin_main.cc
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/plugins/plugin_main.cc
rename to integration/xgboost/encryption_plugins/shared/plugins/plugin_main.cc
diff --git a/integration/xgboost/encryption_plugins/src/plugins/util.cc b/integration/xgboost/encryption_plugins/shared/plugins/util.cc
old mode 100644
new mode 100755
similarity index 100%
rename from integration/xgboost/encryption_plugins/src/plugins/util.cc
rename to integration/xgboost/encryption_plugins/shared/plugins/util.cc