Feature/expert parallel (#9)
* add back expert parallel by id hash (see the sketch below)

* add Grok EP (expert parallel)

* fix Mistral typo

* accommodate cuda copy bug

* sync after compute

* fix: sync to make sure that the input is ready

---------

Co-authored-by: xly <[email protected]>
Co-authored-by: luzhan <[email protected]>
3 people authored May 5, 2024
1 parent 30676fa commit 08ded21
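The headline change restores expert parallelism keyed on a hash of the expert ID (first bullet). The routing code itself sits in files collapsed below, so the following is a rough illustration only, assuming a plain modulo hash over the expert ID (which may differ from the repository's actual scheme):

#include <cstdint>

// Hypothetical sketch: choose a GPU for each expert by hashing its ID.
// A modulo hash gives a deterministic placement that every rank can
// compute locally; MoE-Infinity's actual dispatcher may hash differently.
inline int AssignExpertToGpu(int64_t expert_id, int num_gpus) {
    return static_cast<int>(expert_id % num_gpus);
}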
Showing 20 changed files with 155 additions and 1,151 deletions.
README.md: 4 changes (2 additions, 2 deletions)
@@ -36,7 +36,7 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
Single GPU A5000 (24GB Memory), per-token-latency (seconds) for generation with a mixed dataset that includes [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) datasets.
Lower per-token-latency is preferable.

-| | switch-large-128 | NLLB-MoE-54B | Mixtral-7x8b |
+| | switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b |
| :---: | :---: | :---: | :---: |
| <ins>MoE-Infinity</ins> | <ins>*0.230*</ins> | <ins>*0.239*</ins> | <ins>*0.895*</ins> |
| Accelerate | 1.043 | 3.071 | 6.633 |
@@ -48,7 +48,7 @@ Lower per-token-latency is preferable.
Single GPU A5000, throughput (token/s) for generation with batch size 32.
Higher throughput is preferable.

-| | switch-large-128 | NLLB-MoE-54B | Mixtral-7x8b |
+| | switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b |
| :---: | :---: | :---: | :---: |
| <ins>MoE-Infinity</ins> | <ins>*69.105*</ins> | <ins>*30.300*</ins> | <ins>*12.579*</ins> |
| Accelerate | 5.788 | 4.344 | 1.245 |
core/aio/archer_prio_aio_handle.cpp: 2 changes (1 addition, 1 deletion)
@@ -90,7 +90,7 @@ std::int64_t ArcherPrioAioHandle::Write(const std::string& filename,

auto mem_type = IsDevicePointer(buffer) ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost;
cudaHostAlloc(&write_buffer, num_bytes_aligned, cudaHostAllocDefault);
-cudaMemcpy(write_buffer, buffer, num_bytes, mem_type);
+CudaMemcpy(write_buffer, buffer, num_bytes, mem_type);
auto callbacks =
aio_context_.PrepIocbs(false, write_buffer, fd, kBlockSize, offset, num_bytes_aligned);
auto io_request = std::make_shared<struct AioRequest>();
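This hunk, like the matching ones in model_topology.cpp and archer_prefetch_handle.cpp below, swaps a raw cudaMemcpy call for a CudaMemcpy wrapper, presumably the one declared in the newly included utils/cuda_utils.h. The wrapper's definition is not part of this diff; the sketch below is only an assumption about its intent, namely an error-checked forwarding call:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical sketch of the CudaMemcpy wrapper from utils/cuda_utils.h
// (its real definition is not shown in this diff): forward to cudaMemcpy
// and fail loudly instead of letting a copy error pass silently.
inline void CudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
    cudaError_t err = cudaMemcpy(dst, src, count, kind);
    if (err != cudaSuccess) {
        std::fprintf(stderr, "CudaMemcpy failed: %s\n", cudaGetErrorString(err));
        std::abort();
    }
}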
core/model/model_topology.cpp: 4 changes (2 additions, 2 deletions)
@@ -114,9 +114,9 @@ void Node::SetDevice(const torch::Device& target_device,

auto start_time = MCIROSECONDS_SINCE_EPOCH;
if (stream == nullptr) {
-cudaMemcpy(device_memory_ptr, host_memory_ptr, byte_size, cudaMemcpyHostToDevice);
+CudaMemcpy(device_memory_ptr, host_memory_ptr, byte_size, cudaMemcpyHostToDevice);
} else {
-cudaMemcpyAsync(
+CudaMemcpyAsync(
device_memory_ptr, host_memory_ptr, byte_size, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
core/parallel/expert_dispatcher.cpp: 4 changes (4 additions, 0 deletions)
@@ -332,6 +332,7 @@ void ExpertDispatcher::GPUExecFunc(int gpu_id)

auto* expert_module = args.expert_node->module;
int expert_type = expert_type_;
+cudaStreamSynchronize(0); // make sure the input is ready

try {
switch (expert_type) {
@@ -369,6 +370,8 @@ void ExpertDispatcher::GPUExecFunc(int gpu_id)
ss << "]";
ARCHER_LOG_FATAL("ExpertDispatcher::GPUExecFunc", ss.str(), "expert_type", expert_type, e.what());
}

+stream.synchronize();
}

(void)std::async(std::launch::async,
@@ -414,6 +417,7 @@ void ExpertDispatcher::OutputFunc(ExecArgs args, torch::Tensor output, int gpu_i
gpu_id,
args.hit, ")");
}
+stream.synchronize();
pending_.fetch_sub(1);
}

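The synchronize calls added above implement the last two commit bullets: cudaStreamSynchronize(0) blocks until work queued on the default stream (where the expert's input is staged) has finished, and stream.synchronize() after the forward pass, and again in OutputFunc, keeps the output from being consumed before the compute stream has produced it. A reduced sketch of that ordering, assuming one side stream per GPU worker as in GPUExecFunc:

#include <cuda_runtime.h>
#include <functional>

// Hypothetical reduction of the ordering in ExpertDispatcher: wait for
// input staged on the default stream, run the expert on a side stream,
// then wait for that stream before handing the output onward.
void RunExpertOrdered(cudaStream_t compute_stream,
                      const std::function<void(cudaStream_t)>& launch_expert) {
    cudaStreamSynchronize(0);               // input copy on default stream is done
    launch_expert(compute_stream);          // expert forward pass
    cudaStreamSynchronize(compute_stream);  // output is complete before hand-off
}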
core/prefetch/archer_prefetch_handle.cpp: 4 changes (2 additions, 2 deletions)
@@ -11,7 +11,7 @@
#include "common/time.h"
#include "memory/memory_pool.h"
#include "task_scheduler.h"

#include "utils/cuda_utils.h"
#include "utils/archer_logger.h"

ArcherPrefetchHandle::ArcherPrefetchHandle(const std::string& prefix,
@@ -335,7 +335,7 @@ void ArcherPrefetchHandle::SetTensorDevice(torch::Tensor& tensor, torch::Device
cudaSetDevice(device.index());
cudaMalloc(&device_ptr, byte_size);

-cudaMemcpy(device_ptr, tensor.data_ptr(), byte_size, cudaMemcpyDeviceToDevice);
+CudaMemcpy(device_ptr, tensor.data_ptr(), byte_size, cudaMemcpyDeviceToDevice);

auto new_tensor = torch::from_blob(
device_ptr,
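The hunk above moves a tensor between devices by hand: cudaMalloc on the target, a device-to-device copy, then torch::from_blob around the raw pointer. A self-contained sketch of that pattern, illustrative only (it uses raw cudaMemcpy where the diff uses the CudaMemcpy wrapper) and with a deleter added so the buffer is freed with the tensor:

#include <torch/torch.h>
#include <cuda_runtime.h>

// Hypothetical sketch of the SetTensorDevice pattern: allocate on the
// target GPU, copy device-to-device, and wrap the raw pointer as a
// tensor. The deleter releases the cudaMalloc'd buffer with the tensor.
torch::Tensor CopyToDevice(const torch::Tensor& src, torch::Device device) {
    void* device_ptr = nullptr;
    const size_t byte_size = src.nbytes();
    cudaSetDevice(device.index());
    cudaMalloc(&device_ptr, byte_size);
    cudaMemcpy(device_ptr, src.data_ptr(), byte_size, cudaMemcpyDeviceToDevice);
    return torch::from_blob(
        device_ptr,
        src.sizes(),
        [](void* p) { cudaFree(p); },
        src.options().device(device));
}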
core/trace/archer_tensor_tracer.cpp: 140 changes (0 additions, 140 deletions)

This file was deleted.

core/trace/archer_tensor_tracer.h: 30 changes (0 additions, 30 deletions)

This file was deleted.

(Diffs for the remaining 13 of the 20 changed files are not shown.)
