Feature/expert parallel (#9)
* add back expert parallel by id hash (see the sketch below)

* add Grok EP (expert parallel)

* fix Mistral typo

* accommodate cuda copy bug

* sync after compute

* fix: sync to make sure that the input is ready

---------

Co-authored-by: xly <[email protected]>
Co-authored-by: luzhan <[email protected]>
3 people authored May 5, 2024
1 parent 30676fa commit 08ded21
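The headline change restores expert parallelism keyed on a hash of the expert ID (first bullet). The routing code itself sits in files collapsed below, so the following is a rough illustration only, assuming a plain modulo hash over the expert ID (which may differ from the repository's actual scheme):

#include <cstdint>

// Hypothetical sketch: choose a GPU for each expert by hashing its ID.
// A modulo hash gives a deterministic placement that every rank can
// compute locally; MoE-Infinity's actual dispatcher may hash differently.
inline int AssignExpertToGpu(int64_t expert_id, int num_gpus) {
    return static_cast<int>(expert_id % num_gpus);
}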
Showing 20 changed files with 155 additions and 1,151 deletions.
README.md: 4 changes (2 additions, 2 deletions)
@@ -36,7 +36,7 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
Single GPU A5000 (24GB Memory), per-token-latency (seconds) for generation with a mixed dataset that includes [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) datasets.
Lower per-token-latency is preferable.

-| | switch-large-128 | NLLB-MoE-54B | Mixtral-7x8b |
+| | switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b |
| :---: | :---: | :---: | :---: |
| <ins>MoE-Infinity</ins> | <ins>*0.230*</ins> | <ins>*0.239*</ins> | <ins>*0.895*</ins> |
| Accelerate | 1.043 | 3.071 | 6.633 |
@@ -48,7 +48,7 @@ Lower per-token-latency is preferable.
Single GPU A5000, throughput (token/s) for generation with batch size 32.
Higher throughput is preferable.

-| | switch-large-128 | NLLB-MoE-54B | Mixtral-7x8b |
+| | switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b |
| :---: | :---: | :---: | :---: |
| <ins>MoE-Infinity</ins> | <ins>*69.105*</ins> | <ins>*30.300*</ins> | <ins>*12.579*</ins> |
| Accelerate | 5.788 | 4.344 | 1.245 |
core/aio/archer_prio_aio_handle.cpp: 2 changes (1 addition, 1 deletion)
@@ -90,7 +90,7 @@ std::int64_t ArcherPrioAioHandle::Write(const std::string& filename,

auto mem_type = IsDevicePointer(buffer) ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost;
cudaHostAlloc(&write_buffer, num_bytes_aligned, cudaHostAllocDefault);
-cudaMemcpy(write_buffer, buffer, num_bytes, mem_type);
+CudaMemcpy(write_buffer, buffer, num_bytes, mem_type);
auto callbacks =
aio_context_.PrepIocbs(false, write_buffer, fd, kBlockSize, offset, num_bytes_aligned);
auto io_request = std::make_shared<struct AioRequest>();
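This hunk, like the matching ones in model_topology.cpp and archer_prefetch_handle.cpp below, swaps a raw cudaMemcpy call for a CudaMemcpy wrapper, presumably the one declared in the newly included utils/cuda_utils.h. The wrapper's definition is not part of this diff; the sketch below is only an assumption about its intent, namely an error-checked forwarding call:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical sketch of the CudaMemcpy wrapper from utils/cuda_utils.h
// (its real definition is not shown in this diff): forward to cudaMemcpy
// and fail loudly instead of letting a copy error pass silently.
inline void CudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
    cudaError_t err = cudaMemcpy(dst, src, count, kind);
    if (err != cudaSuccess) {
        std::fprintf(stderr, "CudaMemcpy failed: %s\n", cudaGetErrorString(err));
        std::abort();
    }
}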
core/model/model_topology.cpp: 4 changes (2 additions, 2 deletions)
@@ -114,9 +114,9 @@ void Node::SetDevice(const torch::Device& target_device,

auto start_time = MCIROSECONDS_SINCE_EPOCH;
if (stream == nullptr) {
-cudaMemcpy(device_memory_ptr, host_memory_ptr, byte_size, cudaMemcpyHostToDevice);
+CudaMemcpy(device_memory_ptr, host_memory_ptr, byte_size, cudaMemcpyHostToDevice);
} else {
-cudaMemcpyAsync(
+CudaMemcpyAsync(
device_memory_ptr, host_memory_ptr, byte_size, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
core/parallel/expert_dispatcher.cpp: 4 changes (4 additions, 0 deletions)
@@ -332,6 +332,7 @@ void ExpertDispatcher::GPUExecFunc(int gpu_id)

auto* expert_module = args.expert_node->module;
int expert_type = expert_type_;
+cudaStreamSynchronize(0); // make sure the input is ready

try {
switch (expert_type) {
@@ -369,6 +370,8 @@ void ExpertDispatcher::GPUExecFunc(int gpu_id)
ss << "]";
ARCHER_LOG_FATAL("ExpertDispatcher::GPUExecFunc", ss.str(), "expert_type", expert_type, e.what());
}

+stream.synchronize();
}

(void)std::async(std::launch::async,
@@ -414,6 +417,7 @@ void ExpertDispatcher::OutputFunc(ExecArgs args, torch::Tensor output, int gpu_i
gpu_id,
args.hit, ")");
}
+stream.synchronize();
pending_.fetch_sub(1);
}

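The synchronize calls added above implement the last two commit bullets: cudaStreamSynchronize(0) blocks until work queued on the default stream (where the expert's input is staged) has finished, and stream.synchronize() after the forward pass, and again in OutputFunc, keeps the output from being consumed before the compute stream has produced it. A reduced sketch of that ordering, assuming one side stream per GPU worker as in GPUExecFunc:

#include <cuda_runtime.h>
#include <functional>

// Hypothetical reduction of the ordering in ExpertDispatcher: wait for
// input staged on the default stream, run the expert on a side stream,
// then wait for that stream before handing the output onward.
void RunExpertOrdered(cudaStream_t compute_stream,
                      const std::function<void(cudaStream_t)>& launch_expert) {
    cudaStreamSynchronize(0);               // input copy on default stream is done
    launch_expert(compute_stream);          // expert forward pass
    cudaStreamSynchronize(compute_stream);  // output is complete before hand-off
}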
core/prefetch/archer_prefetch_handle.cpp: 4 changes (2 additions, 2 deletions)
@@ -11,7 +11,7 @@
#include "common/time.h"
#include "memory/memory_pool.h"
#include "task_scheduler.h"

#include "utils/cuda_utils.h"
#include "utils/archer_logger.h"

ArcherPrefetchHandle::ArcherPrefetchHandle(const std::string& prefix,
@@ -335,7 +335,7 @@ void ArcherPrefetchHandle::SetTensorDevice(torch::Tensor& tensor, torch::Device
cudaSetDevice(device.index());
cudaMalloc(&device_ptr, byte_size);

-cudaMemcpy(device_ptr, tensor.data_ptr(), byte_size, cudaMemcpyDeviceToDevice);
+CudaMemcpy(device_ptr, tensor.data_ptr(), byte_size, cudaMemcpyDeviceToDevice);

auto new_tensor = torch::from_blob(
device_ptr,
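The hunk above moves a tensor between devices by hand: cudaMalloc on the target, a device-to-device copy, then torch::from_blob around the raw pointer. A self-contained sketch of that pattern, illustrative only (it uses raw cudaMemcpy where the diff uses the CudaMemcpy wrapper) and with a deleter added so the buffer is freed with the tensor:

#include <torch/torch.h>
#include <cuda_runtime.h>

// Hypothetical sketch of the SetTensorDevice pattern: allocate on the
// target GPU, copy device-to-device, and wrap the raw pointer as a
// tensor. The deleter releases the cudaMalloc'd buffer with the tensor.
torch::Tensor CopyToDevice(const torch::Tensor& src, torch::Device device) {
    void* device_ptr = nullptr;
    const size_t byte_size = src.nbytes();
    cudaSetDevice(device.index());
    cudaMalloc(&device_ptr, byte_size);
    cudaMemcpy(device_ptr, src.data_ptr(), byte_size, cudaMemcpyDeviceToDevice);
    return torch::from_blob(
        device_ptr,
        src.sizes(),
        [](void* p) { cudaFree(p); },
        src.options().device(device));
}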
core/trace/archer_tensor_tracer.cpp: 140 changes (0 additions, 140 deletions)

This file was deleted.

core/trace/archer_tensor_tracer.h: 30 changes (0 additions, 30 deletions)

This file was deleted.

(Diffs for the remaining 13 of the 20 changed files are not shown.)
