Xly/deepseek #34

Open · wants to merge 7 commits into base: dev
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
@@ -51,4 +51,4 @@ jobs:
pip install build

- name: Build package
run: BUILD_OPS=1 python -m build
run: BUILD_OPS=1 python -m build
6 changes: 3 additions & 3 deletions .github/workflows/publish-test.yml
@@ -22,7 +22,7 @@ jobs:
VERSION_HASH=$(date +"%Y%m%d%H%M%S")
echo "Generated version hash: $VERSION_HASH"
echo $VERSION_HASH > version.txt

- name: Upload version number as artifact
uses: actions/upload-artifact@v2
with:
@@ -84,7 +84,7 @@ jobs:
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV


# only build source when the python version is 3.8
- name: Build Source
@@ -102,4 +102,4 @@ jobs:
uses: pypa/gh-action-pypi-publish@release/v1.8
with:
repository-url: https://test.pypi.org/legacy/
skip-existing: true
skip-existing: true
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
@@ -88,7 +88,7 @@ jobs:
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV


# only build source when the python version is 3.8
- name: Build Source
@@ -115,4 +115,4 @@ jobs:
uses: pypa/gh-action-pypi-publish@release/v1.8
with:
# repository-url: https://test.pypi.org/legacy/
skip-existing: true
skip-existing: true
2 changes: 1 addition & 1 deletion .github/workflows/scripts/create-release.js
@@ -17,4 +17,4 @@ module.exports = async (github, context, core) => {
} catch (error) {
core.setFailed(error.message);
}
}
}
2 changes: 1 addition & 1 deletion .github/workflows/scripts/cuda-install.sh
@@ -20,4 +20,4 @@ nvcc --version
# Log gcc, g++, c++ versions
gcc --version
g++ --version
c++ --version
c++ --version
4 changes: 2 additions & 2 deletions .github/workflows/scripts/free-disk-space.sh
@@ -20,7 +20,7 @@
# Total space: 85GB
# Allocated: 67 GB
# Free: 17 GB
# This script frees up 28 GB of disk space by deleting unneeded packages and
# This script frees up 28 GB of disk space by deleting unneeded packages and
# large directories.
# The Flink end to end tests download and generate more than 17 GB of files,
# causing unpredictable behavior and build failures.
@@ -45,4 +45,4 @@ echo "Removing large directories"
# deleting 15GB
rm -rf /usr/share/dotnet/
rm -rf /opt/hostedtoolcache/
df -h
df -h
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,6 +1,8 @@
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode

test*

### VisualStudioCode ###
.vscode/*
# !.vscode/settings.json
52 changes: 52 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,52 @@
repos:
  - repo: meta
    hooks:
      - id: check-hooks-apply
      - id: check-useless-excludes

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: check-case-conflict
      # - id: check-json
      # - id: check-symlinks
      - id: check-yaml
      - id: destroyed-symlinks
      - id: end-of-file-fixer
      - id: fix-byte-order-marker
      - id: fix-encoding-pragma
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
      - id: requirements-txt-fixer
      - id: trailing-whitespace

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.6.9
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
        # args: [--check]

  - repo: https://gitlab.com/daverona/pre-commit/cpp
    rev: 0.8.0
    hooks:
      - id: clang-format # C/C++ formatter based on a style guide: LLVM, Google, Chromium, Mozilla, or WebKit
        args: ['-style=file']
      # - id: cpplint
      # - id: cppcheck # exclude some checks

  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell
        args: [
          # Do not check files that are automatically generated
          '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
          '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
          '--ignore-words-list=youn,unsupport,noe,ccompiler', # Words used in error messages that need rewording
          --check-filenames,
          --check-hidden
        ]
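
Not part of the diff: a minimal sketch of how these hooks would typically be run locally once the config above lands (assumes the `pre-commit` package from PyPI is available):

```bash
pip install pre-commit        # install the hook runner
pre-commit install            # register the hooks with git
pre-commit run --all-files    # run every configured hook against the whole repo
```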
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,2 +1,2 @@
recursive-include core *.cpp *.h *.cc
recursive-include op_builder *.py
recursive-include op_builder *.py
10 changes: 5 additions & 5 deletions README.md
@@ -7,7 +7,7 @@ MoE-Infinity is cost-effective yet fast:
- Offloading MoE's experts to host memory, allowing memory-constrained GPUs to serve MoE models.
- Minimizing the expert offloading overheads through several novel techniques: expert activation tracing, activation-aware expert prefetching, and activation-aware expert caching.
- Supporting LLM acceleration techniques (such as [FlashAttention](https://github.com/Dao-AILab/flash-attention)).
- Supporting multi-GPU environments with numerous OS-level performance optimizations.
- Supporting multi-GPU environments with numerous OS-level performance optimizations.
- Achieving SOTA latency and throughput performance when serving MoEs in a resource-constrained GPU environment (in comparison with HuggingFace [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed](https://github.com/microsoft/DeepSpeed), [Mixtral-Offloading](https://github.com/dvmazur/mixtral-offloading), and [Ollama/LLama.cpp](https://github.com/ollama/ollama)).

MoE-Infinity is easy-to-use:
@@ -41,7 +41,7 @@ Lower per-token-latency is preferable.
| <ins>MoE-Infinity</ins> | <ins>*0.230*</ins> | <ins>*0.239*</ins> | <ins>*0.895*</ins> |
| Accelerate | 1.043 | 3.071 | 6.633 |
|DeepSpeed | 4.578 | 8.381 | 2.486 |
|Mixtral Offloading| X | X | 1.752 |
|Mixtral Offloading| X | X | 1.752 |
|Ollama | X | X | 0.903 |


@@ -53,7 +53,7 @@ Higher throughput is preferable.
| <ins>MoE-Infinity</ins> | <ins>*69.105*</ins> | <ins>*30.300*</ins> | <ins>*12.579*</ins> |
| Accelerate | 5.788 | 4.344 | 1.245 |
|DeepSpeed | 7.416 | 4.334 | 7.727 |
|Mixtral Offloading| X | X | 7.684 |
|Mixtral Offloading| X | X | 7.684 |
|Ollama | X | X | 1.107 |

> The Mixtral Offloading experiment was carried out with a batch size of 16, as utilizing a batch size of 32 would result in Out of Memory errors on the GPU.
@@ -145,14 +145,14 @@ CUDA_VISIBLE_DEVICES=0,1 python script.py
We provide a simple example to run inference on a Huggingface LLM model. The script will download the model checkpoint and run inference on the specified input text. The output will be printed to the console.

```bash
CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir <your local path on SSD>
CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir <your local path on SSD>
```

## Release Plan

We plan to release two new features in the following months:

* We currently support PyTorch as the default inference engine, and we are in the process of supporting vLLM as another inference runtime, which includes the support of KV cache offloading.
* We currently support PyTorch as the default inference engine, and we are in the process of supporting vLLM as another inference runtime, which includes the support of KV cache offloading.
* Supporting expert parallelism for distributed MoE inference.
* More (We welcome contributors to join us!)

4 changes: 2 additions & 2 deletions RELEASE.md
@@ -32,7 +32,7 @@ For developers who prefer to manually build and publish their package to PyPI, t
2. Install the required dependencies to build the package:
```bash
pip install -r requirements.txt
pip install build
pip install build
```
3. Build the source distribution and wheel for the package using:
```bash
@@ -46,4 +46,4 @@ For developers who prefer to manually build and publish their package to PyPI, t
Ensure that you have the necessary credentials configured for `twine` to authenticate to PyPI.


To build the package wheel for multiple Python versions, you should execute the build process individually for each version by specifying the corresponding Python interpreter.
To build the package wheel for multiple Python versions, you should execute the build process individually for each version by specifying the corresponding Python interpreter.
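
For illustration only (the interpreter versions below are assumptions, not versions the project documents), repeating the build per interpreter would look like:

```bash
# One build invocation per Python version; each run produces a version-specific wheel.
python3.8 -m build
python3.9 -m build
python3.10 -m build
```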
2 changes: 0 additions & 2 deletions core/aio/archer_aio_thread.cpp
@@ -48,7 +48,6 @@ void ArcherAioThread::Wait()

void ArcherAioThread::Run()
{

while (is_running_) {
std::function<void()> callback;
{
@@ -60,5 +59,4 @@ void ArcherAioThread::Run()
callback();
pending_callbacks_.fetch_sub(1);
}

}
11 changes: 6 additions & 5 deletions core/aio/archer_aio_utils.cpp
@@ -4,10 +4,10 @@
// TorchMoE Team

#include "archer_aio_utils.h"
#include <future>
#include "utils/archer_logger.h"
#include <string.h>
#include <cmath>
#include <future>
#include "utils/archer_logger.h"

const size_t kBlockSize = 1 * 1024 * 1024;
const size_t kQueueDepth =
@@ -86,7 +86,7 @@ int ArcherWriteFileBatch(const int fd,
const auto ret = future.get();
if (ret < 0) {
ARCHER_LOG_FATAL(
"Failed to write file: ", fd,", errno: ", errno,", msg: ", strerror(errno));
"Failed to write file: ", fd, ", errno: ", errno, ", msg: ", strerror(errno));
return -1;
}
}
@@ -98,7 +98,8 @@ int ArcherReadFile(int fd, void* buffer, const size_t num_bytes, const size_t of
{
const auto ret = pread(fd, buffer, num_bytes, offset);
if (ret < 0) {
ARCHER_LOG_FATAL("Failed to read file: ", fd,", errno: ", errno,", msg: ", strerror(errno));
ARCHER_LOG_FATAL(
"Failed to read file: ", fd, ", errno: ", errno, ", msg: ", strerror(errno));
return -1;
}

@@ -110,7 +111,7 @@ int ArcherWriteFile(int fd, const void* buffer, size_t num_bytes, size_t offset)
const auto ret = pwrite(fd, buffer, num_bytes, offset);
if (ret < 0) {
ARCHER_LOG_FATAL(
"Failed to write file: ", fd,", errno: ", errno,", msg: ", strerror(errno));
"Failed to write file: ", fd, ", errno: ", errno, ", msg: ", strerror(errno));
return -1;
}

3 changes: 1 addition & 2 deletions core/aio/archer_prio_aio_handle.cpp
@@ -108,8 +108,7 @@ std::int64_t ArcherPrioAioHandle::Write(const std::string& filename,
return num_bytes_aligned;
}

ArcherPrioAioContext::ArcherPrioAioContext(const int block_size)
: block_size_(block_size)
ArcherPrioAioContext::ArcherPrioAioContext(const int block_size) : block_size_(block_size)
{
thread_pool_ = std::make_unique<ArcherAioThreadPool>(1); // only one SSD device
thread_pool_->Start();
4 changes: 2 additions & 2 deletions core/aio/archer_tensor_handle.cpp
@@ -32,7 +32,7 @@ ArcherTensorHandle::ArcherTensorHandle(const std::string& prefix)
ARCHER_LOG_FATAL("Invalid prefix: ", prefix_, " is not a directory");
}
if (stat(prefix_.c_str(), &st) == -1) {
ARCHER_LOG_WARN("Invalid prefix: ", prefix_," does not exist, creating");
ARCHER_LOG_WARN("Invalid prefix: ", prefix_, " does not exist, creating");
mkdir(prefix_.c_str(), 0777);
}

@@ -44,7 +44,7 @@ ArcherTensorHandle::ArcherTensorHandle(const std::string& prefix)
kTensorIndex->Deserialize(ckpt_index_path.c_str());
is_serialized_ = true;
} else {
ARCHER_LOG_INFO("Index file", ckpt_index_path," does not exist, creating");
ARCHER_LOG_INFO("Index file", ckpt_index_path, " does not exist, creating");
}
ARCHER_LOG_INFO("Index file size ", kTensorIndex->size());
}
1 change: 1 addition & 0 deletions core/aio/archer_tensor_index.h
@@ -47,6 +47,7 @@ class ArcherTensorIndex : public std::unordered_map<uint32_t, TensorStorageMeta>

ArcherTensorIndex() = default;
~ArcherTensorIndex() = default;

private:
};

14 changes: 5 additions & 9 deletions core/memory/memory_pool.h
@@ -7,12 +7,12 @@
#include "common/pytorch.h"
#include "utils/noncopyable.h"

#include "utils/archer_logger.h"
#include "host_caching_allocator.h"
#include <c10/cuda/CUDACachingAllocator.h>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include "host_caching_allocator.h"
#include "utils/archer_logger.h"

std::size_t GetTotalSystemMemory();

@@ -41,9 +41,7 @@ class HostMemoryPool : public noncopyable {
{
auto allocator = c10::HostCachingAllocator::get();
for (auto& [key, data_ptr] : allocated_id_) {
if (data_ptr != nullptr) {
allocator->free(data_ptr);
}
if (data_ptr != nullptr) { allocator->free(data_ptr); }
}
allocated_id_.clear();
}
@@ -73,11 +71,9 @@ class DeviceMemoryPool : public noncopyable {
virtual ~DeviceMemoryPool()
{
auto allocator = c10::cuda::CUDACachingAllocator::get();
for(auto &allocated_id : allocated_id_){
for (auto& allocated_id : allocated_id_) {
for (auto& [key, data_ptr] : allocated_id) {
if (data_ptr != nullptr) {
allocator->raw_deallocate(data_ptr);
}
if (data_ptr != nullptr) { allocator->raw_deallocate(data_ptr); }
}
}
allocated_id_.clear();
4 changes: 2 additions & 2 deletions core/memory/stream_pool.cpp
@@ -6,5 +6,5 @@
#include "stream_pool.h"

// Stream0 is used for H2D, Stream1 is used for Kernel, Stream2 is used for D2H
// CUDAStreamPool* kCUDAStreamPool = CUDAStreamPool::GetInstance();
std::unique_ptr<CUDAStreamPool> kCUDAStreamPool = std::make_unique<CUDAStreamPool>();
// TorchStreamPool* kTorchStreamPool = TorchStreamPool::GetInstance();
std::unique_ptr<TorchStreamPool> kTorchStreamPool = std::make_unique<TorchStreamPool>();
16 changes: 8 additions & 8 deletions core/memory/stream_pool.h
@@ -10,14 +10,14 @@
#include "utils/cuda_utils.h"
#include "utils/noncopyable.h"

class CUDAStreamPool : public noncopyable {
class TorchStreamPool : public noncopyable {
public:
std::vector<c10::cuda::CUDAStream>& operator()(const int device_id)
{
return cuda_streams_[device_id];
}

CUDAStreamPool()
TorchStreamPool()
{
int num_devices = GetDeviceCount();
for (int i = 0; i < num_devices; ++i) {
@@ -28,14 +28,14 @@ class CUDAStreamPool : public noncopyable {
cuda_streams_.push_back(std::move(streams));
}
}
virtual ~CUDAStreamPool() = default;
virtual ~TorchStreamPool() = default;

private:
std::vector<std::vector<c10::cuda::CUDAStream>> cuda_streams_;
};

extern std::unique_ptr<CUDAStreamPool> kCUDAStreamPool;
#define CUDA_STREAM_VIEW(device_id, stream_id) (*kCUDAStreamPool)(device_id)[stream_id]
#define CUDA_STREAM_H2D_VIEW(device_id) CUDA_STREAM_VIEW(device_id, 0)
#define CUDA_STREAM_D2H_VIEW(device_id) CUDA_STREAM_VIEW(device_id, 1)
#define CUDA_STREAM_COMPUTE_VIEW(device_id) CUDA_STREAM_VIEW(device_id, 2)
extern std::unique_ptr<TorchStreamPool> kTorchStreamPool;
#define TORCH_STREAM_VIEW(device_id, stream_id) (*kTorchStreamPool)(device_id)[stream_id]
#define TORCH_STREAM_H2D_VIEW(device_id) TORCH_STREAM_VIEW(device_id, 0)
#define TORCH_STREAM_D2H_VIEW(device_id) TORCH_STREAM_VIEW(device_id, 1)
#define TORCH_STREAM_COMPUTE_VIEW(device_id) TORCH_STREAM_VIEW(device_id, 2)