Xly/deepseek #34

Open · wants to merge 7 commits into base: dev
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
@@ -51,4 +51,4 @@ jobs:
pip install build

- name: Build package
run: BUILD_OPS=1 python -m build
run: BUILD_OPS=1 python -m build
6 changes: 3 additions & 3 deletions .github/workflows/publish-test.yml
@@ -22,7 +22,7 @@ jobs:
VERSION_HASH=$(date +"%Y%m%d%H%M%S")
echo "Generated version hash: $VERSION_HASH"
echo $VERSION_HASH > version.txt

- name: Upload version number as artifact
uses: actions/upload-artifact@v2
with:
@@ -84,7 +84,7 @@ jobs:
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV


# only build source when the python version is 3.8
- name: Build Source
@@ -102,4 +102,4 @@ jobs:
uses: pypa/gh-action-pypi-publish@release/v1.8
with:
repository-url: https://test.pypi.org/legacy/
skip-existing: true
skip-existing: true
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
@@ -88,7 +88,7 @@ jobs:
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV


# only build source when the python version is 3.8
- name: Build Source
@@ -115,4 +115,4 @@ jobs:
uses: pypa/gh-action-pypi-publish@release/v1.8
with:
# repository-url: https://test.pypi.org/legacy/
skip-existing: true
skip-existing: true
2 changes: 1 addition & 1 deletion .github/workflows/scripts/create-release.js
@@ -17,4 +17,4 @@ module.exports = async (github, context, core) => {
} catch (error) {
core.setFailed(error.message);
}
}
}
2 changes: 1 addition & 1 deletion .github/workflows/scripts/cuda-install.sh
@@ -20,4 +20,4 @@ nvcc --version
# Log gcc, g++, c++ versions
gcc --version
g++ --version
c++ --version
c++ --version
4 changes: 2 additions & 2 deletions .github/workflows/scripts/free-disk-space.sh
@@ -20,7 +20,7 @@
# Total space: 85GB
# Allocated: 67 GB
# Free: 17 GB
# This script frees up 28 GB of disk space by deleting unneeded packages and
# This script frees up 28 GB of disk space by deleting unneeded packages and
# large directories.
# The Flink end to end tests download and generate more than 17 GB of files,
# causing unpredictable behavior and build failures.
@@ -45,4 +45,4 @@ echo "Removing large directories"
# deleting 15GB
rm -rf /usr/share/dotnet/
rm -rf /opt/hostedtoolcache/
df -h
df -h
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,6 +1,8 @@
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode

test*

### VisualStudioCode ###
.vscode/*
# !.vscode/settings.json
52 changes: 52 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,52 @@
repos:
  - repo: meta
    hooks:
      - id: check-hooks-apply
      - id: check-useless-excludes

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: check-case-conflict
      # - id: check-json
      # - id: check-symlinks
      - id: check-yaml
      - id: destroyed-symlinks
      - id: end-of-file-fixer
      - id: fix-byte-order-marker
      - id: fix-encoding-pragma
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
      - id: requirements-txt-fixer
      - id: trailing-whitespace

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.6.9
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
        # args: [--check]

  - repo: https://gitlab.com/daverona/pre-commit/cpp
    rev: 0.8.0
    hooks:
      - id: clang-format # C/C++ formatter based on a style guide: LLVM, Google, Chromium, Mozilla, or WebKit
        args: ['-style=file']
      # - id: cpplint
      # - id: cppcheck # exclude some checks

  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell
        args: [
          # Do not check files that are automatically generated
          '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
          '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
          '--ignore-words-list=youn,unsupport,noe,ccompiler', # Words used in error messages that need rewording
          --check-filenames,
          --check-hidden
        ]
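
Not part of the diff: a minimal sketch of how these hooks would typically be run locally once the config above lands (assumes the `pre-commit` package from PyPI is available):

```bash
pip install pre-commit        # install the hook runner
pre-commit install            # register the hooks with git
pre-commit run --all-files    # run every configured hook against the whole repo
```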
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,2 +1,2 @@
recursive-include core *.cpp *.h *.cc
recursive-include op_builder *.py
recursive-include op_builder *.py
10 changes: 5 additions & 5 deletions README.md
@@ -7,7 +7,7 @@ MoE-Infinity is cost-effective yet fast:
- Offloading MoE's experts to host memory, allowing memory-constrained GPUs to serve MoE models.
- Minimizing the expert offloading overheads through several novel techniques: expert activation tracing, activation-aware expert prefetching, and activation-aware expert caching.
- Supporting LLM acceleration techniques (such as [FlashAttention](https://github.com/Dao-AILab/flash-attention)).
- Supporting multi-GPU environments with numerous OS-level performance optimizations.
- Supporting multi-GPU environments with numerous OS-level performance optimizations.
- Achieving SOTA latency and throughput performance when serving MoEs in a resource-constrained GPU environment (in comparison with HuggingFace [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed](https://github.com/microsoft/DeepSpeed), [Mixtral-Offloading](https://github.com/dvmazur/mixtral-offloading), and [Ollama/LLama.cpp](https://github.com/ollama/ollama)).

MoE-Infinity is easy-to-use:
@@ -41,7 +41,7 @@ Lower per-token-latency is preferable.
| <ins>MoE-Infinity</ins> | <ins>*0.230*</ins> | <ins>*0.239*</ins> | <ins>*0.895*</ins> |
| Accelerate | 1.043 | 3.071 | 6.633 |
|DeepSpeed | 4.578 | 8.381 | 2.486 |
|Mixtral Offloading| X | X | 1.752 |
|Mixtral Offloading| X | X | 1.752 |
|Ollama | X | X | 0.903 |


@@ -53,7 +53,7 @@ Higher throughput is preferable.
| <ins>MoE-Infinity</ins> | <ins>*69.105*</ins> | <ins>*30.300*</ins> | <ins>*12.579*</ins> |
| Accelerate | 5.788 | 4.344 | 1.245 |
|DeepSpeed | 7.416 | 4.334 | 7.727 |
|Mixtral Offloading| X | X | 7.684 |
|Mixtral Offloading| X | X | 7.684 |
|Ollama | X | X | 1.107 |

> The Mixtral Offloading experiment was carried out with a batch size of 16, as utilizing a batch size of 32 would result in Out of Memory errors on the GPU.
@@ -145,14 +145,14 @@ CUDA_VISIBLE_DEVICES=0,1 python script.py
We provide a simple example to run inference on a Huggingface LLM model. The script will download the model checkpoint and run inference on the specified input text. The output will be printed to the console.

```bash
CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir <your local path on SSD>
CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir <your local path on SSD>
```

## Release Plan

We plan to release two new features in the following months:

* We currently support PyTorch as the default inference engine, and we are in the process of supporting vLLM as another inference runtime, which includes the support of KV cache offloading.
* We currently support PyTorch as the default inference engine, and we are in the process of supporting vLLM as another inference runtime, which includes the support of KV cache offloading.
* Supporting expert parallelism for distributed MoE inference.
* More (We welcome contributors to join us!)

4 changes: 2 additions & 2 deletions RELEASE.md
@@ -32,7 +32,7 @@ For developers who prefer to manually build and publish their package to PyPI, t
2. Install the required dependencies to build the package:
```bash
pip install -r requirements.txt
pip install build
pip install build
```
3. Build the source distribution and wheel for the package using:
```bash
@@ -46,4 +46,4 @@ For developers who prefer to manually build and publish their package to PyPI, t
Ensure that you have the necessary credentials configured for `twine` to authenticate to PyPI.


To build the package wheel for multiple Python versions, you should execute the build process individually for each version by specifying the corresponding Python interpreter.
To build the package wheel for multiple Python versions, you should execute the build process individually for each version by specifying the corresponding Python interpreter.
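
For illustration only (the interpreter versions below are assumptions, not versions the project documents), repeating the build per interpreter would look like:

```bash
# One build invocation per Python version; each run produces a version-specific wheel.
python3.8 -m build
python3.9 -m build
python3.10 -m build
```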
2 changes: 0 additions & 2 deletions core/aio/archer_aio_thread.cpp
@@ -48,7 +48,6 @@ void ArcherAioThread::Wait()

void ArcherAioThread::Run()
{

while (is_running_) {
std::function<void()> callback;
{
@@ -60,5 +59,4 @@ void ArcherAioThread::Run()
callback();
pending_callbacks_.fetch_sub(1);
}

}
11 changes: 6 additions & 5 deletions core/aio/archer_aio_utils.cpp
@@ -4,10 +4,10 @@
// TorchMoE Team

#include "archer_aio_utils.h"
#include <future>
#include "utils/archer_logger.h"
#include <string.h>
#include <cmath>
#include <future>
#include "utils/archer_logger.h"

const size_t kBlockSize = 1 * 1024 * 1024;
const size_t kQueueDepth =
@@ -86,7 +86,7 @@ int ArcherWriteFileBatch(const int fd,
const auto ret = future.get();
if (ret < 0) {
ARCHER_LOG_FATAL(
"Failed to write file: ", fd,", errno: ", errno,", msg: ", strerror(errno));
"Failed to write file: ", fd, ", errno: ", errno, ", msg: ", strerror(errno));
return -1;
}
}
@@ -98,7 +98,8 @@ int ArcherReadFile(int fd, void* buffer, const size_t num_bytes, const size_t of
{
const auto ret = pread(fd, buffer, num_bytes, offset);
if (ret < 0) {
ARCHER_LOG_FATAL("Failed to read file: ", fd,", errno: ", errno,", msg: ", strerror(errno));
ARCHER_LOG_FATAL(
"Failed to read file: ", fd, ", errno: ", errno, ", msg: ", strerror(errno));
return -1;
}

@@ -110,7 +111,7 @@ int ArcherWriteFile(int fd, const void* buffer, size_t num_bytes, size_t offset)
const auto ret = pwrite(fd, buffer, num_bytes, offset);
if (ret < 0) {
ARCHER_LOG_FATAL(
"Failed to write file: ", fd,", errno: ", errno,", msg: ", strerror(errno));
"Failed to write file: ", fd, ", errno: ", errno, ", msg: ", strerror(errno));
return -1;
}

3 changes: 1 addition & 2 deletions core/aio/archer_prio_aio_handle.cpp
@@ -108,8 +108,7 @@ std::int64_t ArcherPrioAioHandle::Write(const std::string& filename,
return num_bytes_aligned;
}

ArcherPrioAioContext::ArcherPrioAioContext(const int block_size)
: block_size_(block_size)
ArcherPrioAioContext::ArcherPrioAioContext(const int block_size) : block_size_(block_size)
{
thread_pool_ = std::make_unique<ArcherAioThreadPool>(1); // only one SSD device
thread_pool_->Start();
4 changes: 2 additions & 2 deletions core/aio/archer_tensor_handle.cpp
@@ -32,7 +32,7 @@ ArcherTensorHandle::ArcherTensorHandle(const std::string& prefix)
ARCHER_LOG_FATAL("Invalid prefix: ", prefix_, " is not a directory");
}
if (stat(prefix_.c_str(), &st) == -1) {
ARCHER_LOG_WARN("Invalid prefix: ", prefix_," does not exist, creating");
ARCHER_LOG_WARN("Invalid prefix: ", prefix_, " does not exist, creating");
mkdir(prefix_.c_str(), 0777);
}

@@ -44,7 +44,7 @@ ArcherTensorHandle::ArcherTensorHandle(const std::string& prefix)
kTensorIndex->Deserialize(ckpt_index_path.c_str());
is_serialized_ = true;
} else {
ARCHER_LOG_INFO("Index file", ckpt_index_path," does not exist, creating");
ARCHER_LOG_INFO("Index file", ckpt_index_path, " does not exist, creating");
}
ARCHER_LOG_INFO("Index file size ", kTensorIndex->size());
}
1 change: 1 addition & 0 deletions core/aio/archer_tensor_index.h
@@ -47,6 +47,7 @@ class ArcherTensorIndex : public std::unordered_map<uint32_t, TensorStorageMeta>

ArcherTensorIndex() = default;
~ArcherTensorIndex() = default;

private:
};

14 changes: 5 additions & 9 deletions core/memory/memory_pool.h
@@ -7,12 +7,12 @@
#include "common/pytorch.h"
#include "utils/noncopyable.h"

#include "utils/archer_logger.h"
#include "host_caching_allocator.h"
#include <c10/cuda/CUDACachingAllocator.h>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include "host_caching_allocator.h"
#include "utils/archer_logger.h"

std::size_t GetTotalSystemMemory();

@@ -41,9 +41,7 @@ class HostMemoryPool : public noncopyable {
{
auto allocator = c10::HostCachingAllocator::get();
for (auto& [key, data_ptr] : allocated_id_) {
if (data_ptr != nullptr) {
allocator->free(data_ptr);
}
if (data_ptr != nullptr) { allocator->free(data_ptr); }
}
allocated_id_.clear();
}
@@ -73,11 +71,9 @@ class DeviceMemoryPool : public noncopyable {
virtual ~DeviceMemoryPool()
{
auto allocator = c10::cuda::CUDACachingAllocator::get();
for(auto &allocated_id : allocated_id_){
for (auto& allocated_id : allocated_id_) {
for (auto& [key, data_ptr] : allocated_id) {
if (data_ptr != nullptr) {
allocator->raw_deallocate(data_ptr);
}
if (data_ptr != nullptr) { allocator->raw_deallocate(data_ptr); }
}
}
allocated_id_.clear();
4 changes: 2 additions & 2 deletions core/memory/stream_pool.cpp
@@ -6,5 +6,5 @@
#include "stream_pool.h"

// Stream0 is used for H2D, Stream1 is used for Kernel, Stream2 is used for D2H
// CUDAStreamPool* kCUDAStreamPool = CUDAStreamPool::GetInstance();
std::unique_ptr<CUDAStreamPool> kCUDAStreamPool = std::make_unique<CUDAStreamPool>();
// TorchStreamPool* kTorchStreamPool = TorchStreamPool::GetInstance();
std::unique_ptr<TorchStreamPool> kTorchStreamPool = std::make_unique<TorchStreamPool>();
16 changes: 8 additions & 8 deletions core/memory/stream_pool.h
@@ -10,14 +10,14 @@
#include "utils/cuda_utils.h"
#include "utils/noncopyable.h"

class CUDAStreamPool : public noncopyable {
class TorchStreamPool : public noncopyable {
public:
std::vector<c10::cuda::CUDAStream>& operator()(const int device_id)
{
return cuda_streams_[device_id];
}

CUDAStreamPool()
TorchStreamPool()
{
int num_devices = GetDeviceCount();
for (int i = 0; i < num_devices; ++i) {
@@ -28,14 +28,14 @@ class CUDAStreamPool : public noncopyable {
cuda_streams_.push_back(std::move(streams));
}
}
virtual ~CUDAStreamPool() = default;
virtual ~TorchStreamPool() = default;

private:
std::vector<std::vector<c10::cuda::CUDAStream>> cuda_streams_;
};

extern std::unique_ptr<CUDAStreamPool> kCUDAStreamPool;
#define CUDA_STREAM_VIEW(device_id, stream_id) (*kCUDAStreamPool)(device_id)[stream_id]
#define CUDA_STREAM_H2D_VIEW(device_id) CUDA_STREAM_VIEW(device_id, 0)
#define CUDA_STREAM_D2H_VIEW(device_id) CUDA_STREAM_VIEW(device_id, 1)
#define CUDA_STREAM_COMPUTE_VIEW(device_id) CUDA_STREAM_VIEW(device_id, 2)
extern std::unique_ptr<TorchStreamPool> kTorchStreamPool;
#define TORCH_STREAM_VIEW(device_id, stream_id) (*kTorchStreamPool)(device_id)[stream_id]
#define TORCH_STREAM_H2D_VIEW(device_id) TORCH_STREAM_VIEW(device_id, 0)
#define TORCH_STREAM_D2H_VIEW(device_id) TORCH_STREAM_VIEW(device_id, 1)
#define TORCH_STREAM_COMPUTE_VIEW(device_id) TORCH_STREAM_VIEW(device_id, 2)