Release dev to main (#6)

* update table format * improve table clarity * init code commit * doc: add flashattention installation guide and change toc * feat: remove libaio dependency * remove spdlog dependency * misc: remove unused code and dependencies * misc: remove commented-out code and unused imports * fix: cuda oom due to safe tensors open * remove gcc-12 requirement * gptq disable exllama * fix: key error in offload set * add forward and call (#7) * add forward and call * fix a bug * feat: support grok-1 model --------- Co-authored-by: xly <[email protected]> Co-authored-by: Zhan Lu <[email protected]> Co-authored-by: lausannel <[email protected]> Co-authored-by: Yao Fu <[email protected]>
TorchMoE · Apr 9, 2024 · 87eacf7 · 87eacf7
1 parent 4509b87
commit 87eacf7
Show file tree

Hide file tree

Showing 96 changed files with 11,548 additions and 16 deletions.
diff --git a/.clang-format b/.clang-format
@@ -0,0 +1,155 @@
+---
+# Refer to the following link for an explanation of each parameter:
+#   http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+# This is deprecated
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments:  false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:            false
+  AfterControlStatement: false
+  AfterEnum:             false
+  AfterFunction:         false
+  AfterNamespace:        false
+  AfterObjCDeclaration:  false
+  AfterStruct:           false
+  AfterUnion:            false
+  AfterExternBlock:      false
+  BeforeCatch:           false
+  BeforeElse:            false
+  IndentBraces:          false
+  # Disable the splits below; otherwise they just add to the vertical length of source files!
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: WebKit
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 4
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+  - Language: TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+# Enabling comment reflow causes doxygen comments to be messed up in their formats!
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+# Be consistent with indent-width, even for people who use tab for indentation!
+TabWidth: 4
+UseTab: Never
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,27 @@
+# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
+
+### VisualStudioCode ###
+.vscode/*
+# !.vscode/settings.json
+# !.vscode/tasks.json
+# !.vscode/launch.json
+# !.vscode/extensions.json
+# !.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/LICENSE b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2024 TorchMoE
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include core *.cpp *.h *.cc
+recursive-include op_builder *.py
diff --git a/README.md b/README.md
@@ -20,56 +20,62 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
 ## Contents
 - [Performance](#performance)
 - [Installation](#installation)
-     - [Prerequisites](#prerequisites)
-     - [Install from PyPI](#install-from-pypi)
-     - [Install from Source](#install-from-source)
+    - [Prerequisites](#prerequisites)
+    - [Install from conda environment](#install-from-conda-environment)
+    - [Install from PyPI](#install-from-pypi)
+    - [Install from Source](#install-from-source)
+    - [Enable FlashAttention (Optional)](#enable-flashattention-optional)
 - [Usage and Examples](#usage-and-examples)
-     - [Sample Code of Huggingface LLM Inference](#sample-code-of-huggingface-llm-inference)
+    - [Sample Code of Huggingface LLM Inference](#sample-code-of-huggingface-llm-inference)
     - [Running Inference](#running-inference)
-- [Roadmap](#roadmap)
+- [Release Plan](#release-plan)
+- [Citation](#citation)
 
 ## Performance
 
 Single GPU A5000 (24GB Memory), per-token-latency (seconds) for generation with a mixed dataset that includes [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) datasets.
+Lower per-token-latency is preferable.
 
 |  | switch-large-128 | NLLB-MoE-54B | Mixtral-7x8b |
 | :---: | :---: | :---: | :---: |
-| *MoE-Infinity* | *0.230*	| *0.239* | *0.895* |
+| <ins>MoE-Infinity</ins> | <ins>*0.230*</ins>	| <ins>*0.239*</ins> | <ins>*0.895*</ins> |
 | Accelerate | 1.043 | 3.071 | 6.633 |
 |DeepSpeed | 4.578 | 8.381 | 2.486 |
 |Mixtral Offloading| X | X | 1.752 | 
 |Ollama | X | X | 0.903 |
 
-Single GPU A5000, throughput (token/s) for generation at batch size 32.
+
+Single GPU A5000, throughput (token/s) for generation with batch size 32.
+Higher throughput is preferable.
 
 |  | switch-large-128 | NLLB-MoE-54B | Mixtral-7x8b |
 | :---: | :---: | :---: | :---: |
-| *MoE-Infinity* | *69.105*	| *30.300* | *12.579* |
+| <ins>MoE-Infinity</ins> | <ins>*69.105*</ins>	| <ins>*30.300*</ins> | <ins>*12.579*</ins> |
 | Accelerate | 5.788 | 4.344 | 1.245 |
 |DeepSpeed | 7.416 | 4.334 | 7.727 |
 |Mixtral Offloading| X | X | 7.684 | 
 |Ollama | X | X | 1.107 |
 
 > The Mixtral Offloading experiment was carried out with a batch size of 16, as utilizing a batch size of 32 would result in Out of Memory errors on the GPU.
 
+> Ollama does not support batching for generation, so the throughput is calculated with a batch size of 1.
+
 ## Installation
 
 We recommend installing MoE-Infinity in a virtual environment. To install MoE-Infinity, you can either install it from PyPI or build it from source.
 
-### Prerequisites
-MoE-Infinity is currently only supported on Linux, Ensure the following dependencies are installed on your system:
+### Install from conda environment
 
 ```bash
-# example of installing dependencies on ubuntu
-sudo apt install build-essential curl libaio-dev libspdlog-dev
+conda env create --file environment.yml
+conda activate moe-infinity
 ```
 
-Pytorch (>=2.0), libstdcxx-ng (>=12.0) and Python (>=3.8) required for MoE-Infinity, please refer to [Pytorch](https://pytorch.org/get-started/locally/) for installation instructions.
-
 ### Install from PyPI
 
 ```bash
 pip install moe-infinity
+conda install -c conda-forge libstdcxx-ng=12 # assumes you are using conda; otherwise, install libstdcxx-ng=12 (or gcc=12) with your package manager
 ```
 
 ### Install from Source
@@ -80,6 +86,14 @@ cd MoE-Infinity
 pip install -e .
 ```
 
+### Enable FlashAttention (Optional)
+
+Install FlashAttention (>=2.5.2) for faster inference with the following command.
+```bash
+FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn
+```
+Post-installation, MoE-Infinity will automatically integrate with FlashAttention to enhance performance.
+
 ## Usage and Examples
 
 We provide a simple API for diverse setups, including single GPU, multiple GPUs, and multiple nodes. The following examples show how to use MoE-Infinity to run generation on a Huggingface LLM model.
@@ -144,4 +158,4 @@ If you use MoE-Infinity for your research, please cite our [paper](https://arxiv.
   booktitle={https://arxiv.org/abs/2401.14361},
   year={2024}
 }
-```
+```
diff --git a/core/aio/archer_aio_thread.cpp b/core/aio/archer_aio_thread.cpp
@@ -0,0 +1,64 @@
+// Copyright (c) TorchMoE.
+// SPDX-License-Identifier: Apache-2.0
+
+// TorchMoE Team
+
+#include "archer_aio_thread.h"
+
+#include "utils/archer_logger.h"
+
+/// Creates an idle worker tagged with `thread_id` and logs its creation.
+/// The worker thread itself is only launched by Start().
+ArcherAioThread::ArcherAioThread(int thread_id)
+    : thread_id_(thread_id),
+      is_running_(false)
+{
+    ARCHER_LOG_INFO("Create ArcherAioThread for thread: ", thread_id_);
+}
+
+/// Stops (and joins) the worker thread, if one is running, before the object goes away.
+ArcherAioThread::~ArcherAioThread()
+{
+    Stop();
+}
+
+/// Launches the worker thread that executes queued callbacks.
+///
+/// Calling Start() while the worker is already running is a no-op.
+void ArcherAioThread::Start()
+{
+    if (is_running_) { return; }
+
+    {
+        // Callbacks may already be queued here (Enqueue() before Start(), or entries left over
+        // from a previous Stop()). Seed the counter from the queue instead of zeroing it, so the
+        // worker's fetch_sub() for each of them lands on exactly 0 and Wait() can return.
+        std::lock_guard<std::mutex> lock(mutex_);
+        using Count = decltype(pending_callbacks_.load());
+        pending_callbacks_.store(static_cast<Count>(callbacks_.size()));
+    }
+
+    is_running_ = true;
+    thread_ = std::thread(&ArcherAioThread::Run, this);
+}
+
+/// Asks the worker loop to exit and blocks until the worker thread has been joined.
+/// Does nothing when the worker is not running.
+void ArcherAioThread::Stop()
+{
+    if (is_running_) {
+        is_running_ = false;  // Run() re-checks this flag on every loop iteration
+        thread_.join();
+    }
+}
+
+/// Appends `callback` to the back of the work queue and counts it as pending.
+///
+/// The callable is moved out of `callback`, leaving the caller's object in a moved-from state.
+void ArcherAioThread::Enqueue(AioCallback& callback)
+{
+    std::unique_lock<std::mutex> guard(mutex_);
+    callbacks_.push_back(std::move(callback));
+    ++pending_callbacks_;
+}
+
+/// Blocks until the pending-callback counter drops to zero, then empties the queue.
+///
+/// The counter is polled, sleeping 1000 microseconds between checks.
+/// NOTE(review): the counter is only decremented by Run(), so this looks like it never returns
+/// if callbacks are pending while no worker thread is running -- confirm against callers.
+void ArcherAioThread::Wait()
+{
+    for (;;) {
+        if (pending_callbacks_.load() == 0) { break; }
+        usleep(1000);
+    }
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    callbacks_.clear();
+}
+
+/// Worker-thread entry point: takes callbacks from the front of the queue (FIFO) and invokes
+/// them until `is_running_` is cleared by Stop().
+///
+/// Each callback is invoked after `mutex_` has been released, and `pending_callbacks_` is
+/// decremented only once the callback has returned, which is what Wait() polls for.
+/// Callbacks still queued when the loop exits are left in the queue and are not run here.
+void ArcherAioThread::Run()
+{
+    while (is_running_) {
+        std::function<void()> callback;
+        bool has_work = false;
+        {
+            std::lock_guard<std::mutex> lock(mutex_);
+            if (!callbacks_.empty()) {
+                callback = std::move(callbacks_.front());
+                callbacks_.pop_front();
+                has_work = true;
+            }
+        }
+
+        if (!has_work) {
+            // Nothing queued: give up the time slice instead of re-taking `mutex_` right away,
+            // so an idle worker neither spins hot on the lock nor starves Enqueue() of it.
+            std::this_thread::yield();
+            continue;
+        }
+
+        callback();
+        pending_callbacks_.fetch_sub(1);
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		recursive-include core *.cpp *.h *.cc
		recursive-include op_builder *.py