diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml
new file mode 100644
index 0000000..95d6ae5
--- /dev/null
+++ b/.github/workflows/publish-test.yml
@@ -0,0 +1,105 @@
+# This workflow builds the Python package and publishes it to Test PyPI
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Publish to Test PyPI
+
+on:
+  push:
+    branches:
+      - main
+
+# Needed to create release and upload assets
+permissions:
+  contents: write
+
+
+jobs:
+  setup-version:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Generate version number
+        run: |
+          VERSION_HASH=$(date +"%Y%m%d%H%M%S")
+          echo "Generated version hash: $VERSION_HASH"
+          echo $VERSION_HASH > version.txt
+
+      - name: Upload version number as artifact
+        uses: actions/upload-artifact@v2
+        with:
+          name: version
+          path: version.txt
+
+  wheel:
+    name: Build Wheel
+    runs-on: ${{ matrix.os }}
+    permissions: write-all
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ['ubuntu-20.04']
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+        cuda-version: ['11.7']
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      # - name: Set up Linux Env
+      #   if: ${{ runner.os == 'Linux' }}
+      #   run: |
+      #     bash -x .github/workflows/scripts/env.sh
+
+      # https://github.com/orgs/community/discussions/26313
+      - name: Download version value artifact
+        uses: actions/download-artifact@v2
+        with:
+          name: version
+          path: artifact
+
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/local/cuda-* /opt/cuda
+          sudo rm -rf /usr/local/cuda
+          bash -x .github/workflows/scripts/free-disk-space.sh
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+      - name: Build wheel
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install build
+          VERSION_HASH=$(cat artifact/version.txt)
+          MOEINF_VERSION=0.0.1dev${VERSION_HASH} BUILD_OPS=1 python -m build --wheel
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          asset_name=${wheel_name//"linux"/"manylinux1"}
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+
+
+      # Only build the source distribution when the Python version is 3.8
+      - name: Build Source
+        if: ${{ matrix.python-version == '3.8' }}
+        run: |
+          VERSION_HASH=$(cat artifact/version.txt)
+          MOEINF_VERSION=0.0.1dev${VERSION_HASH} python -m build --sdist
+
+      - name: Rename wheel
+        run: |
+          mv dist/${{ env.wheel_name }} dist/${{ env.asset_name }}
+
+      # (Danielkinz): This last step publishes the .whl to Test PyPI. Warning: untested
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1.8
+        with:
+          repository-url: https://test.pypi.org/legacy/
+          skip-existing: true
\ No newline at end of file
diff --git a/README.md b/README.md
index 21ffaa9..103b1e7 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,7 @@ CUDA_VISIBLE_DEVICES=0,1 python script.py
 We provide a simple example to run inference on a Huggingface LLM model. The script will download the model checkpoint and run inference on the specified input text. The output will be printed to the console.
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python example/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir
+CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir
 ```
 
 ## Release Plan
diff --git a/core/prefetch/archer_prefetch_handle.cpp b/core/prefetch/archer_prefetch_handle.cpp
index 70fb5d3..a107c50 100644
--- a/core/prefetch/archer_prefetch_handle.cpp
+++ b/core/prefetch/archer_prefetch_handle.cpp
@@ -34,12 +34,18 @@ ArcherPrefetchHandle::ArcherPrefetchHandle(const std::string& prefix,
 
     ARCHER_LOG_INFO("Device count ", device_count);
     for (int i = 0; i < device_count; i++) {
-        cudaSetDevice(i);
         for (int j = 0; j < device_count; j++) {
-            if (i != j) { cudaDeviceEnablePeerAccess(j, 0); }
+            if (i != j) {
+                int can_access = 0;
+                cudaDeviceCanAccessPeer(&can_access, i, j);
+                if (can_access == 1) {
+                    cudaSetDevice(i);
+                    cudaDeviceEnablePeerAccess(j, 0);
+                }
+            }
         }
     }
-
+
     ARCHER_LOG_INFO("Enabled peer access for all devices");
 }
 
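The C++ change above is worth a note: `cudaDeviceEnablePeerAccess` was previously called unconditionally and fails on device pairs without P2P support; the new code first asks `cudaDeviceCanAccessPeer` and only enables peer access where the hardware allows it. As a minimal sketch, assuming a CUDA-enabled PyTorch build, the same capability probe can be run from Python (illustrative only, not part of this PR):

```python
# Illustrative only (not part of this PR): probe P2P capability from Python.
# torch.cuda.can_device_access_peer wraps the same cudaDeviceCanAccessPeer
# check that the C++ hunk above now performs before enabling peer access.
import torch

device_count = torch.cuda.device_count()
for i in range(device_count):
    for j in range(device_count):
        if i != j and torch.cuda.can_device_access_peer(i, j):
            print(f"device {i} can directly access peer {j}")
```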
diff --git a/examples/readme_example.py b/examples/readme_example.py
new file mode 100644
index 0000000..2f1129c
--- /dev/null
+++ b/examples/readme_example.py
@@ -0,0 +1,24 @@
+import torch
+import os
+from transformers import AutoTokenizer, SwitchTransformersForConditionalGeneration
+from moe_infinity import MoE
+
+user_home = os.path.expanduser('~')
+
+checkpoint = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+config = {
+    "offload_path": os.path.join(user_home, "moe-infinity"),
+    "device_memory_ratio": 0.75,  # 75% of device memory is used for caching; lower this value if you hit OOM
+}
+
+model = MoE(checkpoint, config)
+
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda:0")
+
+output_ids = model.generate(input_ids)
+output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+print(output_text)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 35242a5..936da6e 100644
--- a/setup.py
+++ b/setup.py
@@ -76,7 +76,7 @@ def read_readme() -> str:
 # install all files in the package, rather than just the egg
 setup(
     name='moe_infinity',
-    version='0.0.1',
+    version=os.getenv('MOEINF_VERSION', '0.0.1'),
     packages=find_packages(exclude=['op_builder', 'op_builder.*', 'moe_infinity.ops.core.*']),
     package_data={
         'moe_infinity.ops.prefetch': ['**/*.so'],
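Taken together, the workflow and the `setup.py` change implement timestamp-based dev versioning: CI stores a `date +"%Y%m%d%H%M%S"` hash in an artifact, exports `MOEINF_VERSION=0.0.1dev${VERSION_HASH}` at build time, and `setup.py` falls back to `0.0.1` when the variable is unset. One caveat: the `wheel` job downloads the `version` artifact but does not declare `needs: setup-version`, so the two jobs may race; adding that dependency would make the download reliable. A minimal sketch of reproducing the stamping locally, as a hypothetical one-off script (not part of this PR):

```python
# Hypothetical local reproduction of the CI version stamping (not part of this PR).
import os
import subprocess
from datetime import datetime

# Same timestamp format as the workflow's `date +"%Y%m%d%H%M%S"` step.
version_hash = datetime.now().strftime("%Y%m%d%H%M%S")

env = os.environ.copy()
env["MOEINF_VERSION"] = f"0.0.1dev{version_hash}"  # read by os.getenv in setup.py
env["BUILD_OPS"] = "1"  # same flag the "Build wheel" step sets

# Build the wheel the same way the workflow does.
subprocess.run(["python", "-m", "build", "--wheel"], env=env, check=True)
```

Note that build tooling normalizes `0.0.1dev<hash>` to the PEP 440 form `0.0.1.dev<hash>`, so published filenames may differ slightly from the raw string.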