From 7f97af6651e2fbef236b46ecbb288b8a863ab77c Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 16 Apr 2024 10:13:53 +0000
Subject: [PATCH] add readme example & fix peer access

---
 README.md                                |  2 +-
 core/prefetch/archer_prefetch_handle.cpp |  4 +++-
 examples/readme_example.py               | 23 +++++++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 examples/readme_example.py

diff --git a/README.md b/README.md
index 21ffaa9..103b1e7 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,7 @@ CUDA_VISIBLE_DEVICES=0,1 python script.py
 We provide a simple example to run inference on a Huggingface LLM model. The script will download the model checkpoint and run inference on the specified input text. The output will be printed to the console.
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python example/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir 
+CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir 
 ```
 
 ## Release Plan

diff --git a/core/prefetch/archer_prefetch_handle.cpp b/core/prefetch/archer_prefetch_handle.cpp
index 70fb5d3..b681c9a 100644
--- a/core/prefetch/archer_prefetch_handle.cpp
+++ b/core/prefetch/archer_prefetch_handle.cpp
@@ -35,6 +35,8 @@ ArcherPrefetchHandle::ArcherPrefetchHandle(const std::string& prefix,
     for (int i = 0; i < device_count; i++) {
         cudaSetDevice(i);
         for (int j = 0; j < device_count; j++) {
-            if (i != j) { cudaDeviceEnablePeerAccess(j, 0); }
+            // Query peer capability for each (i, j) pair before enabling access.
+            int can_access = 0;
+            cudaDeviceCanAccessPeer(&can_access, i, j);
+            if (i != j && can_access == 1) { cudaDeviceEnablePeerAccess(j, 0); }
         }
     }

diff --git a/examples/readme_example.py b/examples/readme_example.py
new file mode 100644
index 0000000..2f1129c
--- /dev/null
+++ b/examples/readme_example.py
@@ -0,0 +1,23 @@
+import os
+from transformers import AutoTokenizer
+from moe_infinity import MoE
+
+user_home = os.path.expanduser('~')
+
+checkpoint = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+config = {
+    "offload_path": os.path.join(user_home, "moe-infinity"),
+    "device_memory_ratio": 0.75,  # use at most 75% of device memory for caching; lower this if you hit OOM
+}
+
+model = MoE(checkpoint, config)
+
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda:0")
+
+output_ids = model.generate(input_ids)
+output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+print(output_text)
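
For reviewers who want to double-check the topology assumption behind the C++ change, here is a minimal sketch of the same per-pair query from Python. It assumes PyTorch is installed; `print_peer_access_matrix` is a hypothetical helper for illustration, not part of this patch.

```python
import torch

def print_peer_access_matrix() -> None:
    # Mirrors the fixed loop in ArcherPrefetchHandle: for every ordered pair
    # of visible GPUs, ask whether device i can directly access device j's
    # memory (the Python analogue of cudaDeviceCanAccessPeer(&can_access, i, j)).
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        for j in range(device_count):
            if i == j:
                continue
            can_access = torch.cuda.can_device_access_peer(i, j)
            print(f"GPU {i} -> GPU {j}: "
                  f"{'peer access' if can_access else 'no peer access'}")

if __name__ == "__main__":
    print_peer_access_matrix()
```

On a single-GPU box the loop prints nothing, which is consistent with the README example above running under `CUDA_VISIBLE_DEVICES=0`.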