
add readme example & fix peer access
Your Name committed Apr 16, 2024
1 parent 75e468c commit 7f97af6
Showing 3 changed files with 28 additions and 2 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -137,7 +137,7 @@ CUDA_VISIBLE_DEVICES=0,1 python script.py
We provide a simple example of running inference on a Hugging Face LLM. The script downloads the model checkpoint, runs inference on the specified input text, and prints the output to the console.

```bash
-CUDA_VISIBLE_DEVICES=0 python example/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir <your local path on SSD>
+CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir <your local path on SSD>
```

## Release Plan
4 changes: 3 additions & 1 deletion core/prefetch/archer_prefetch_handle.cpp
@@ -35,8 +35,10 @@ ArcherPrefetchHandle::ArcherPrefetchHandle(const std::string& prefix,

for (int i = 0; i < device_count; i++) {
    cudaSetDevice(i);
    for (int j = 0; j < device_count; j++) {
-       if (i != j) { cudaDeviceEnablePeerAccess(j, 0); }
+       int can_access = 0;
+       cudaDeviceCanAccessPeer(&can_access, i, j);
+       if (i != j && can_access == 1) { cudaDeviceEnablePeerAccess(j, 0); }
    }
}

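For reference, here is a minimal standalone sketch of the guarded peer-access setup this hunk introduces: each ordered device pair (i -> j) is probed before peer access is enabled, so unsupported pairs are skipped. The function name `enable_peer_access` and the error handling are illustrative additions, not part of the commit:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Enable CUDA peer access for every ordered device pair (i -> j)
// that the hardware actually supports, skipping unsupported pairs.
void enable_peer_access()
{
    int device_count = 0;
    cudaGetDeviceCount(&device_count);

    for (int i = 0; i < device_count; i++) {
        cudaSetDevice(i);
        for (int j = 0; j < device_count; j++) {
            if (i == j) continue;
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, i, j);
            if (can_access != 1) continue;
            cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
            if (err == cudaErrorPeerAccessAlreadyEnabled) {
                cudaGetLastError();  // clear the benign sticky error
            } else if (err != cudaSuccess) {
                fprintf(stderr, "peer %d -> %d: %s\n", i, j, cudaGetErrorString(err));
            }
        }
    }
}
```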
24 changes: 24 additions & 0 deletions examples/readme_example.py
@@ -0,0 +1,24 @@
import os

from transformers import AutoTokenizer
from moe_infinity import MoE

user_home = os.path.expanduser('~')

checkpoint = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config = {
    "offload_path": os.path.join(user_home, "moe-infinity"),
    # 75% of device memory is used for caching; lower this value if you run out of memory.
    "device_memory_ratio": 0.75,
}

model = MoE(checkpoint, config)

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda:0")

output_ids = model.generate(input_ids)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)
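To try it, run the script directly, e.g. `CUDA_VISIBLE_DEVICES=0 python examples/readme_example.py` (single visible GPU assumed); the checkpoint is downloaded on first use and experts are offloaded under `~/moe-infinity`.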
