From 7f97af6651e2fbef236b46ecbb288b8a863ab77c Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 16 Apr 2024 10:13:53 +0000
Subject: [PATCH] add readme example & fix peer access

---
 README.md                                |  2 +-
 core/prefetch/archer_prefetch_handle.cpp |  4 +++-
 examples/readme_example.py               | 23 +++++++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 examples/readme_example.py

diff --git a/README.md b/README.md
index 21ffaa9..103b1e7 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,7 @@ CUDA_VISIBLE_DEVICES=0,1 python script.py
 We provide a simple example to run inference on a Huggingface LLM model. The script will download the model checkpoint and run inference on the specified input text. The output will be printed to the console.
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python example/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir 
+CUDA_VISIBLE_DEVICES=0 python examples/interface_example.py --model_name_or_path "mistralai/Mixtral-8x7B-Instruct-v0.1" --offload_dir 
 ```
 
 ## Release Plan

diff --git a/core/prefetch/archer_prefetch_handle.cpp b/core/prefetch/archer_prefetch_handle.cpp
index 70fb5d3..b681c9a 100644
--- a/core/prefetch/archer_prefetch_handle.cpp
+++ b/core/prefetch/archer_prefetch_handle.cpp
@@ -35,6 +35,8 @@ ArcherPrefetchHandle::ArcherPrefetchHandle(const std::string& prefix,
     for (int i = 0; i < device_count; i++) {
         cudaSetDevice(i);
         for (int j = 0; j < device_count; j++) {
-            if (i != j) { cudaDeviceEnablePeerAccess(j, 0); }
+            // Query peer capability for each (i, j) pair before enabling access.
+            int can_access = 0;
+            cudaDeviceCanAccessPeer(&can_access, i, j);
+            if (i != j && can_access == 1) { cudaDeviceEnablePeerAccess(j, 0); }
         }
     }

diff --git a/examples/readme_example.py b/examples/readme_example.py
new file mode 100644
index 0000000..2f1129c
--- /dev/null
+++ b/examples/readme_example.py
@@ -0,0 +1,23 @@
+import os
+from transformers import AutoTokenizer
+from moe_infinity import MoE
+
+user_home = os.path.expanduser('~')
+
+checkpoint = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+config = {
+    "offload_path": os.path.join(user_home, "moe-infinity"),
+    "device_memory_ratio": 0.75,  # use at most 75% of device memory for caching; lower this if you hit OOM
+}
+
+model = MoE(checkpoint, config)
+
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda:0")
+
+output_ids = model.generate(input_ids)
+output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+print(output_text)
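
For reviewers who want to double-check the topology assumption behind the C++ change, here is a minimal sketch of the same per-pair query from Python. It assumes PyTorch is installed; `print_peer_access_matrix` is a hypothetical helper for illustration, not part of this patch.

```python
import torch

def print_peer_access_matrix() -> None:
    # Mirrors the fixed loop in ArcherPrefetchHandle: for every ordered pair
    # of visible GPUs, ask whether device i can directly access device j's
    # memory (the Python analogue of cudaDeviceCanAccessPeer(&can_access, i, j)).
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        for j in range(device_count):
            if i == j:
                continue
            can_access = torch.cuda.can_device_access_peer(i, j)
            print(f"GPU {i} -> GPU {j}: "
                  f"{'peer access' if can_access else 'no peer access'}")

if __name__ == "__main__":
    print_peer_access_matrix()
```

On a single-GPU box the loop prints nothing, which is consistent with the README example above running under `CUDA_VISIBLE_DEVICES=0`.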