run_chat.py
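
"""Minimal streaming chat REPL for a local quantized GGUF model, using llama-cpp-python."""
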
import argparse
import os
import sys

from llama_cpp import Llama
from termcolor import cprint
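
# Default model: a quantized CapybaraHermes-2.5 Mistral-7B GGUF file in the current working directory.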
cwd = os.getcwd()
model_path = os.path.join(cwd, "capybarahermes-2.5-mistral-7b.Q4_K_M.gguf")


def print_bot(text):
    """Print model output in green without a trailing newline."""
    cprint(text, "green", end="")


def main(
    model_path,
    verbose=False,
    n_threads=16,
    seed=123,
    n_gpu_layers=None,
):
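    # Collect llama.cpp constructor arguments; n_ctx sets the context window size.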
    params = {
        "model_path": model_path,
        "n_ctx": 32768,
        "seed": seed,
        "n_threads": n_threads,
        "verbose": verbose,
    }
    if n_gpu_layers:
        # Only offload layers to the GPU when explicitly requested.
        params["n_gpu_layers"] = n_gpu_layers
    llm = Llama(**params)
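
    # Minimal REPL: "clear" resets the conversation, "bye" exits.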
    os.system("clear")  # clear the terminal (assumes a POSIX shell)
    messages = ""
    while True:
        prompt = input("\n>>> ")
        if prompt == "clear":
            print("=== Clear! ===")
            messages = ""
            continue
        if prompt == "bye":
            break
        # Append the user turn to the running transcript in User/Assistant format.
        messages += f"User: {prompt}\nAssistant: "
        stream = llm(
            messages,
            max_tokens=512,
            stream=True,
            stop=["User:"],  # keep the model from generating the next user turn itself
        )
        for output in stream:
            out_text = output["choices"][0]["text"]
            messages += out_text
            print_bot(out_text)
            sys.stdout.flush()
        messages += "\n"  # newline (not a space) so each turn starts on its own line
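

# Entry point: parse command-line arguments and start the chat loop.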
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path", default=model_path, help="Full path to the quantized model GGUF file."
    )
    parser.add_argument(
        "--n_threads", type=int, default=16, help="Number of threads to run the LLM."
    )
    parser.add_argument(
        "--n_gpu_layers", type=int, default=None, help="Number of GPU layers to offload to the GPU if available."
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Whether to have verbose outputs."
    )
    args = parser.parse_args()
    main(
        model_path=args.model_path,
        n_threads=args.n_threads,
        verbose=args.verbose,
        n_gpu_layers=args.n_gpu_layers,
    )