Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
[Transformers] Support loading models from HF Hub when using Neural Speed (#1449)
Browse files Browse the repository at this point in the history

Co-authored-by: Wenxin Zhang <[email protected]>
Co-authored-by: changwangss <[email protected]>
  • Loading branch information
3 people authored Apr 3, 2024
1 parent 02a6984 commit 346211c
Show file tree
Hide file tree
Showing 12 changed files with 66 additions and 34 deletions.
16 changes: 8 additions & 8 deletions examples/.config/pytorch_optimize.json
Original file line number Diff line number Diff line change
Expand Up @@ -1580,7 +1580,8 @@
"params": {
"topology": "mistral_7b_autoround",
"task": "generation",
"output_model": "saved_results"
"output_model": "saved_results",
"weight_dtype": "int4_clip"
}
},
"benchmark": {
Expand All @@ -1590,11 +1591,10 @@
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"batch_size": "10",
"iters": "100",
"int8": "false",
"config": "saved_results",
"weight_dtype": "int4_clip"
"config": "saved_results"
}
}
},
Expand All @@ -1616,7 +1616,7 @@
"task": "generation",
"mode": "benchmark",
"backend": "neuralspeed",
"batch_size": "112",
"batch_size": "10",
"iters": "100",
"int8": "false",
"config": "saved_results"
Expand All @@ -1642,7 +1642,7 @@
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"batch_size": "10",
"iters": "100",
"int8": "false",
"config": "saved_results"
Expand Down Expand Up @@ -1732,7 +1732,7 @@
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"batch_size": "10",
"iters": "100",
"int8": "false",
"config": "saved_results",
Expand All @@ -1750,7 +1750,7 @@
"task": "generation",
"mode": "benchmark",
"backend": "neuralspeed",
"batch_size": "112",
"batch_size": "10",
"iters": "100",
"int8": "false",
"config": "saved_results",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ tiktoken
py-cpuinfo
cmake
gguf
neural-speed==1.0a0
neural-speed
2 changes: 1 addition & 1 deletion examples/huggingface/neural_speed/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
intel_extension_for_transformers
neural-speed==1.0a0
neural-speed
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
sentencepiece
gguf
Expand Down
4 changes: 2 additions & 2 deletions examples/huggingface/neural_speed/run_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
parser = argparse.ArgumentParser(description="Evaluate diff for a model")
parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf", help="path to model")
parser.add_argument('--tasks', type=str, default="lambada_openai")
parser.add_argument('--model_format', type=str, default="runtime")
parser.add_argument('--model_format', type=str, default="neural_speed")
parser.add_argument('--use_gptq', action='store_true')
parser.add_argument('--batch_size', type=int, default=1)
args = parser.parse_args()
print(args)
model_args=f'pretrained="{args.model_name}",dtype=float32,trust_remote_code=True'
if args.use_gptq:
model_args += ",use_gptq=True"
if args.model_format == "runtime":
if args.model_format == "neural_speed":
results = evaluate(
model="hf-causal",
model_args=model_args,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ bitsandbytes #baichuan
transformers_stream_generator
tiktoken #qwen
einops #qwen
neural-speed
git+https://github.com/intel/neural-speed[email protected]
auto-round
git+https://github.com/intel/neural-compressor.git
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
huggingface_hub
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ function run_benchmark {
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
elif [ "${topology}" = "mistral_7b_rtn" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
elif [ "${topology}" = "mistral_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
fi

if [[ ${int8} == "true" ]]; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,10 @@
args.model = args.peft_model_id if args.peft_model_id is not None else args.model

# Generation
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
if args.use_neural_speed:
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
else:
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)

# mp/sq/woq/bitsandbytes config setting
quantization_config = None
Expand Down Expand Up @@ -478,10 +481,9 @@

if args.benchmark:
user_model = (
user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model
user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) else user_model
)
prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."

input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
print("---- Prompt size:", input_size)

Expand Down Expand Up @@ -521,7 +523,7 @@
toc = time.time()
# Please check whether gen_ids includes input_ids.
input_tokens_num = input_ids.numel()
output_tokens_num = gen_ids.numel() - input_tokens_num
output_tokens_num = torch.tensor(gen_ids).numel() - input_tokens_num
print(gen_text, flush=True)
if i >= num_warmup:
total_time += toc - tic
Expand All @@ -534,18 +536,30 @@
print("Throughput: {} samples/sec".format(throughput))

if args.accuracy:
user_model = (user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model)
user_model = (user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) \
else user_model)
args.model = (peft_config.base_model_name_or_path if args.peft_model_id else args.model)
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate

pretrained = ',pretrained=' + args.model
args._commit_hash = "main" if args._commit_hash is None else args._commit_hash
eval_args = "tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + \
args._commit_hash + ",trust_remote_code=" + str(args.trust_remote_code)
if args.use_neural_speed:
eval_args += pretrained
q_conf = user_model.config.quantization_config
if isinstance(q_conf, dict):
q_algo = q_conf.get("quant_method", None)
else:
q_algo = q_conf.quant_method.value
if q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"]:
eval_args += ",use_gptq=True"
results = evaluate(
model="hf-causal",
model_args="pretrained=" + args.model + ",tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + args._commit_hash +
",trust_remote_code=" + str(args.trust_remote_code),
model_args=eval_args,
user_model=user_model,
batch_size=args.batch_size,
tasks=args.tasks,
model_format="neural_speed" if args.use_neural_speed else "torch",
)
dumped = json.dumps(results, indent=2)
if args.save_accuracy_path:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@

results = evaluate(
model="hf-causal",
model_args='pretrained=' + args.model + ',tokenizer=' + args.model + \
model_args='tokenizer=' + args.model + \
',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
user_model=user_model,
batch_size=args.batch_size,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ function run_tuning {
extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
elif [ "${topology}" = "mistral_7b_rtn" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --woq --bits 4 -compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "Rtn" --desc_act --blocksize 128 --max_input_length 2048 "
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def evaluate(model,
}
if user_model:
kwargs["init_empty_weights"] = True
if "pretrained" not in model_args:
model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args

if device == "hpu":
# if hpu, set user_model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def add_special_tokens(self) -> bool:
"""
if self._add_special_tokens is not None:
return self._add_special_tokens
elif self.model_format == "runtime":
elif self.model_format == "neural_speed":
return True
elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
return False
Expand Down Expand Up @@ -614,7 +614,7 @@ class AutoCausalLM(HuggingFaceAutoLM):

def __init__(self, *args, pretrained, model_format, **kwargs):
self.model_format = model_format
if self.model_format == "runtime":
if self.model_format == "neural_speed":
from intel_extension_for_transformers.transformers import RtnConfig, AwqConfig, GPTQConfig, AutoRoundConfig
use_gptq = kwargs.pop("use_gptq", False)
if use_gptq:
Expand All @@ -623,11 +623,11 @@ def __init__(self, *args, pretrained, model_format, **kwargs):
self.woq_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4")
super().__init__(*args, pretrained=pretrained, model_format=model_format, **kwargs)

if self.model_format == "runtime":
if self.model_format == "neural_speed":
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
self.runtime_model = AutoModelForCausalLM.from_pretrained(pretrained, quantization_config=self.woq_config,
trust_remote_code=kwargs.get("trust_remote_code", False))
use_neural_speed=True, trust_remote_code=kwargs.get("trust_remote_code", False))

if self.model_format == "onnx":
if not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) and \
Expand Down Expand Up @@ -758,7 +758,7 @@ def _model_call(
input_bs, input_len = inputs.shape
bos = torch.tensor([64790, 64792]).repeat(input_bs, 1)
inputs = torch.cat((bos, inputs), 1)
if self.model_format == "runtime":
if self.model_format == "neural_speed":
out = self.runtime_model(inputs, reinit=True, logits_all=True, ignore_padding=True)
output = {"logits": torch.from_numpy(out)}
elif self.model_format != "onnx":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,12 +398,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
else:
use_neural_speed = False

if hasattr(config, "quantization_config") and not use_neural_speed:
if hasattr(config, "quantization_config"):
if config.quantization_config is None:
logger.warning(
"Quantization_config loading failed. If you want to load saved "
"low bit model, please check your quantizate_config.json."
)
elif use_neural_speed:
if not os.path.exists(pretrained_model_name_or_path):
from huggingface_hub import snapshot_download
pretrained_model_name_or_path = snapshot_download(repo_id=pretrained_model_name_or_path,
allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"],
)
if quantization_config is None:
ConfigInit = {"rtn": RtnConfig,
"awq": AwqConfig,
"teq": TeqConfig,
"gptq": GPTQConfig,
"autoround": AutoRoundConfig,
}
quantization_config = config.quantization_config
assert quantization_config.get("quant_method", None) in ConfigInit, \
"Detect this model is not a low-bit model."
quantization_config = ConfigInit[quantization_config["quant_method"]].from_dict(quantization_config)
else:
logger.info(
"quantization_config: {}".format(config.quantization_config)
Expand Down Expand Up @@ -556,11 +573,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
scale_dtype=quantization_config.scale_dtype,
compute_dtype=quantization_config.compute_dtype,
use_ggml=quantization_config.use_ggml,
use_quant=(
quantization_config.use_quant
if hasattr(quantization_config, "use_quant")
else False
),
use_quant=True,
use_gptq=quantization_config.quant_method.value == "gptq"
or quantization_config.quant_method.value == "autoround",
use_awq=quantization_config.quant_method.value == "awq",
Expand Down

0 comments on commit 346211c

Please sign in to comment.