Running GLM-4 on the data you provided gives very low results; is this due to the large model? #9

Open
AryaFeng opened this issue Dec 10, 2024 · 0 comments


@AryaFeng

1. The code is as follows.

Testing the performance of the generate_then_read code with glm-4-9b:

import json
import logging
import re

from openai import OpenAI
from tqdm import tqdm

log = logging.getLogger(__name__)


class ChatGLM4(object):

    def __init__(self, api_key="EMPTY", base_url="http://x.x.x.x:8000/v1/", model_name="glm-4"):
        self.base_url = base_url
        self.model_name = model_name
        # Initialize the OpenAI client, with error handling
        try:
            self._client = OpenAI(api_key="glm-4", base_url=self.base_url)
        except Exception as e:
            raise ValueError(f"Failed to initialize OpenAI client: {e}")

    def call(self, prompt, chat_history=[]):
        # Combine chat_history with the current prompt into a single request
        messages = [{"role": "system", "content": "you are a helpful assistant!"}]
        # Append the previous conversation turns
        for user_content, assistant_content in chat_history:
            messages.append({"role": "user", "content": user_content})
            messages.append({"role": "assistant", "content": assistant_content})

        # Current user input
        messages.append({"role": "user", "content": prompt})
        # logger_file.info(f"messages: {messages}")
        # Send the request
        try:
            response = self._client.chat.completions.create(
                model="glm-4",  # or another model
                messages=messages,
                # max_tokens=,  # the number of returned tokens can be adjusted
                n=1,            # return a single completion
                stop=None,      # stop sequences can be set here
                temperature=0,  # controls output randomness
                top_p=0.1,
            )
            return response
        except Exception as e:
            log.error(f"Failed to get a response from OpenAI: {e}")
            raise ValueError(f"Failed to get a response from OpenAI: {e}")

    def chat(self, prompt, chat_history=[]):
        response = self.call(prompt, chat_history)
        return response.choices[0].message.content


llm = ChatGLM4()

with open("webq_test_gpt_output.json", 'r') as f:
    datas = json.load(f)

prompt = """
Refer to the passage below and answer the following question with just one entity. \n\n Passage: {background} \n\n Question: {query} \n\n The answer is
"""

results = []
for data in tqdm(datas):
    question = data['question']
    ctxs = data['ctxs']
    answer = data['answers']
    string = ""
    for idx, ctx in enumerate(ctxs):
        string += f"\n {idx}: {ctx['text']}"
    final_prompt = prompt.format(background=string, query=question)
    output = llm.chat(final_prompt)
    results.append({"question": question, "answer": answer, "output": output})

with open('results2.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)  # ensure_ascii=False to support Chinese
The results are as follows:

[screenshot of results]
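Since the screenshot does not render here, a quick way to see what GLM-4 actually returns is to print a few raw records from results2.json before any post-processing. This is only a minimal sanity-check sketch (the file name and fields are taken from the script above); it shows whether the generations are short entities or full sentences, which matters for the exact-match evaluation later.

import json

# Minimal sanity check: inspect a few raw generations before post-processing.
# Assumes results2.json contains the {"question", "answer", "output"} records written above.
with open("results2.json", "r", encoding="utf-8") as f:
    records = json.load(f)

for record in records[:5]:
    print("Q:", record["question"])
    print("Gold:", record["answer"])
    print("Raw output:", repr(record["output"]))  # repr() exposes stray newlines and punctuation
    print("-" * 40)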

Convert the output into a list:

import json

results = []
with open("results2.json", 'r') as f:
    datas = json.load(f)

for data in datas:
    question = data['question']
    answer = data['answer']
    output = [ans.strip() for ans in data['output'][0].split(',')]
    results.append({"question": question, "answer": answer, "output": output})

with open('results_output.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)  # ensure_ascii=False to support Chinese

The results are as follows:

[screenshot of results]
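For context on the evaluation that follows: exact match first normalizes both strings (lowercasing, dropping punctuation and articles) and then requires full equality, so an output that merely contains the gold entity still scores 0. A small self-contained illustration of this behavior, mirroring the normalization steps of the normalize_answer function in the eval code below (using the standard re module here):

import re
import string

def normalize_answer(s):
    # Same steps as the eval script below: lowercase, drop punctuation,
    # drop articles, collapse whitespace.
    punct = set(string.punctuation)
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in punct)
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

print(normalize_answer("Paris.") == normalize_answer("paris"))                # True: EM = 1
print(normalize_answer("The answer is Paris") == normalize_answer("Paris"))  # False: EM = 0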

Finally, run the eval script to evaluate:

import regex
import json
import string
import unicodedata
from typing import List
import numpy as np
from collections import Counter

from rouge import Rouge

class SimpleTokenizer(object):
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )

    def tokenize(self, text, uncased=False):
        matches = [m for m in self._regexp.finditer(text)]
        if uncased:
            tokens = [m.group().lower() for m in matches]
        else:
            tokens = [m.group() for m in matches]
        return tokens

def check_answer(example, tokenizer) -> List[bool]:
    """Search through all the top docs to see if they have any of the answers."""
    answers = example['answers']
    ctxs = example['ctxs']

    hits = []

    for _, doc in enumerate(ctxs):
        text = doc['text']

        if text is None:  # cannot find the document for some reason
            hits.append(False)
            continue

        hits.append(has_answer(answers, text, tokenizer))

    return hits

def has_answer(answers, text, tokenizer=SimpleTokenizer()) -> bool:
    """Check if a document contains an answer string."""
    text = _normalize(text)
    text = tokenizer.tokenize(text, uncased=True)

    for answer in answers:
        answer = _normalize(answer)
        answer = tokenizer.tokenize(answer, uncased=True)
        for i in range(0, len(text) - len(answer) + 1):
            if answer == text[i: i + len(answer)]:
                return True
    return False

def _normalize(text):
    return unicodedata.normalize('NFD', text)

def normalize_answer(s):
    def remove_articles(text):
        return regex.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def ems(prediction, ground_truths):
    return max([exact_match_score(prediction, gt) for gt in ground_truths])

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def f1(prediction, ground_truths):
    return max([f1_score(prediction, gt) for gt in ground_truths])

def rougel_score(prediction, ground_truth):
    rouge = Rouge()
    # no normalization
    try:
        scores = rouge.get_scores(prediction, ground_truth, avg=True)
    except ValueError:  # "Hypothesis is empty."
        return 0.0
    return scores["rouge-l"]["f"]

def rl(prediction, ground_truths):
    return max([rougel_score(prediction, gt) for gt in ground_truths])

## file-level evaluation ...

def eval_recall(infile):

    tokenizer = SimpleTokenizer()
    lines = open(infile, 'r').readlines()[1:]

    has_answer_count = 0
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = ' || '.join(line['output'])

        if has_answer(answer, output, tokenizer):
            has_answer_count += 1

        answer_lengths.append(len(output.split()))

    recall = round(has_answer_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return recall, lens

def eval_question_answering(infile):

    # lines = open(infile, 'r').readlines()[1:]
    with open(infile, 'r') as f:
        lines = json.load(f)

    exact_match_count = 0
    answer_lengths = []
    for line in lines:
        # line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        if ems(output, answer):  # EM evaluation
            exact_match_count += 1

        answer_lengths.append(len(output.split()))

    em = round(exact_match_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return em, lens

def eval_fact_checking(infile):

    tokenizer = SimpleTokenizer()
    lines = open(infile, 'r').readlines()[1:]

    exact_match_count = 0
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        if answer == ["refutes"]:
            answer = ["refutes", "no", "false"]
        if answer == ["supports"]:
            answer = ["supports", "yes", "true"]

        if has_answer(answer, output, tokenizer):
            exact_match_count += 1

        answer_lengths.append(len(output.split()))

    em = round(exact_match_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return em, lens

def eval_dialogue_system(infile):

    lines = open(infile, 'r').readlines()[1:]

    f1_scores = []
    rl_scores = []
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        f1_scores.append(f1(output, answer))
        rl_scores.append(rl(output, answer))
        answer_lengths.append(len(output.split()))

    F1 = round(np.mean(f1_scores), 4)
    RL = round(np.mean(rl_scores), 4)
    lens = round(np.mean(answer_lengths), 4)

    return F1, RL, lens

emscore, length = eval_question_answering("results_output.json")
print(emscore)
print(length)

The final results are only:

0.3179
2.6216

I am not sure whether this is because of the large model.
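A possible way to narrow this down, just as a sketch reusing the has_answer and SimpleTokenizer helpers from the eval code above, would be to check whether the gold answer string appears anywhere in the raw generation. If containment is much higher than EM, the gap would point to answer formatting rather than missing knowledge; this is an assumption on my part, not part of the original pipeline.

import json

# Containment check over the raw generations in results2.json,
# using has_answer and SimpleTokenizer defined in the eval script above.
tokenizer = SimpleTokenizer()

with open("results2.json", "r", encoding="utf-8") as f:
    records = json.load(f)

contained = 0
for record in records:
    # record["output"] is the raw string returned by llm.chat(...)
    if has_answer(record["answer"], record["output"], tokenizer):
        contained += 1

print("answer contained in raw output:", round(contained / len(records), 4))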
webq_test_gpt_output.json comes from the files you provide at https://drive.google.com/drive/folders/1DNjTTOLKi24wohJKu1Z-v6b4izfymlLu. I hope to hear back from you.
