Running GLM-4 on the data you provided gives very low results; is this due to the large model? #9

Open
AryaFeng opened this issue Dec 10, 2024 · 0 comments


@AryaFeng

1. The code is as follows.

Testing the performance of the generate_then_read code with glm-4-9b:

import json
import logging
import re

from openai import OpenAI
from tqdm import tqdm

log = logging.getLogger(__name__)


class ChatGLM4(object):

    def __init__(self, api_key="EMPTY", base_url="http://x.x.x.x:8000/v1/", model_name="glm-4"):
        self.base_url = base_url
        self.model_name = model_name
        # Initialize the OpenAI client, with error handling
        try:
            self._client = OpenAI(api_key="glm-4", base_url=self.base_url)
        except Exception as e:
            raise ValueError(f"Failed to initialize OpenAI client: {e}")

    def call(self, prompt, chat_history=[]):
        # Combine chat_history with the current prompt into a single request
        messages = [{"role": "system", "content": "you are a helpful assistant!"}]
        # Append the previous conversation turns
        for user_content, assistant_content in chat_history:
            messages.append({"role": "user", "content": user_content})
            messages.append({"role": "assistant", "content": assistant_content})

        # Current user input
        messages.append({"role": "user", "content": prompt})
        # logger_file.info(f"messages: {messages}")
        # Send the request
        try:
            response = self._client.chat.completions.create(
                model="glm-4",  # or another model
                messages=messages,
                # max_tokens=,  # the number of returned tokens can be adjusted
                n=1,            # return a single completion
                stop=None,      # stop sequences can be set here
                temperature=0,  # controls output randomness
                top_p=0.1,
            )
            return response
        except Exception as e:
            log.error(f"Failed to get a response from OpenAI: {e}")
            raise ValueError(f"Failed to get a response from OpenAI: {e}")

    def chat(self, prompt, chat_history=[]):
        response = self.call(prompt, chat_history)
        return response.choices[0].message.content


llm = ChatGLM4()

with open("webq_test_gpt_output.json", 'r') as f:
    datas = json.load(f)

prompt = """
Refer to the passage below and answer the following question with just one entity. \n\n Passage: {background} \n\n Question: {query} \n\n The answer is
"""

results = []
for data in tqdm(datas):
    question = data['question']
    ctxs = data['ctxs']
    answer = data['answers']
    string = ""
    for idx, ctx in enumerate(ctxs):
        string += f"\n {idx}: {ctx['text']}"
    final_prompt = prompt.format(background=string, query=question)
    output = llm.chat(final_prompt)
    results.append({"question": question, "answer": answer, "output": output})

with open('results2.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)  # ensure_ascii=False to support Chinese
The results are as follows:

[screenshot of results]
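Since the screenshot does not render here, a quick way to see what GLM-4 actually returns is to print a few raw records from results2.json before any post-processing. This is only a minimal sanity-check sketch (the file name and fields are taken from the script above); it shows whether the generations are short entities or full sentences, which matters for the exact-match evaluation later.

import json

# Minimal sanity check: inspect a few raw generations before post-processing.
# Assumes results2.json contains the {"question", "answer", "output"} records written above.
with open("results2.json", "r", encoding="utf-8") as f:
    records = json.load(f)

for record in records[:5]:
    print("Q:", record["question"])
    print("Gold:", record["answer"])
    print("Raw output:", repr(record["output"]))  # repr() exposes stray newlines and punctuation
    print("-" * 40)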

Convert the output into a list:

import json

results = []
with open("results2.json", 'r') as f:
    datas = json.load(f)

for data in datas:
    question = data['question']
    answer = data['answer']
    output = [ans.strip() for ans in data['output'][0].split(',')]
    results.append({"question": question, "answer": answer, "output": output})

with open('results_output.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)  # ensure_ascii=False to support Chinese

The results are as follows:

[screenshot of results]
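For context on the evaluation that follows: exact match first normalizes both strings (lowercasing, dropping punctuation and articles) and then requires full equality, so an output that merely contains the gold entity still scores 0. A small self-contained illustration of this behavior, mirroring the normalization steps of the normalize_answer function in the eval code below (using the standard re module here):

import re
import string

def normalize_answer(s):
    # Same steps as the eval script below: lowercase, drop punctuation,
    # drop articles, collapse whitespace.
    punct = set(string.punctuation)
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in punct)
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

print(normalize_answer("Paris.") == normalize_answer("paris"))                # True: EM = 1
print(normalize_answer("The answer is Paris") == normalize_answer("Paris"))  # False: EM = 0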

Finally, run the eval script to evaluate:

import regex
import json
import string
import unicodedata
from typing import List
import numpy as np
from collections import Counter

from rouge import Rouge

class SimpleTokenizer(object):
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )

    def tokenize(self, text, uncased=False):
        matches = [m for m in self._regexp.finditer(text)]
        if uncased:
            tokens = [m.group().lower() for m in matches]
        else:
            tokens = [m.group() for m in matches]
        return tokens

def check_answer(example, tokenizer) -> List[bool]:
    """Search through all the top docs to see if they have any of the answers."""
    answers = example['answers']
    ctxs = example['ctxs']

    hits = []

    for _, doc in enumerate(ctxs):
        text = doc['text']

        if text is None:  # cannot find the document for some reason
            hits.append(False)
            continue

        hits.append(has_answer(answers, text, tokenizer))

    return hits

def has_answer(answers, text, tokenizer=SimpleTokenizer()) -> bool:
    """Check if a document contains an answer string."""
    text = _normalize(text)
    text = tokenizer.tokenize(text, uncased=True)

    for answer in answers:
        answer = _normalize(answer)
        answer = tokenizer.tokenize(answer, uncased=True)
        for i in range(0, len(text) - len(answer) + 1):
            if answer == text[i: i + len(answer)]:
                return True
    return False

def _normalize(text):
    return unicodedata.normalize('NFD', text)

def normalize_answer(s):
    def remove_articles(text):
        return regex.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def ems(prediction, ground_truths):
    return max([exact_match_score(prediction, gt) for gt in ground_truths])

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def f1(prediction, ground_truths):
    return max([f1_score(prediction, gt) for gt in ground_truths])

def rougel_score(prediction, ground_truth):
    rouge = Rouge()
    # no normalization
    try:
        scores = rouge.get_scores(prediction, ground_truth, avg=True)
    except ValueError:  # "Hypothesis is empty."
        return 0.0
    return scores["rouge-l"]["f"]

def rl(prediction, ground_truths):
    return max([rougel_score(prediction, gt) for gt in ground_truths])

## file-level evaluation ...

def eval_recall(infile):

    tokenizer = SimpleTokenizer()
    lines = open(infile, 'r').readlines()[1:]

    has_answer_count = 0
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = ' || '.join(line['output'])

        if has_answer(answer, output, tokenizer):
            has_answer_count += 1

        answer_lengths.append(len(output.split()))

    recall = round(has_answer_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return recall, lens

def eval_question_answering(infile):

    # lines = open(infile, 'r').readlines()[1:]
    with open(infile, 'r') as f:
        lines = json.load(f)

    exact_match_count = 0
    answer_lengths = []
    for line in lines:
        # line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        if ems(output, answer):  # EM evaluation
            exact_match_count += 1

        answer_lengths.append(len(output.split()))

    em = round(exact_match_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return em, lens

def eval_fact_checking(infile):

    tokenizer = SimpleTokenizer()
    lines = open(infile, 'r').readlines()[1:]

    exact_match_count = 0
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        if answer == ["refutes"]:
            answer = ["refutes", "no", "false"]
        if answer == ["supports"]:
            answer = ["supports", "yes", "true"]

        if has_answer(answer, output, tokenizer):
            exact_match_count += 1

        answer_lengths.append(len(output.split()))

    em = round(exact_match_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return em, lens

def eval_dialogue_system(infile):

    lines = open(infile, 'r').readlines()[1:]

    f1_scores = []
    rl_scores = []
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        f1_scores.append(f1(output, answer))
        rl_scores.append(rl(output, answer))
        answer_lengths.append(len(output.split()))

    F1 = round(np.mean(f1_scores), 4)
    RL = round(np.mean(rl_scores), 4)
    lens = round(np.mean(answer_lengths), 4)

    return F1, RL, lens

emscore, length = eval_question_answering("results_output.json")
print(emscore)
print(length)

The final results are only:

0.3179
2.6216

I am not sure whether this is because of the large model.
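A possible way to narrow this down, just as a sketch reusing the has_answer and SimpleTokenizer helpers from the eval code above, would be to check whether the gold answer string appears anywhere in the raw generation. If containment is much higher than EM, the gap would point to answer formatting rather than missing knowledge; this is an assumption on my part, not part of the original pipeline.

import json

# Containment check over the raw generations in results2.json,
# using has_answer and SimpleTokenizer defined in the eval script above.
tokenizer = SimpleTokenizer()

with open("results2.json", "r", encoding="utf-8") as f:
    records = json.load(f)

contained = 0
for record in records:
    # record["output"] is the raw string returned by llm.chat(...)
    if has_answer(record["answer"], record["output"], tokenizer):
        contained += 1

print("answer contained in raw output:", round(contained / len(records), 4))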
webq_test_gpt_output.json comes from the files you provide at https://drive.google.com/drive/folders/1DNjTTOLKi24wohJKu1Z-v6b4izfymlLu. I hope to hear back from you.
