# agent-bigcodebench.py
from tqdm import tqdm
from dotenv import load_dotenv
from bigcodebench.data import get_bigcodebench, load_solutions
from bigcodebench.data.utils import CACHE_DIR
from bigcodebench.eval import PASS, untrusted_check
from bigcodebench.gen.util import trusted_check
from langgraph.graph import StateGraph
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from langchain_openai import ChatOpenAI
import os
import traceback

load_dotenv()

# Load BigCodeBench dataset and solutions
subset = "full"  # BigCodeBench subset name: "full" or "hard"
problems = get_bigcodebench(subset=subset)
solutions_path = "./solutions.jsonl"  # Replace with the path to your solutions
solutions = list(load_solutions(solutions_path))  # materialize so len() and re-iteration work
NUM_SAMPLES = 50
NUM_CORRECT = 0
print(f"Loaded {len(solutions)} solutions for testing.")

class State(TypedDict):
    messages: Annotated[list, add_messages]


graph_builder = StateGraph(State)

llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY"),  # read from the environment / .env file
)

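# Node 1: ask the LLM to break the task into sub-problems (free-form analysis).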
def code_understanding(state: State):
    prompt = f"""
<Identity>
You are an expert AI assistant specializing in programmatic reasoning, problem decomposition, reflective reasoning, and solution verification. Your goal is to deliver clear, logically sound, and testable solutions to complex programming challenges with confidence and precision.
</Identity>
<Context>
You are provided with a programming problem or codebase that requires a structured approach for analysis and resolution. The process includes breaking the problem into sub-problems, solving each individually, merging solutions, and verifying correctness through testing.
</Context>
<Task>
{state["messages"]}
</Task>
<Instructions>
Analyze the problem or codebase and divide it into smaller sub-problems or logical components. Clearly describe each sub-problem, its purpose, and how it contributes to the overall solution.
</Instructions>
"""
    return {"messages": [llm.invoke(prompt)]}

def code_sol_reasoning(state: State):
    prompt = f"""
<Identity>
You are an expert AI assistant specializing in programmatic reasoning, problem decomposition, reflective reasoning, and solution verification. Your goal is to deliver clear, logically sound, and testable solutions to complex programming challenges with confidence and precision.
</Identity>
<Context>
You are provided with a programming problem or codebase that requires a structured approach for analysis and resolution. The process includes breaking the problem into sub-problems, solving each individually, merging solutions, and verifying correctness through testing.
</Context>
<Task>
{state["messages"]}
</Task>
<Instructions>
Solve each sub-problem using reflective reasoning. Evaluate the solution’s logic, refine it to address edge cases or ambiguities, and present the final solution for each component. Combine the resolved sub-problems to create N potential solutions. Ensure logical compatibility between the components and explain the merging strategy.
</Instructions>
<OUTPUT_INSTRUCT>
Please generate the final solution for the problem by combining the resolved sub-problems. Please generate the final code only and do not include any additional instructions or explanations or comments, purely the code.
</OUTPUT_INSTRUCT>
"""
    return {"messages": [llm.invoke(prompt)]}

def evaluate_solution(problem, solution_code, cache_dir, max_time_limit=5.0):
    """
    Evaluates a solution using untrusted and trusted checks.

    Args:
        problem: The problem details from the dataset.
        solution_code: The solution code to evaluate.
        cache_dir: Path to the cache directory.
        max_time_limit: Maximum allowed time for execution.

    Returns:
        bool: True if the solution passes all checks, False otherwise.
    """
    try:
        # Untrusted check: run the candidate code against the task's tests
        # under memory, stack, and time limits.
        untrusted_result = untrusted_check(
            solution_code,
            problem["test"],
            problem["entry_point"],
            max_as_limit=30 * 1024,
            max_data_limit=30 * 1024,
            max_stack_limit=10,
            min_time_limit=1.0,
            gt_time_limit=max_time_limit,
        )
        if untrusted_result[0] != PASS:
            print("Untrusted check failed.")
            return False
        # Trusted check: run the candidate together with the task's
        # canonical solution through the trusted checker.
        trusted_result = trusted_check(
            solution_code + "\n" + problem["canonical_solution"],
            problem["test"],
            problem["entry_point"],
            max_as_limit=30 * 1024,
            max_data_limit=30 * 1024,
            max_stack_limit=10,
            min_time_limit=1.0,
            cache_dir=cache_dir,
        )
        if trusted_result[0] != PASS:
            print("Trusted check failed.")
            return False
    except Exception:
        print(f"Error during evaluation: {traceback.format_exc()}")
        return False
    return True

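# Wire the two nodes into a linear pipeline: code_understanding feeds
# code_sol_reasoning, whose output is the final answer.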
graph_builder.add_node("code_understanding", code_understanding)
graph_builder.add_node("code_sol_reasoning", code_sol_reasoning)
graph_builder.set_entry_point("code_understanding")
graph_builder.add_edge("code_understanding", "code_sol_reasoning")
graph_builder.set_finish_point("code_sol_reasoning")
graph = graph_builder.compile()
# Ensure cache directory exists
os.makedirs(CACHE_DIR, exist_ok=True)
# Main loop: generate a solution for each task with the graph, then test it
for i, solution in tqdm(enumerate(solutions), total=NUM_SAMPLES):
    if i >= NUM_SAMPLES:
        break
    print("--------------------")
    task_id = solution["task_id"]
    problem = problems.get(task_id)
    if not problem:
        print(f"Problem with task ID {task_id} not found in the dataset.")
        continue
    state = {
        "messages": [
            problem["complete_prompt"]
            + " based on the instruction: "
            + problem["instruct_prompt"]
        ]
    }
    answer = graph.invoke(state)
    answer_snippet = (
        str(answer["messages"][-1].content).strip().replace("OUTPUT:", "").strip()
    )
    # Strip a Markdown code fence if the model wrapped its answer in one.
    if "```python" in answer_snippet:
        answer_snippet = answer_snippet.split("```python")[1].split("```")[0].strip()
    print("Answer Snippet:", answer_snippet)
    # Evaluate the generated solution against the task's tests
    tests_passed = evaluate_solution(problem, answer_snippet, CACHE_DIR)
    if tests_passed:
        NUM_CORRECT += 1
    print(f"Correct: {tests_passed}")
    print("--------------------")

print(f"Correct: {NUM_CORRECT}/{NUM_SAMPLES}")
print(f"Accuracy: {NUM_CORRECT / NUM_SAMPLES * 100:.2f}%")