diff --git a/WokeyTalky_Research_Code/.gitignore b/WokeyTalky_Research_Code/.gitignore
new file mode 100644
index 0000000..c585e19
--- /dev/null
+++ b/WokeyTalky_Research_Code/.gitignore
@@ -0,0 +1 @@
+out
\ No newline at end of file
diff --git a/WokeyTalky_Research_Code/scripts/stage_1_data_batch_request.py b/WokeyTalky_Research_Code/scripts/stage_1_data_batch_request.py
index ac25896..da05897 100755
--- a/WokeyTalky_Research_Code/scripts/stage_1_data_batch_request.py
+++ b/WokeyTalky_Research_Code/scripts/stage_1_data_batch_request.py
@@ -1,11 +1,17 @@
 import json
+import asyncio
+import aiohttp
 from dotenv import load_dotenv
 import os
 load_dotenv()
 from utils.eval_util import load_prompt_format
 import argparse
 from utils.progress_util import progress_bar
-from utils.gpt_batch_request import save_requests_to_file,upload_file_to_openai,create_batch,retrieve_batch,get_file_content,parse_multiple_json_objects,save_as_jsonl,confirm_action
+from utils.gpt_batch_request import save_requests_to_file, upload_file_to_openai, create_batch, retrieve_batch, get_file_content, parse_multiple_json_objects, save_as_jsonl, confirm_action
+from openai import OpenAI
+
+def split_requests_into_chunks(requests, chunk_size):
+    return [requests[i:i + chunk_size] for i in range(0, len(requests), chunk_size)]
 
 def process_json_files(input_dir, judge_prompt_filename):
     requests = []
@@ -54,8 +60,33 @@ def process_json_files(input_dir, judge_prompt_filename):
 
     return requests
 
+async def send_batch_request(client, chunk, output_dir, i, api_safety_on):
+    save_requests_to_file(chunk, f'{output_dir}/batch_request_chunk_{i}.jsonl')
+
+    if api_safety_on and not confirm_action("Do you want to proceed with sending the request to OpenAI?"):
+        print("Request canceled.")
+        return None
+
+    _, response = upload_file_to_openai(f'{output_dir}/batch_request_chunk_{i}.jsonl', client.api_key)
+    batch = create_batch(client, response)
+    while True:
+        try:
+            retrieved_batch = retrieve_batch(client, batch.id)
+            if retrieved_batch.status == 'completed':
+                break
+            else:
+                raise Exception(f"Batch processing failed with status: {retrieved_batch.status}")
+        except Exception as e:
+            print(f"Error retrieving batch: {e}")
+            print(f"Batch {i}: Retrying in 30 seconds...\n")
+            await asyncio.sleep(30)
+    text_content = get_file_content(client, retrieved_batch)
+    if text_content:
+        parsed_objects = parse_multiple_json_objects(text_content)
+        return parsed_objects
+    return None
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -68,13 +99,16 @@ def process_json_files(input_dir, judge_prompt_filename):
                         help='Path and name of the output file to save the requests.')
     parser.add_argument('--batch_name', type=str, default='batch_output',
                         help='Name of the batch output file.')
+    parser.add_argument('--api_safety_on', type=bool, default=True,
+                        help='Name of the batch output file.')
 
     args = parser.parse_args()
     input_dir = args.input_dir
     judge_prompt_filename = args.judge_prompt_filename
     output_dir = args.output_dir
     batch_name = args.batch_name
-
+    api_safety_on = args.api_safety_on
+
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
         raise ValueError(
@@ -106,7 +140,9 @@ def process_json_files(input_dir, judge_prompt_filename):
             else:
                 raise Exception(
                     f"Batch processing failed with status: {retrieved_batch.status}")
+
         except Exception as e:
+            print(retrieved_batch)
             print(f"Error retrieving batch: {e}")
             print("Retrying in 30 seconds...\n")
             progress_bar(30)
@@ -121,33 +157,22 @@
print("Invalid input. Please enter a valid number.") requests = process_json_files(input_dir, judge_prompt_filename) - save_requests_to_file(requests, f'{output_dir}/batch_request.jsonl') - - if not confirm_action("Do you want to proceed with sending the request to OpenAI?"): - print("Request canceled.") - exit(0) + request_chunks = split_requests_into_chunks(requests, 50000) - client, response = upload_file_to_openai( - f'{output_dir}/batch_request.jsonl', api_key) - batch = create_batch(client, response) + async def main(): + client = OpenAI(api_key=api_key) + tasks = [] + for i, chunk in enumerate(request_chunks, start=1): + print(f"Processing chunk {i} of {len(request_chunks)}...") + task = asyncio.ensure_future(send_batch_request(client, chunk, output_dir, i, api_safety_on)) + tasks.append(task) - with open(batch_ids_file, "a") as f: - f.write(json.dumps({"batch_id": batch.id}) + "\n") + all_parsed_objects = [] + for task in asyncio.as_completed(tasks): + parsed_objects = await task + if parsed_objects: + all_parsed_objects.extend(parsed_objects) - while True: - try: - retrieved_batch = retrieve_batch(client, batch.id) - if retrieved_batch.status == 'completed': - break - else: - raise Exception( - f"Batch processing failed with status: {retrieved_batch.status}") - except Exception as e: - print(f"Error retrieving batch: {e}") - print("Retrying in 30 seconds...\n") - progress_bar(30) + save_as_jsonl(all_parsed_objects, f"{output_dir}/batch_raw_outputs.jsonl") - text_content = get_file_content(client, retrieved_batch) - if text_content: - parsed_objects = parse_multiple_json_objects(text_content) - save_as_jsonl(parsed_objects, f"{output_dir}/batch_raw_outputs.jsonl") + asyncio.run(main()) \ No newline at end of file diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/__init__.cpython-311.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/__init__.cpython-311.pyc index 7d66dcf..e65a5d4 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/__init__.cpython-311.pyc and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-311.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-311.pyc index 55fa4bd..a33a8a1 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-311.pyc and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-311.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-39.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-39.pyc index 4834472..77e5fe7 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-39.pyc and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/eval_util.cpython-39.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-311.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-311.pyc new file mode 100644 index 0000000..ec1b74e Binary files /dev/null and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-311.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-39.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-39.pyc index 1f2b016..29070c4 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-39.pyc and 
b/WokeyTalky_Research_Code/scripts/utils/__pycache__/gpt_batch_request.cpython-39.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-311.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-311.pyc new file mode 100644 index 0000000..bcf117c Binary files /dev/null and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-311.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-39.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-39.pyc index b91250b..286c89f 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-39.pyc and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/progress_util.cpython-39.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-311.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-311.pyc index baed167..5999acc 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-311.pyc and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-311.pyc differ diff --git a/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-39.pyc b/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-39.pyc index f23ae46..403f9e6 100644 Binary files a/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-39.pyc and b/WokeyTalky_Research_Code/scripts/utils/__pycache__/util.cpython-39.pyc differ diff --git a/WokeyTalky_Research_Code/setup.sh b/WokeyTalky_Research_Code/setup.sh old mode 100644 new mode 100755 index 1257827..b0f6db8 --- a/WokeyTalky_Research_Code/setup.sh +++ b/WokeyTalky_Research_Code/setup.sh @@ -2,18 +2,18 @@ # Set your default values here DATASET_FILE="$(pwd)/data/HEx-PHI/category_1.csv" -PROJECT_NAME="Demo" +PROJECT_NAME="GuardrailText" MODELS_LIST_DIR="$(pwd)/configs/open_models.txt" JUDGE_TEMPLATES_FILE="$(pwd)/configs/judge_prompt.jsonl" WOKE_TEMPLATES_FILE="$(pwd)/configs/woke_templates.json" SCRIPTS_DIR="$(pwd)/scripts" STAGE_6_FILE="" -INSERTION_POINT=0 +INSERTION_POINT=1 CUDA_VISIBLE_DEVICES="0,1,2,3" RAY_TMPDIR="$HOME/tmp_ray" # Run the main pipeline script with arguments -./main_pipeline.sh \ +./pipeline.sh \ --dataset_file "${1:-$DATASET_FILE}" \ --project_name "${2:-$PROJECT_NAME}" \ --models_list_dir "${3:-$MODELS_LIST_DIR}" \