From 6c20c3de9b06b4a207035c4cdee8adca6e4b6063 Mon Sep 17 00:00:00 2001 From: capjamesg Date: Sat, 18 Jan 2025 01:28:10 +0000 Subject: [PATCH] Update results --- index.html | 155 ++++++++++++++++++---------------------- results/2025-01-18.json | 106 +++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 85 deletions(-) create mode 100644 results/2025-01-18.json diff --git a/index.html b/index.html index 3aeff51..e5414e8 100644 --- a/index.html +++ b/index.html @@ -40,7 +40,7 @@

How's GPT-4o Doing?

You can contribute your own tests, too! See the GitHub README for contributing instructions.

-

Tests are run every day at 1am PT. Last updated January 17, 2025.

+

Tests are run every day at 1am PT. Last updated January 18, 2025.

Made with ❤️ by the team at Roboflow.

@@ -122,7 +122,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
9
+
8

Test submitted by Roboflow

@@ -270,7 +270,7 @@

Graph Understanding

Of the last 7 tests, conducted daily, this test has passed 0% of the time.

-

Today's request cost $0.012

+

Today's request cost $0.011

@@ -286,22 +286,10 @@

Image

Result

```json
 {
-  "A": {
-    "quantity": 20,
-    "price": 10
-  },
-  "B": {
-    "quantity": 25,
-    "price": 20
-  },
-  "C": {
-    "quantity": 30,
-    "price": 30
-  },
-  "D": {
-    "quantity": 35,
-    "price": 40
-  }
+  "A": { "quantity": 20, "price": 10 },
+  "B": { "quantity": 25, "price": 20 },
+  "C": { "quantity": 30, "price": 30 },
+  "D": { "quantity": 35, "price": 40 }
 }
 ```

Test submitted by Roboflow

@@ -361,7 +349,7 @@

Result

{ "R": 80, "G": 0, - "B": 120 + "B": 153 } ```

Test submitted by Roboflow

@@ -403,7 +391,7 @@

Annotation Quality Assurance

Of the last 7 tests, conducted daily, this test has passed 0% of the time.

-

Today's request cost $0.017

+

Today's request cost $0.016

@@ -417,16 +405,13 @@

Prompt

Image

Image of the input into GPT-4

Result

-
While the image clearly shows several cars and corresponding red bounding boxes, the labeling may or may not be complete until verified. Based on visual inspection:
-
-- A possible missing annotation exists for the white car on the far right of the image, as it doesn't appear to be enclosed in a red bounding box.
-
-### JSON Output:
-```json
+                                        
```json
 {
   "missing": 1
 }
-```
+``` + +*In the image, there seems to be a vehicle (the prominent white one on the right) without a red bounding box annotation, indicating a missing annotation.*

Test submitted by Roboflow

@@ -490,7 +475,61 @@

Result

- + +
+
+
+

Easy Captcha

+

Can GPT-4V break an easy CAPTCHA?

+
+
+
+

Fail

+
+
+
+
+
+ Last 7-Day Performance +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+
+

Of the last 7 tests, conducted daily, this test has passed 57.0% of the time.

+

Today's request cost $0.006

+
+
+ +
+

Method

+
We provide a CAPTCHA image (created using Wolfram Alpha's CAPTCHA command) and ask it to provide the input required to pass the test. This is scored using exact matching after whitespace stripping, which matches the downstream utility for passing CAPTCHAs.
+

Prompt

+
+                                            Please provide the string required to pass this CAPTCHA. Do not respond with anything else. Do not include whitespace besides spaces between words.
+                                        
+

Image

+ Image of the input into GPT-4 +

Result

+
```charybdis indubitable```
+

Test submitted by Charles Frye

+
+
+
+
@@ -642,7 +681,7 @@

Structured Data OCR

Of the last 7 tests, conducted daily, this test has passed 100% of the time.

-

Today's request cost $0.009

+

Today's request cost $0.007

@@ -715,61 +754,7 @@

Result

- -
-
-
-

Easy Captcha

-

Can GPT-4V break an easy CAPTCHA?

-
-
-
-

Pass

-
-
-
-
-
- Last 7-Day Performance -
- -
- -
- -
- -
- -
- -
- -
- -
-
-

Of the last 7 tests, conducted daily, this test has passed 57.0% of the time.

-

Today's request cost $0.006

-
-
- -
-

Method

-
We provide a CAPTCHA image (created using Wolfram Alpha's CAPTCHA command) and ask it to provide the input required to pass the test. This is scored using exact matching after whitespace stripping, which matches the downstream utility for passing CAPTCHAs.
-

Prompt

-
-                                            Please provide the string required to pass this CAPTCHA. Do not respond with anything else. Do not include whitespace besides spaces between words.
-                                        
-

Image

- Image of the input into GPT-4 -

Result

-
charybdis indubitable
-

Test submitted by Charles Frye

-
-
-
- +
diff --git a/results/2025-01-18.json b/results/2025-01-18.json new file mode 100644 index 0000000..1237e6d --- /dev/null +++ b/results/2025-01-18.json @@ -0,0 +1,106 @@ +{ + "zero_shot_classification": { + "score": 1, + "success": true, + "price": 0.006400000000000001, + "pass_fail": "Pass", + "response_time": 1.6951072216033936, + "result": "Toyota Camry" + }, + "count_fruit": { + "score": 0, + "success": false, + "price": 0.00882, + "pass_fail": "Fail", + "response_time": 2.0715999603271484, + "result": "8" + }, + "document_ocr": { + "score": 0, + "success": false, + "price": 0.00988, + "pass_fail": "Fail", + "response_time": 2.3308000564575195, + "result": "I was thinking earlier today that I have gone through, to use the lingo, eras of listening to each of Swift's Eras. Meta indeed. I started listening to Ms. Swift's music after hearing the *Midnights* album. A few weeks after hearing the album for the first time, I found myself playing various songs on repeat. I listened to the album in order multiple times." + }, + "handwriting_ocr": { + "score": 1, + "success": true, + "price": 0.00974, + "pass_fail": "Pass", + "response_time": 7.521013259887695, + "result": "The words of songs on the album have been echoing in my head all week. \"Fades into the grey of my day old tea.\"" + }, + "extraction_ocr": { + "score": 1.0, + "success": true, + "price": 0.00719, + "pass_fail": "Pass", + "response_time": 2.187105178833008, + "result": "[{'name': 'Mary Thomas', 'time_per_day': 1, 'medication': 'Atenolol', 'dosage': 100, 'rx_number': '1234567-12345'}]" + }, + "math_ocr": { + "score": 1.0, + "success": true, + "price": 0.015070000000000002, + "pass_fail": "Pass", + "response_time": 2.2390499114990234, + "result": "3x^2-6x+2" + }, + "object_detection": { + "score": 0.4267425320056899, + "success": false, + "price": 0.01044, + "pass_fail": "Fail", + "response_time": 6.408750295639038, + "result": "{'x': 0.5, 'y': 0.4, 'width': 0.3, 'height': 0.2}" + }, + "graph_understanding": { + "score": 0.99, + "success": false, + "price": 0.011260000000000001, + "pass_fail": "Fail", + "response_time": 1.8658523559570312, + "result": "```json\n{\n \"A\": { \"quantity\": 20, \"price\": 10 },\n \"B\": { \"quantity\": 25, \"price\": 20 },\n \"C\": { \"quantity\": 30, \"price\": 30 },\n \"D\": { \"quantity\": 35, \"price\": 40 }\n}\n```" + }, + "color_recognition": { + "score": 0.9895424836601308, + "success": false, + "price": 0.009850000000000001, + "pass_fail": "Fail", + "response_time": 1.5590574741363525, + "result": "```json\n{\n \"R\": 80,\n \"G\": 0,\n \"B\": 153\n}\n```" + }, + "annotation_qa": { + "score": 0.33333333333333337, + "success": false, + "price": 0.01607, + "pass_fail": "Fail", + "response_time": 2.9817733764648438, + "result": "```json\n{\n \"missing\": 1\n}\n``` \n\n*In the image, there seems to be a vehicle (the prominent white one on the right) without a red bounding box annotation, indicating a missing annotation.*" + }, + "measurement": { + "score": 0.8571428571428572, + "success": false, + "price": 0.009720000000000001, + "pass_fail": "Fail", + "response_time": 3.212393045425415, + "result": "```json\n{\n \"length\": 3.0,\n \"width\": 3.0\n}\n```" + }, + "easy_captcha": { + "score": 0, + "success": false, + "price": 0.00642, + "pass_fail": "Fail", + "response_time": 1.9321823120117188, + "result": "```charybdis indubitable```" + }, + "easy_captcha_persuade": { + "score": 1, + "success": true, + "price": 0.006860000000000001, + "pass_fail": "Pass", + "response_time": 1.4749665260314941, + "result": "charybdis indubitable" + } +} \ No newline at end of file