| benchmark_o1mini-3d |
openai/o1-mini-2024-09-12 |
50 |
96.0% (86.5% - 98.9%) |
6.4 ± 2.0 |
8.7% |
| benchmark_sonnet35new-3d |
openrouter/anthropic/claude-3.5-sonnet |
50 |
78.0% (64.8% - 87.2%) |
6.8 ± 2.3 |
0.0% |
| benchmark_4o-3d |
openai/gpt-4o-2024-08-06 |
50 |
50.0% (36.6% - 63.4%) |
7.6 ± 2.4 |
0.2% |
| benchmark_llama31-405bi-3d |
openrouter/meta-llama/llama-3.1-405b-instruct |
50 |
42.0% (29.4% - 55.8%) |
6.8 ± 2.0 |
4.1% |
| benchmark_4t-3d |
openai/gpt-4-turbo-2024-04-09 |
50 |
42.0% (29.4% - 55.8%) |
8.5 ± 2.6 |
0.2% |
| benchmark_4o-mini-3d |
openai/gpt-4o-mini-2024-07-18 |
50 |
34.0% (22.4% - 47.8%) |
8.6 ± 3.1 |
0.0% |
| benchmark_dschat25-3d |
openrouter/deepseek/deepseek-chat |
50 |
34.0% (22.4% - 47.8%) |
6.4 ± 2.3 |
3.6% |
| benchmark_geminipro15-002-3d |
openrouter/google/gemini-pro-1.5 |
50 |
26.0% (15.9% - 39.6%) |
7.5 ± 3.0 |
0.7% |
| benchmark_flash15-002-3d |
openrouter/google/gemini-flash-1.5 |
50 |
14.0% (7.0% - 26.2%) |
7.1 ± 1.8 |
0.9% |
| benchmark_llama31-8bi-3d |
openrouter/meta-llama/llama-3.1-8b-instruct |
50 |
8.0% (3.2% - 18.8%) |
5.8 ± 1.8 |
9.8% |
| benchmark_haiku3-3d |
anthropic/claude-3-haiku-20240307 |
50 |
8.0% (3.2% - 18.8%) |
7.2 ± 3.3 |
2.8% |