tb-anti-cheating-full

Public
801/801 trials completed|76 errors|avg reward 0.69
harbor job download 032bd436-5e83-4b44-b1a8-e644a305d002
terminal-bench/adaptive-rejection-samplerclaude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
0
9m 47s
-
1,897,816
135,777
1,724,559
$5.34
terminal-bench/adaptive-rejection-samplercodex0.122.0openaigpt-5.4-
0.33
3
0
6m 32s
-
2,525,738
63,918
2,221,696
$2.27
terminal-bench/adaptive-rejection-samplerterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
0
6m 23s
-
114,512
111,045
28,563
$1.51
terminal-bench/bn-fit-modifyclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
2m 31s
-
1,220,797
20,549
1,124,218
$1.68
terminal-bench/bn-fit-modifycodex0.122.0openaigpt-5.4-
1.00
3
0
2m 55s
-
1,235,369
25,039
1,160,832
$0.85
terminal-bench/bn-fit-modifyterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
3m 59s
-
133,325
61,427
40,608
$0.93
terminal-bench/break-filter-js-from-htmlclaude-code2.1.118anthropicclaude-opus-4-7-
0.67
3
1
5m 32s
NonZeroAgentExitCodeError
2,219,973
60,183
2,128,992
$3.14
terminal-bench/break-filter-js-from-htmlcodex0.122.0openaigpt-5.4-
1.00
3
0
6m 27s
-
3,111,081
46,841
2,964,736
$1.81
terminal-bench/break-filter-js-from-htmlterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
7m 18s
-
536,104
84,265
263,268
$1.61
terminal-bench/build-cython-extclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
5m 11s
-
7,856,086
39,118
7,665,709
$6.00
terminal-bench/build-cython-extcodex0.122.0openaigpt-5.4-
1.00
3
0
5m 27s
-
8,626,439
40,965
8,180,096
$3.78
terminal-bench/build-cython-extterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
0
4m 58s
-
520,403
27,498
305,357
$0.82
terminal-bench/build-pmarsclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
1m 43s
-
1,636,726
11,110
1,588,632
$1.37
terminal-bench/build-pmarscodex0.122.0openaigpt-5.4-
1.00
3
0
3m 31s
-
4,406,107
32,222
4,041,088
$2.41
terminal-bench/build-pmarsterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
0
2m 12s
-
201,640
20,673
97,833
$0.48
terminal-bench/build-pov-rayclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
3m 18s
-
3,631,629
18,344
3,498,049
$3.22
terminal-bench/build-pov-raycodex0.122.0openaigpt-5.4-
1.00
3
0
2m 57s
-
2,632,287
22,208
2,286,848
$1.77
terminal-bench/build-pov-rayterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
0
5m 12s
-
811,766
39,903
565,656
$1.08
terminal-bench/caffe-cifar-10claude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
3
22m 32s
AgentTimeoutError +1 more
2,287,437
18,159
2,180,694
-
terminal-bench/caffe-cifar-10codex0.122.0openaigpt-5.4-
0.50
3
1
34m 35s
AgentTimeoutError
14,805,699
42,598
13,954,560
$6.26
terminal-bench/caffe-cifar-10terminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
3
21m 17s
AgentTimeoutError
1,274,347
35,000
926,749
$1.30
terminal-bench/cancel-async-tasksclaude-code2.1.118anthropicclaude-opus-4-7-
0.33
3
0
3m 6s
-
885,272
37,142
825,479
$1.71
terminal-bench/cancel-async-taskscodex0.122.0openaigpt-5.4-
1.00
3
0
1m 29s
-
250,644
8,709
234,752
$0.23
terminal-bench/cancel-async-tasksterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
0
5m 10s
-
209,538
69,076
52,845
$1.15
terminal-bench/chess-best-moveclaude-code2.1.118anthropicclaude-opus-4-7-
0.33
3
0
3m 29s
-
318,126
39,985
255,812
$1.52
terminal-bench/chess-best-movecodex0.122.0openaigpt-5.4-
0.67
3
0
2m 4s
-
855,962
14,494
751,488
$0.67
terminal-bench/chess-best-moveterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
1
12m 46s
AgentTimeoutError
547,518
213,513
317,625
$3.09
terminal-bench/circuit-fibsqrtclaude-code2.1.118anthropicclaude-opus-4-7-
0.67
3
1
42m 52s
NonZeroAgentExitCodeError
1,401,343
610,894
1,058,618
$11.22
terminal-bench/circuit-fibsqrtcodex0.122.0openaigpt-5.4-
1.00
3
0
7m 50s
-
1,536,340
75,204
1,485,696
$1.63
terminal-bench/circuit-fibsqrtterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
13m 8s
-
329,619
215,169
138,841
$2.99
terminal-bench/cobol-modernizationclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
3m 21s
-
1,892,255
40,634
1,813,156
$2.42
terminal-bench/cobol-modernizationcodex0.122.0openaigpt-5.4-
1.00
3
0
3m 6s
-
879,940
32,424
839,808
$0.80
terminal-bench/cobol-modernizationterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
7m 22s
-
339,821
140,724
174,959
$2.05
terminal-bench/code-from-imageclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
1m 4s
-
274,209
1,908
248,210
$0.33
terminal-bench/code-from-imagecodex0.122.0openaigpt-5.4-
1.00
3
0
1m 7s
-
176,491
2,788
145,152
$0.16
terminal-bench/code-from-imageterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
1m 55s
-
82,238
12,442
0
$0.31
terminal-bench/compile-compcertclaude-code2.1.118anthropicclaude-opus-4-7-
0.67
3
1
30m 7s
AgentTimeoutError
4,366,736
29,326
4,149,792
$3.17
terminal-bench/compile-compcertcodex0.122.0openaigpt-5.4-
1.00
3
0
13m 49s
-
10,637,457
41,497
10,148,608
$4.38
terminal-bench/compile-compcertterminus-22.0.0geminigemini-3.1-pro-preview-
0.33
3
0
26m 15s
-
1,389,900
47,773
1,052,262
$1.46
terminal-bench/configure-git-webserverclaude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
0
2m 26s
-
1,699,341
15,099
1,638,018
$1.58
terminal-bench/configure-git-webservercodex0.122.0openaigpt-5.4-
1.00
3
0
7m 2s
-
2,052,755
36,875
1,907,968
$1.39
terminal-bench/configure-git-webserverterminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
0
2m 36s
-
111,998
30,317
32,521
$0.53
terminal-bench/constraints-schedulingclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
1m 34s
-
380,156
14,910
334,059
$0.83
terminal-bench/constraints-schedulingcodex0.122.0openaigpt-5.4-
1.00
3
0
1m 45s
-
363,804
15,210
341,120
$0.37
terminal-bench/constraints-schedulingterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
2m 15s
-
29,521
43,432
0
$0.58
terminal-bench/count-dataset-tokensclaude-code2.1.118anthropicclaude-opus-4-7-
0.67
3
0
2m 18s
-
1,163,320
15,886
1,083,962
$1.46
terminal-bench/count-dataset-tokenscodex0.122.0openaigpt-5.4-
1.00
3
0
2m 32s
-
1,397,161
19,999
1,239,296
$1.00
terminal-bench/count-dataset-tokensterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
8m 49s
-
1,334,635
79,060
995,408
$1.83
terminal-bench/crack-7z-hashclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
3m 29s
-
1,186,311
6,654
1,149,214
$0.97
terminal-bench/crack-7z-hashcodex0.122.0openaigpt-5.4-
0.67
3
1
8m 32s
AgentTimeoutError
6,918,491
39,559
6,683,776
$2.85
terminal-bench/crack-7z-hashterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
9m 42s
-
584,457
46,728
323,751
$1.15
terminal-bench/custom-memory-heap-crashclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
6m 32s
-
4,565,183
71,573
4,388,573
$5.09
terminal-bench/custom-memory-heap-crashcodex0.122.0openaigpt-5.4-
1.00
3
0
4m 23s
-
2,785,202
23,069
2,419,712
$1.86
terminal-bench/custom-memory-heap-crashterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
3m 48s
-
260,162
43,137
118,043
$0.83
terminal-bench/db-wal-recoveryclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
1m 3s
-
656,575
6,460
621,513
$0.69
terminal-bench/db-wal-recoverycodex0.122.0openaigpt-5.4-
0.33
3
0
4m 33s
-
4,305,219
42,039
4,030,592
$2.32
terminal-bench/db-wal-recoveryterminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
2
15m 6s
AgentTimeoutError
1,472,047
224,485
1,067,541
$3.72
terminal-bench/distribution-searchclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
6m 6s
-
760,502
92,005
633,818
$3.41
terminal-bench/distribution-searchcodex0.122.0openaigpt-5.4-
1.00
3
0
2m 33s
-
589,873
24,319
568,448
$0.56
terminal-bench/distribution-searchterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
2m 17s
-
64,012
34,053
0
$0.54
terminal-bench/dna-assemblyclaude-code2.1.118anthropicclaude-opus-4-7-
0.33
3
2
28m 30s
AgentTimeoutError
4,358,266
393,135
3,710,312
$5.24
terminal-bench/dna-assemblycodex0.122.0openaigpt-5.4-
0.00
3
0
10m 21s
-
3,849,252
103,617
3,612,416
$3.05
terminal-bench/dna-assemblyterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
13m 33s
-
511,817
239,000
262,736
$3.42
terminal-bench/dna-insertclaude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
0
5m 8s
-
2,320,604
58,917
2,205,377
$3.30
terminal-bench/dna-insertcodex0.122.0openaigpt-5.4-
0.00
3
0
3m 17s
-
1,099,520
28,766
962,688
$1.01
terminal-bench/dna-insertterminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
0
4m 32s
-
232,141
72,481
109,902
$1.14
terminal-bench/extract-elfclaude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
0
2m 48s
-
1,081,988
28,569
1,017,553
$1.63
terminal-bench/extract-elfcodex0.122.0openaigpt-5.4-
1.00
3
0
4m 49s
-
1,361,093
45,853
1,302,144
$1.16
terminal-bench/extract-elfterminus-22.0.0geminigemini-3.1-pro-preview-
0.33
3
0
4m 47s
-
158,202
89,267
28,439
$1.34
terminal-bench/extract-moves-from-videoclaude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
3
22m 20s
AgentTimeoutError +1 more
13,413,863
72,374
13,199,405
-
terminal-bench/extract-moves-from-videocodex0.122.0openaigpt-5.4-
0.00
3
3
30m 34s
AgentTimeoutError
37,376,460
108,143
35,602,560
$14.96
terminal-bench/extract-moves-from-videoterminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
1
25m 7s
AgentTimeoutError
3,001,604
98,733
2,553,827
$2.59
terminal-bench/feal-differential-cryptanalysisclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
5m 25s
-
904,656
77,080
803,401
$2.96
terminal-bench/feal-differential-cryptanalysiscodex0.122.0openaigpt-5.4-
1.00
3
0
6m 9s
-
2,174,972
41,827
2,089,984
$1.36
terminal-bench/feal-differential-cryptanalysisterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
3m 38s
-
71,489
52,705
8,145
$0.76
terminal-bench/feal-linear-cryptanalysisclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
7m 33s
-
2,120,230
114,355
1,882,827
$5.28
terminal-bench/feal-linear-cryptanalysiscodex0.122.0openaigpt-5.4-
1.00
3
0
3m 51s
-
1,169,275
38,371
1,109,120
$1.00
terminal-bench/feal-linear-cryptanalysisterminus-22.0.0geminigemini-3.1-pro-preview-
0.33
3
2
22m 12s
AgentTimeoutError
1,142,433
424,435
753,930
$6.02
terminal-bench/filter-js-from-htmlclaude-code2.1.118anthropicclaude-opus-4-7-
0.00
3
0
11m 1s
-
913,828
68,245
823,297
$2.68
terminal-bench/filter-js-from-htmlcodex0.122.0openaigpt-5.4-
0.00
3
1
23m 45s
VerifierTimeoutError
1,053,146
56,947
1,005,824
$1.22
terminal-bench/filter-js-from-htmlterminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
1
27m 4s
VerifierTimeoutError
193,023
132,785
32,527
$1.92
terminal-bench/financial-document-processorclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
3m 45s
-
3,688,922
25,720
3,483,579
$3.67
terminal-bench/financial-document-processorcodex0.122.0openaigpt-5.4-
1.00
3
0
3m 50s
-
2,910,718
29,503
2,422,272
$2.27
terminal-bench/financial-document-processorterminus-22.0.0geminigemini-3.1-pro-preview-
0.67
3
0
6m 21s
-
600,596
69,187
374,751
$1.36
terminal-bench/fix-code-vulnerabilityclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
50s
-
612,497
4,212
576,555
$0.62
terminal-bench/fix-code-vulnerabilitycodex0.122.0openaigpt-5.4-
1.00
3
0
1m 36s
-
1,425,735
12,113
1,221,248
$1.00
terminal-bench/fix-code-vulnerabilityterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
2m 3s
-
270,748
21,026
122,135
$0.57
terminal-bench/fix-gitclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
1m 23s
-
755,787
10,125
715,687
$0.88
terminal-bench/fix-gitcodex0.122.0openaigpt-5.4-
1.00
3
0
1m 35s
-
715,125
12,964
640,384
$0.54
terminal-bench/fix-gitterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
1m 43s
-
83,094
19,137
0
$0.40
terminal-bench/fix-ocaml-gcclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
21m 6s
-
6,797,045
53,888
6,355,369
$7.29
terminal-bench/fix-ocaml-gccodex0.122.0openaigpt-5.4-
1.00
3
0
22m 59s
-
12,363,658
39,241
11,630,592
$5.33
terminal-bench/fix-ocaml-gcterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
21m 59s
-
2,088,165
42,802
1,666,847
$1.69
terminal-bench/gcode-to-textclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
7m 2s
-
4,432,217
74,791
4,260,678
$5.07
terminal-bench/gcode-to-textcodex0.122.0openaigpt-5.4-
0.33
3
1
12m 34s
AgentTimeoutError
7,004,132
103,749
6,683,904
$4.03
terminal-bench/gcode-to-textterminus-22.0.0geminigemini-3.1-pro-preview-
0.00
3
1
9m 56s
AgentTimeoutError
1,628,027
129,695
1,277,083
$2.51
terminal-bench/git-leak-recoveryclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
1m 7s
-
838,045
5,964
807,746
$0.74
terminal-bench/git-leak-recoverycodex0.122.0openaigpt-5.4-
1.00
3
0
1m 25s
-
388,903
9,919
363,776
$0.30
terminal-bench/git-leak-recoveryterminus-22.0.0geminigemini-3.1-pro-preview-
1.00
3
0
1m 51s
-
56,515
20,677
0
$0.36
terminal-bench/git-multibranchclaude-code2.1.118anthropicclaude-opus-4-7-
1.00
3
0
2m 47s
-
3,365,104
22,766
3,284,219
$2.72
Showing 1-100 of 267 tasks