tb-anti-cheating-full
Public
801/801 trials completed | 76 errors | avg reward 0.69
j/k navigate · Enter open · Esc go back · y copy
⌘K
harbor job download 032bd436-5e83-4b44-b1a8-e644a305d002
| terminal-bench/adaptive-rejection-sampler | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 0 | 9m 47s | - | 1,897,816 | 135,777 | 1,724,559 | $5.34 |
| terminal-bench/adaptive-rejection-sampler | codex | 0.122.0 | openai | gpt-5.4 | - | 0.33 | 3 | 0 | 6m 32s | - | 2,525,738 | 63,918 | 2,221,696 | $2.27 |
| terminal-bench/adaptive-rejection-sampler | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 0 | 6m 23s | - | 114,512 | 111,045 | 28,563 | $1.51 |
| terminal-bench/bn-fit-modify | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 2m 31s | - | 1,220,797 | 20,549 | 1,124,218 | $1.68 |
| terminal-bench/bn-fit-modify | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 2m 55s | - | 1,235,369 | 25,039 | 1,160,832 | $0.85 |
| terminal-bench/bn-fit-modify | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 3m 59s | - | 133,325 | 61,427 | 40,608 | $0.93 |
| terminal-bench/break-filter-js-from-html | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 3 | 1 | 5m 32s | NonZeroAgentExitCodeError | 2,219,973 | 60,183 | 2,128,992 | $3.14 |
| terminal-bench/break-filter-js-from-html | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 6m 27s | - | 3,111,081 | 46,841 | 2,964,736 | $1.81 |
| terminal-bench/break-filter-js-from-html | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 7m 18s | - | 536,104 | 84,265 | 263,268 | $1.61 |
| terminal-bench/build-cython-ext | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 5m 11s | - | 7,856,086 | 39,118 | 7,665,709 | $6.00 |
| terminal-bench/build-cython-ext | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 5m 27s | - | 8,626,439 | 40,965 | 8,180,096 | $3.78 |
| terminal-bench/build-cython-ext | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 0 | 4m 58s | - | 520,403 | 27,498 | 305,357 | $0.82 |
| terminal-bench/build-pmars | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 1m 43s | - | 1,636,726 | 11,110 | 1,588,632 | $1.37 |
| terminal-bench/build-pmars | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 3m 31s | - | 4,406,107 | 32,222 | 4,041,088 | $2.41 |
| terminal-bench/build-pmars | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 0 | 2m 12s | - | 201,640 | 20,673 | 97,833 | $0.48 |
| terminal-bench/build-pov-ray | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 3m 18s | - | 3,631,629 | 18,344 | 3,498,049 | $3.22 |
| terminal-bench/build-pov-ray | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 2m 57s | - | 2,632,287 | 22,208 | 2,286,848 | $1.77 |
| terminal-bench/build-pov-ray | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 0 | 5m 12s | - | 811,766 | 39,903 | 565,656 | $1.08 |
| terminal-bench/caffe-cifar-10 | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 3 | 22m 32s | AgentTimeoutError +1 more | 2,287,437 | 18,159 | 2,180,694 | - |
| terminal-bench/caffe-cifar-10 | codex | 0.122.0 | openai | gpt-5.4 | - | 0.50 | 3 | 1 | 34m 35s | AgentTimeoutError | 14,805,699 | 42,598 | 13,954,560 | $6.26 |
| terminal-bench/caffe-cifar-10 | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 3 | 21m 17s | AgentTimeoutError | 1,274,347 | 35,000 | 926,749 | $1.30 |
| terminal-bench/cancel-async-tasks | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.33 | 3 | 0 | 3m 6s | - | 885,272 | 37,142 | 825,479 | $1.71 |
| terminal-bench/cancel-async-tasks | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 1m 29s | - | 250,644 | 8,709 | 234,752 | $0.23 |
| terminal-bench/cancel-async-tasks | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 0 | 5m 10s | - | 209,538 | 69,076 | 52,845 | $1.15 |
| terminal-bench/chess-best-move | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.33 | 3 | 0 | 3m 29s | - | 318,126 | 39,985 | 255,812 | $1.52 |
| terminal-bench/chess-best-move | codex | 0.122.0 | openai | gpt-5.4 | - | 0.67 | 3 | 0 | 2m 4s | - | 855,962 | 14,494 | 751,488 | $0.67 |
| terminal-bench/chess-best-move | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 1 | 12m 46s | AgentTimeoutError | 547,518 | 213,513 | 317,625 | $3.09 |
| terminal-bench/circuit-fibsqrt | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 3 | 1 | 42m 52s | NonZeroAgentExitCodeError | 1,401,343 | 610,894 | 1,058,618 | $11.22 |
| terminal-bench/circuit-fibsqrt | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 7m 50s | - | 1,536,340 | 75,204 | 1,485,696 | $1.63 |
| terminal-bench/circuit-fibsqrt | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 13m 8s | - | 329,619 | 215,169 | 138,841 | $2.99 |
| terminal-bench/cobol-modernization | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 3m 21s | - | 1,892,255 | 40,634 | 1,813,156 | $2.42 |
| terminal-bench/cobol-modernization | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 3m 6s | - | 879,940 | 32,424 | 839,808 | $0.80 |
| terminal-bench/cobol-modernization | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 7m 22s | - | 339,821 | 140,724 | 174,959 | $2.05 |
| terminal-bench/code-from-image | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 1m 4s | - | 274,209 | 1,908 | 248,210 | $0.33 |
| terminal-bench/code-from-image | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 1m 7s | - | 176,491 | 2,788 | 145,152 | $0.16 |
| terminal-bench/code-from-image | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 1m 55s | - | 82,238 | 12,442 | 0 | $0.31 |
| terminal-bench/compile-compcert | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 3 | 1 | 30m 7s | AgentTimeoutError | 4,366,736 | 29,326 | 4,149,792 | $3.17 |
| terminal-bench/compile-compcert | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 13m 49s | - | 10,637,457 | 41,497 | 10,148,608 | $4.38 |
| terminal-bench/compile-compcert | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.33 | 3 | 0 | 26m 15s | - | 1,389,900 | 47,773 | 1,052,262 | $1.46 |
| terminal-bench/configure-git-webserver | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 0 | 2m 26s | - | 1,699,341 | 15,099 | 1,638,018 | $1.58 |
| terminal-bench/configure-git-webserver | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 7m 2s | - | 2,052,755 | 36,875 | 1,907,968 | $1.39 |
| terminal-bench/configure-git-webserver | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 0 | 2m 36s | - | 111,998 | 30,317 | 32,521 | $0.53 |
| terminal-bench/constraints-scheduling | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 1m 34s | - | 380,156 | 14,910 | 334,059 | $0.83 |
| terminal-bench/constraints-scheduling | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 1m 45s | - | 363,804 | 15,210 | 341,120 | $0.37 |
| terminal-bench/constraints-scheduling | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 2m 15s | - | 29,521 | 43,432 | 0 | $0.58 |
| terminal-bench/count-dataset-tokens | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 3 | 0 | 2m 18s | - | 1,163,320 | 15,886 | 1,083,962 | $1.46 |
| terminal-bench/count-dataset-tokens | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 2m 32s | - | 1,397,161 | 19,999 | 1,239,296 | $1.00 |
| terminal-bench/count-dataset-tokens | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 8m 49s | - | 1,334,635 | 79,060 | 995,408 | $1.83 |
| terminal-bench/crack-7z-hash | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 3m 29s | - | 1,186,311 | 6,654 | 1,149,214 | $0.97 |
| terminal-bench/crack-7z-hash | codex | 0.122.0 | openai | gpt-5.4 | - | 0.67 | 3 | 1 | 8m 32s | AgentTimeoutError | 6,918,491 | 39,559 | 6,683,776 | $2.85 |
| terminal-bench/crack-7z-hash | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 9m 42s | - | 584,457 | 46,728 | 323,751 | $1.15 |
| terminal-bench/custom-memory-heap-crash | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 6m 32s | - | 4,565,183 | 71,573 | 4,388,573 | $5.09 |
| terminal-bench/custom-memory-heap-crash | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 4m 23s | - | 2,785,202 | 23,069 | 2,419,712 | $1.86 |
| terminal-bench/custom-memory-heap-crash | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 3m 48s | - | 260,162 | 43,137 | 118,043 | $0.83 |
| terminal-bench/db-wal-recovery | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 1m 3s | - | 656,575 | 6,460 | 621,513 | $0.69 |
| terminal-bench/db-wal-recovery | codex | 0.122.0 | openai | gpt-5.4 | - | 0.33 | 3 | 0 | 4m 33s | - | 4,305,219 | 42,039 | 4,030,592 | $2.32 |
| terminal-bench/db-wal-recovery | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 2 | 15m 6s | AgentTimeoutError | 1,472,047 | 224,485 | 1,067,541 | $3.72 |
| terminal-bench/distribution-search | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 6m 6s | - | 760,502 | 92,005 | 633,818 | $3.41 |
| terminal-bench/distribution-search | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 2m 33s | - | 589,873 | 24,319 | 568,448 | $0.56 |
| terminal-bench/distribution-search | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 2m 17s | - | 64,012 | 34,053 | 0 | $0.54 |
| terminal-bench/dna-assembly | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.33 | 3 | 2 | 28m 30s | AgentTimeoutError | 4,358,266 | 393,135 | 3,710,312 | $5.24 |
| terminal-bench/dna-assembly | codex | 0.122.0 | openai | gpt-5.4 | - | 0.00 | 3 | 0 | 10m 21s | - | 3,849,252 | 103,617 | 3,612,416 | $3.05 |
| terminal-bench/dna-assembly | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 13m 33s | - | 511,817 | 239,000 | 262,736 | $3.42 |
| terminal-bench/dna-insert | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 0 | 5m 8s | - | 2,320,604 | 58,917 | 2,205,377 | $3.30 |
| terminal-bench/dna-insert | codex | 0.122.0 | openai | gpt-5.4 | - | 0.00 | 3 | 0 | 3m 17s | - | 1,099,520 | 28,766 | 962,688 | $1.01 |
| terminal-bench/dna-insert | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 0 | 4m 32s | - | 232,141 | 72,481 | 109,902 | $1.14 |
| terminal-bench/extract-elf | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 0 | 2m 48s | - | 1,081,988 | 28,569 | 1,017,553 | $1.63 |
| terminal-bench/extract-elf | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 4m 49s | - | 1,361,093 | 45,853 | 1,302,144 | $1.16 |
| terminal-bench/extract-elf | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.33 | 3 | 0 | 4m 47s | - | 158,202 | 89,267 | 28,439 | $1.34 |
| terminal-bench/extract-moves-from-video | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 3 | 22m 20s | AgentTimeoutError +1 more | 13,413,863 | 72,374 | 13,199,405 | - |
| terminal-bench/extract-moves-from-video | codex | 0.122.0 | openai | gpt-5.4 | - | 0.00 | 3 | 3 | 30m 34s | AgentTimeoutError | 37,376,460 | 108,143 | 35,602,560 | $14.96 |
| terminal-bench/extract-moves-from-video | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 1 | 25m 7s | AgentTimeoutError | 3,001,604 | 98,733 | 2,553,827 | $2.59 |
| terminal-bench/feal-differential-cryptanalysis | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 5m 25s | - | 904,656 | 77,080 | 803,401 | $2.96 |
| terminal-bench/feal-differential-cryptanalysis | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 6m 9s | - | 2,174,972 | 41,827 | 2,089,984 | $1.36 |
| terminal-bench/feal-differential-cryptanalysis | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 3m 38s | - | 71,489 | 52,705 | 8,145 | $0.76 |
| terminal-bench/feal-linear-cryptanalysis | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 7m 33s | - | 2,120,230 | 114,355 | 1,882,827 | $5.28 |
| terminal-bench/feal-linear-cryptanalysis | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 3m 51s | - | 1,169,275 | 38,371 | 1,109,120 | $1.00 |
| terminal-bench/feal-linear-cryptanalysis | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.33 | 3 | 2 | 22m 12s | AgentTimeoutError | 1,142,433 | 424,435 | 753,930 | $6.02 |
| terminal-bench/filter-js-from-html | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 3 | 0 | 11m 1s | - | 913,828 | 68,245 | 823,297 | $2.68 |
| terminal-bench/filter-js-from-html | codex | 0.122.0 | openai | gpt-5.4 | - | 0.00 | 3 | 1 | 23m 45s | VerifierTimeoutError | 1,053,146 | 56,947 | 1,005,824 | $1.22 |
| terminal-bench/filter-js-from-html | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 1 | 27m 4s | VerifierTimeoutError | 193,023 | 132,785 | 32,527 | $1.92 |
| terminal-bench/financial-document-processor | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 3m 45s | - | 3,688,922 | 25,720 | 3,483,579 | $3.67 |
| terminal-bench/financial-document-processor | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 3m 50s | - | 2,910,718 | 29,503 | 2,422,272 | $2.27 |
| terminal-bench/financial-document-processor | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.67 | 3 | 0 | 6m 21s | - | 600,596 | 69,187 | 374,751 | $1.36 |
| terminal-bench/fix-code-vulnerability | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 50s | - | 612,497 | 4,212 | 576,555 | $0.62 |
| terminal-bench/fix-code-vulnerability | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 1m 36s | - | 1,425,735 | 12,113 | 1,221,248 | $1.00 |
| terminal-bench/fix-code-vulnerability | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 2m 3s | - | 270,748 | 21,026 | 122,135 | $0.57 |
| terminal-bench/fix-git | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 1m 23s | - | 755,787 | 10,125 | 715,687 | $0.88 |
| terminal-bench/fix-git | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 1m 35s | - | 715,125 | 12,964 | 640,384 | $0.54 |
| terminal-bench/fix-git | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 1m 43s | - | 83,094 | 19,137 | 0 | $0.40 |
| terminal-bench/fix-ocaml-gc | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 21m 6s | - | 6,797,045 | 53,888 | 6,355,369 | $7.29 |
| terminal-bench/fix-ocaml-gc | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 22m 59s | - | 12,363,658 | 39,241 | 11,630,592 | $5.33 |
| terminal-bench/fix-ocaml-gc | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 21m 59s | - | 2,088,165 | 42,802 | 1,666,847 | $1.69 |
| terminal-bench/gcode-to-text | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 7m 2s | - | 4,432,217 | 74,791 | 4,260,678 | $5.07 |
| terminal-bench/gcode-to-text | codex | 0.122.0 | openai | gpt-5.4 | - | 0.33 | 3 | 1 | 12m 34s | AgentTimeoutError | 7,004,132 | 103,749 | 6,683,904 | $4.03 |
| terminal-bench/gcode-to-text | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 3 | 1 | 9m 56s | AgentTimeoutError | 1,628,027 | 129,695 | 1,277,083 | $2.51 |
| terminal-bench/git-leak-recovery | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 1m 7s | - | 838,045 | 5,964 | 807,746 | $0.74 |
| terminal-bench/git-leak-recovery | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 3 | 0 | 1m 25s | - | 388,903 | 9,919 | 363,776 | $0.30 |
| terminal-bench/git-leak-recovery | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 3 | 0 | 1m 51s | - | 56,515 | 20,677 | 0 | $0.36 |
| terminal-bench/git-multibranch | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 3 | 0 | 2m 47s | - | 3,365,104 | 22,766 | 3,284,219 | $2.72 |
Showing 1-100 of 267 tasks