tb-timeout-ablation
Public
1602/1602 trials completed | 159 errors | avg reward 0.70
j/k navigate · Enter open · Esc go back · y copy
⌘K
harbor job download c6bb2353-6b49-4ed2-bd34-712f528d02df
| terminal-bench/adaptive-rejection-sampler | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 6 | 0 | 9m 9s | - | 3,370,003 | 256,663 | 2,979,763 | - |
| terminal-bench/adaptive-rejection-sampler | codex | 0.122.0 | openai | gpt-5.4 | - | 0.67 | 6 | 0 | 12m 32s | - | 5,374,933 | 131,664 | 4,860,800 | - |
| terminal-bench/adaptive-rejection-sampler | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 0 | 5m 44s | - | 202,149 | 191,385 | 69,367 | $2.58 |
| terminal-bench/bn-fit-modify | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 2m 22s | - | 2,199,153 | 37,978 | 2,056,975 | - |
| terminal-bench/bn-fit-modify | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 10m 4s | - | 1,612,782 | 49,441 | 1,494,144 | - |
| terminal-bench/bn-fit-modify | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 4m 26s | - | 342,517 | 120,374 | 146,502 | $1.87 |
| terminal-bench/break-filter-js-from-html | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.83 | 6 | 1 | 6m 9s | NonZeroAgentExitCodeError | 4,120,208 | 152,618 | 3,853,829 | - |
| terminal-bench/break-filter-js-from-html | codex | 0.122.0 | openai | gpt-5.4 | - | 0.33 | 6 | 0 | 8m 13s | - | 852,230 | 21,407 | 795,776 | - |
| terminal-bench/break-filter-js-from-html | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 5m 34s | - | 661,259 | 141,060 | 289,302 | $2.49 |
| terminal-bench/build-cython-ext | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 5m 36s | - | 18,530,515 | 88,071 | 18,110,794 | - |
| terminal-bench/build-cython-ext | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 10m 10s | - | 12,923,795 | 80,026 | 12,140,928 | - |
| terminal-bench/build-cython-ext | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.50 | 6 | 0 | 3m 54s | - | 851,955 | 52,077 | 508,388 | $1.41 |
| terminal-bench/build-pmars | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 1m 44s | - | 3,500,151 | 22,390 | 3,395,446 | - |
| terminal-bench/build-pmars | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 10m 20s | - | 5,277,494 | 47,959 | 4,646,784 | - |
| terminal-bench/build-pmars | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 2m 1s | - | 376,161 | 36,147 | 171,128 | $0.88 |
| terminal-bench/build-pov-ray | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 3m 21s | - | 7,790,281 | 39,468 | 7,510,830 | - |
| terminal-bench/build-pov-ray | codex | 0.122.0 | openai | gpt-5.4 | - | 0.83 | 6 | 0 | 9m 48s | - | 10,123,686 | 63,224 | 9,266,560 | - |
| terminal-bench/build-pov-ray | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 4m 38s | - | 1,405,413 | 56,547 | 985,240 | $1.72 |
| terminal-bench/caffe-cifar-10 | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 6 | 6 | 34m 51s | AgentTimeoutError +1 more | 4,881,534 | 38,040 | 4,660,608 | - |
| terminal-bench/caffe-cifar-10 | codex | 0.122.0 | openai | gpt-5.4 | - | 0.17 | 6 | 4 | 32m 47s | AgentTimeoutError | 41,187,063 | 93,610 | 38,787,456 | - |
| terminal-bench/caffe-cifar-10 | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 6 | 5 | 31m 18s | AgentTimeoutError +1 more | 1,064,832 | 60,657 | 671,405 | $1.65 |
| terminal-bench/cancel-async-tasks | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 6 | 0 | 3m 27s | - | 2,481,564 | 68,977 | 2,365,491 | - |
| terminal-bench/cancel-async-tasks | codex | 0.122.0 | openai | gpt-5.4 | - | 0.83 | 6 | 0 | 9m 4s | - | 413,967 | 19,148 | 390,144 | - |
| terminal-bench/cancel-async-tasks | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 4m 9s | - | 156,844 | 118,366 | 8,118 | $1.72 |
| terminal-bench/chess-best-move | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.50 | 6 | 0 | 3m 15s | - | 648,186 | 74,826 | 529,749 | - |
| terminal-bench/chess-best-move | codex | 0.122.0 | openai | gpt-5.4 | - | 0.83 | 6 | 0 | 7m 58s | - | 1,632,010 | 27,128 | 1,436,416 | - |
| terminal-bench/chess-best-move | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.50 | 6 | 2 | 11m 59s | AgentTimeoutError | 1,698,164 | 366,664 | 1,122,790 | $5.78 |
| terminal-bench/circuit-fibsqrt | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 6 | 2 | 36m 3s | NonZeroAgentExitCodeError | 5,590,378 | 1,010,096 | 4,978,753 | - |
| terminal-bench/circuit-fibsqrt | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 15m 5s | - | 4,437,910 | 131,031 | 4,157,696 | - |
| terminal-bench/circuit-fibsqrt | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 0 | 8m 47s | - | 274,970 | 323,201 | 84,488 | $4.28 |
| terminal-bench/cobol-modernization | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 4m 9s | - | 3,888,354 | 106,222 | 3,696,290 | - |
| terminal-bench/cobol-modernization | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 9m 23s | - | 2,161,032 | 71,380 | 2,073,984 | - |
| terminal-bench/cobol-modernization | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 6m 40s | - | 594,492 | 244,695 | 260,287 | $3.66 |
| terminal-bench/code-from-image | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 40s | - | 523,451 | 3,249 | 472,225 | - |
| terminal-bench/code-from-image | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 1 | 7m 14s | AgentSetupTimeoutError | 339,819 | 4,492 | 252,160 | - |
| terminal-bench/code-from-image | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 1m 25s | - | 146,874 | 25,450 | 0 | $0.60 |
| terminal-bench/compile-compcert | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 6 | 4 | 37m 46s | AgentTimeoutError | 12,736,618 | 84,062 | 12,091,532 | - |
| terminal-bench/compile-compcert | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 1 | 31m 51s | AgentTimeoutError | 39,177,213 | 112,028 | 37,918,080 | - |
| terminal-bench/compile-compcert | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.17 | 6 | 1 | 27m 32s | AgentTimeoutError | 4,302,469 | 112,855 | 3,674,249 | $3.35 |
| terminal-bench/configure-git-webserver | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.50 | 6 | 0 | 1m 57s | - | 2,627,170 | 22,956 | 2,530,503 | - |
| terminal-bench/configure-git-webserver | codex | 0.122.0 | openai | gpt-5.4 | - | 0.00 | 6 | 0 | 9m 55s | - | 2,490,430 | 61,560 | 2,342,016 | - |
| terminal-bench/configure-git-webserver | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.33 | 6 | 0 | 2m 28s | - | 211,089 | 58,359 | 32,526 | $1.06 |
| terminal-bench/constraints-scheduling | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 1m 40s | - | 667,041 | 33,791 | 570,001 | - |
| terminal-bench/constraints-scheduling | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 9m 11s | - | 556,117 | 24,662 | 494,848 | - |
| terminal-bench/constraints-scheduling | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 2m 50s | - | 139,496 | 98,202 | 0 | $1.46 |
| terminal-bench/count-dataset-tokens | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.67 | 6 | 0 | 2m 7s | - | 1,958,936 | 21,857 | 1,827,885 | - |
| terminal-bench/count-dataset-tokens | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 8m 26s | - | 2,663,691 | 35,958 | 2,385,152 | - |
| terminal-bench/count-dataset-tokens | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 8m 13s | - | 2,086,554 | 165,227 | 1,543,540 | $3.38 |
| terminal-bench/crack-7z-hash | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 4m 11s | - | 2,595,181 | 14,329 | 2,519,134 | - |
| terminal-bench/crack-7z-hash | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 12m 5s | - | 6,321,748 | 51,038 | 5,937,408 | - |
| terminal-bench/crack-7z-hash | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 1 | 11m 36s | AgentTimeoutError | 1,865,382 | 145,200 | 1,176,345 | $3.36 |
| terminal-bench/custom-memory-heap-crash | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 5m 34s | - | 10,125,833 | 112,154 | 9,831,417 | - |
| terminal-bench/custom-memory-heap-crash | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 1 | 9m 0s | AgentSetupTimeoutError | 3,002,652 | 30,932 | 2,704,640 | - |
| terminal-bench/custom-memory-heap-crash | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 3m 59s | - | 602,414 | 99,938 | 288,287 | $1.89 |
| terminal-bench/db-wal-recovery | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.17 | 6 | 0 | 5m 25s | - | 8,827,110 | 86,957 | 8,556,536 | - |
| terminal-bench/db-wal-recovery | codex | 0.122.0 | openai | gpt-5.4 | - | 0.83 | 6 | 0 | 10m 44s | - | 6,884,894 | 79,763 | 6,037,504 | - |
| terminal-bench/db-wal-recovery | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.20 | 6 | 5 | 11m 22s | AgentTimeoutError +1 more | 1,525,608 | 320,191 | 1,089,349 | $4.93 |
| terminal-bench/distribution-search | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 6m 29s | - | 1,428,105 | 199,583 | 1,240,919 | - |
| terminal-bench/distribution-search | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 9m 27s | - | 1,394,902 | 45,096 | 1,304,320 | - |
| terminal-bench/distribution-search | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 1 | 1m 43s | DaytonaError | 75,968 | 49,710 | 0 | $0.75 |
| terminal-bench/dna-assembly | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.17 | 6 | 5 | 28m 39s | AgentTimeoutError | 7,301,138 | 746,985 | 6,153,630 | - |
| terminal-bench/dna-assembly | codex | 0.122.0 | openai | gpt-5.4 | - | 0.20 | 6 | 1 | 13m 37s | AgentSetupTimeoutError | 6,674,700 | 146,677 | 6,150,272 | - |
| terminal-bench/dna-assembly | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 0 | 12m 34s | - | 636,833 | 447,701 | 268,965 | $6.16 |
| terminal-bench/dna-insert | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.17 | 6 | 0 | 5m 25s | - | 4,326,000 | 121,426 | 4,049,550 | - |
| terminal-bench/dna-insert | codex | 0.122.0 | openai | gpt-5.4 | - | 0.17 | 6 | 0 | 9m 2s | - | 2,251,709 | 71,206 | 2,091,648 | - |
| terminal-bench/dna-insert | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.17 | 6 | 0 | 4m 40s | - | 475,089 | 148,288 | 244,268 | $2.29 |
| terminal-bench/extract-elf | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.20 | 6 | 1 | 2m 19s | DaytonaError | 1,708,146 | 52,428 | 1,595,676 | - |
| terminal-bench/extract-elf | codex | 0.122.0 | openai | gpt-5.4 | - | 0.33 | 6 | 0 | 12m 34s | - | 2,287,066 | 89,348 | 2,147,840 | - |
| terminal-bench/extract-elf | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 0 | 4m 49s | - | 277,347 | 168,935 | 85,603 | $2.43 |
| terminal-bench/extract-moves-from-video | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 6 | 3 | 16m 16s | AgentTimeoutError +1 more | 14,552,885 | 74,139 | 14,264,914 | - |
| terminal-bench/extract-moves-from-video | codex | 0.122.0 | openai | gpt-5.4 | - | 0.67 | 6 | 3 | 27m 44s | AgentTimeoutError | 59,847,394 | 186,272 | 56,355,968 | - |
| terminal-bench/extract-moves-from-video | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 6 | 2 | 24m 13s | AgentTimeoutError | 5,510,541 | 201,253 | 4,537,373 | $5.27 |
| terminal-bench/feal-differential-cryptanalysis | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 1 | 4m 23s | DaytonaError | 1,198,678 | 123,730 | 1,027,092 | - |
| terminal-bench/feal-differential-cryptanalysis | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 14m 17s | - | 8,706,927 | 117,048 | 8,460,288 | - |
| terminal-bench/feal-differential-cryptanalysis | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 4m 42s | - | 171,530 | 144,549 | 0 | $2.08 |
| terminal-bench/feal-linear-cryptanalysis | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 8m 41s | - | 3,465,512 | 269,812 | 3,049,967 | - |
| terminal-bench/feal-linear-cryptanalysis | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 1 | 9m 6s | DownloadVerifierDirError | 2,126,894 | 66,248 | 2,045,312 | - |
| terminal-bench/feal-linear-cryptanalysis | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 0 | 6m 13s | - | 202,319 | 247,323 | 48,947 | $3.28 |
| terminal-bench/filter-js-from-html | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.00 | 6 | 0 | 12m 41s | - | 2,202,789 | 142,262 | 1,999,693 | - |
| terminal-bench/filter-js-from-html | codex | 0.122.0 | openai | gpt-5.4 | - | 0.00 | 6 | 0 | 18m 1s | - | 1,547,018 | 100,505 | 1,477,632 | - |
| terminal-bench/filter-js-from-html | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.00 | 6 | 0 | 14m 32s | - | 173,520 | 258,377 | 16,262 | $3.42 |
| terminal-bench/financial-document-processor | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.83 | 6 | 0 | 3m 46s | - | 7,551,176 | 49,750 | 7,039,369 | - |
| terminal-bench/financial-document-processor | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 11m 35s | - | 6,375,370 | 66,778 | 5,609,088 | - |
| terminal-bench/financial-document-processor | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.83 | 6 | 0 | 6m 35s | - | 1,220,965 | 173,956 | 839,161 | $3.02 |
| terminal-bench/fix-code-vulnerability | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 1m 7s | - | 1,588,115 | 11,232 | 1,503,252 | - |
| terminal-bench/fix-code-vulnerability | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 9m 55s | - | 2,228,868 | 25,232 | 1,881,728 | - |
| terminal-bench/fix-code-vulnerability | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 2m 4s | - | 408,262 | 46,842 | 150,742 | $1.11 |
| terminal-bench/fix-git | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 1m 19s | - | 1,302,438 | 20,111 | 1,227,729 | - |
| terminal-bench/fix-git | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 8m 28s | - | 1,535,407 | 23,927 | 1,368,960 | - |
| terminal-bench/fix-git | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 1m 28s | - | 97,900 | 30,356 | 0 | $0.56 |
| terminal-bench/fix-ocaml-gc | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 20m 43s | - | 11,489,069 | 100,261 | 10,571,796 | - |
| terminal-bench/fix-ocaml-gc | codex | 0.122.0 | openai | gpt-5.4 | - | 1.00 | 6 | 0 | 29m 23s | - | 36,484,162 | 104,404 | 34,888,320 | - |
| terminal-bench/fix-ocaml-gc | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 21m 57s | - | 4,733,580 | 72,535 | 3,996,139 | $3.14 |
| terminal-bench/gcode-to-text | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 5m 19s | - | 6,033,398 | 125,133 | 5,767,948 | - |
| terminal-bench/gcode-to-text | codex | 0.122.0 | openai | gpt-5.4 | - | 0.17 | 6 | 1 | 15m 55s | AgentTimeoutError | 16,554,758 | 162,763 | 15,355,520 | - |
| terminal-bench/gcode-to-text | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 0.17 | 6 | 0 | 8m 41s | - | 1,586,319 | 325,514 | 1,140,995 | $5.03 |
| terminal-bench/git-leak-recovery | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 1.00 | 6 | 0 | 1m 5s | - | 1,638,713 | 12,251 | 1,577,855 | - |
| terminal-bench/git-leak-recovery | codex | 0.122.0 | openai | gpt-5.4 | - | 0.83 | 6 | 0 | 7m 21s | - | 796,336 | 20,280 | 767,232 | - |
| terminal-bench/git-leak-recovery | terminus-2 | 2.0.0 | gemini | gemini-3.1-pro-preview | - | 1.00 | 6 | 0 | 1m 35s | - | 72,297 | 30,251 | 0 | $0.51 |
| terminal-bench/git-multibranch | claude-code | 2.1.118 | anthropic | claude-opus-4-7 | - | 0.83 | 6 | 0 | 2m 58s | - | 5,980,741 | 43,395 | 5,787,052 | - |
Showing 1-100 of 267 tasks