"
],
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [5609/8400 1:13:39 < 40:17, 1.15 it/s, Epoch 33.38/50]\n",
"
\n",
" \n",
" \n",
" \n",
" | Step | \n",
" Training Loss | \n",
"
\n",
" \n",
" \n",
" \n",
" | 525 | \n",
" 1.157000 | \n",
"
\n",
" \n",
" | 550 | \n",
" 1.317600 | \n",
"
\n",
" \n",
" | 575 | \n",
" 1.205000 | \n",
"
\n",
" \n",
" | 600 | \n",
" 1.274100 | \n",
"
\n",
" \n",
" | 625 | \n",
" 1.275500 | \n",
"
\n",
" \n",
" | 650 | \n",
" 1.289200 | \n",
"
\n",
" \n",
" | 675 | \n",
" 1.154400 | \n",
"
\n",
" \n",
" | 700 | \n",
" 1.140200 | \n",
"
\n",
" \n",
" | 725 | \n",
" 1.177400 | \n",
"
\n",
" \n",
" | 750 | \n",
" 1.072500 | \n",
"
\n",
" \n",
" | 775 | \n",
" 1.157700 | \n",
"
\n",
" \n",
" | 800 | \n",
" 1.017800 | \n",
"
\n",
" \n",
" | 825 | \n",
" 1.030200 | \n",
"
\n",
" \n",
" | 850 | \n",
" 0.926700 | \n",
"
\n",
" \n",
" | 875 | \n",
" 1.017300 | \n",
"
\n",
" \n",
" | 900 | \n",
" 0.955100 | \n",
"
\n",
" \n",
" | 925 | \n",
" 0.881700 | \n",
"
\n",
" \n",
" | 950 | \n",
" 0.954700 | \n",
"
\n",
" \n",
" | 975 | \n",
" 0.939900 | \n",
"
\n",
" \n",
" | 1000 | \n",
" 0.910200 | \n",
"
\n",
" \n",
" | 1025 | \n",
" 0.791700 | \n",
"
\n",
" \n",
" | 1050 | \n",
" 0.823100 | \n",
"
\n",
" \n",
" | 1075 | \n",
" 0.776600 | \n",
"
\n",
" \n",
" | 1100 | \n",
" 0.813600 | \n",
"
\n",
" \n",
" | 1125 | \n",
" 0.790400 | \n",
"
\n",
" \n",
" | 1150 | \n",
" 0.848300 | \n",
"
\n",
" \n",
" | 1175 | \n",
" 0.933600 | \n",
"
\n",
" \n",
" | 1200 | \n",
" 0.679000 | \n",
"
\n",
" \n",
" | 1225 | \n",
" 0.663400 | \n",
"
\n",
" \n",
" | 1250 | \n",
" 0.720600 | \n",
"
\n",
" \n",
" | 1275 | \n",
" 0.650100 | \n",
"
\n",
" \n",
" | 1300 | \n",
" 0.655800 | \n",
"
\n",
" \n",
" | 1325 | \n",
" 0.724900 | \n",
"
\n",
" \n",
" | 1350 | \n",
" 0.703000 | \n",
"
\n",
" \n",
" | 1375 | \n",
" 0.569400 | \n",
"
\n",
" \n",
" | 1400 | \n",
" 0.537900 | \n",
"
\n",
" \n",
" | 1425 | \n",
" 0.606400 | \n",
"
\n",
" \n",
" | 1450 | \n",
" 0.631300 | \n",
"
\n",
" \n",
" | 1475 | \n",
" 0.608900 | \n",
"
\n",
" \n",
" | 1500 | \n",
" 0.614900 | \n",
"
\n",
" \n",
" | 1525 | \n",
" 0.511400 | \n",
"
\n",
" \n",
" | 1550 | \n",
" 0.510400 | \n",
"
\n",
" \n",
" | 1575 | \n",
" 0.452000 | \n",
"
\n",
" \n",
" | 1600 | \n",
" 0.520100 | \n",
"
\n",
" \n",
" | 1625 | \n",
" 0.527200 | \n",
"
\n",
" \n",
" | 1650 | \n",
" 0.520400 | \n",
"
\n",
" \n",
" | 1675 | \n",
" 0.584000 | \n",
"
\n",
" \n",
" | 1700 | \n",
" 0.429200 | \n",
"
\n",
" \n",
" | 1725 | \n",
" 0.434600 | \n",
"
\n",
" \n",
" | 1750 | \n",
" 0.461700 | \n",
"
\n",
" \n",
" | 1775 | \n",
" 0.447100 | \n",
"
\n",
" \n",
" | 1800 | \n",
" 0.439800 | \n",
"
\n",
" \n",
" | 1825 | \n",
" 0.449900 | \n",
"
\n",
" \n",
" | 1850 | \n",
" 0.457300 | \n",
"
\n",
" \n",
" | 1875 | \n",
" 0.390200 | \n",
"
\n",
" \n",
" | 1900 | \n",
" 0.372600 | \n",
"
\n",
" \n",
" | 1925 | \n",
" 0.408500 | \n",
"
\n",
" \n",
" | 1950 | \n",
" 0.414600 | \n",
"
\n",
" \n",
" | 1975 | \n",
" 0.420800 | \n",
"
\n",
" \n",
" | 2000 | \n",
" 0.382200 | \n",
"
\n",
" \n",
" | 2025 | \n",
" 0.368600 | \n",
"
\n",
" \n",
" | 2050 | \n",
" 0.368000 | \n",
"
\n",
" \n",
" | 2075 | \n",
" 0.386200 | \n",
"
\n",
" \n",
" | 2100 | \n",
" 0.376900 | \n",
"
\n",
" \n",
" | 2125 | \n",
" 0.340500 | \n",
"
\n",
" \n",
" | 2150 | \n",
" 0.347000 | \n",
"
\n",
" \n",
" | 2175 | \n",
" 0.379200 | \n",
"
\n",
" \n",
" | 2200 | \n",
" 0.330500 | \n",
"
\n",
" \n",
" | 2225 | \n",
" 0.332600 | \n",
"
\n",
" \n",
" | 2250 | \n",
" 0.325600 | \n",
"
\n",
" \n",
" | 2275 | \n",
" 0.330200 | \n",
"
\n",
" \n",
" | 2300 | \n",
" 0.302700 | \n",
"
\n",
" \n",
" | 2325 | \n",
" 0.345900 | \n",
"
\n",
" \n",
" | 2350 | \n",
" 0.368500 | \n",
"
\n",
" \n",
" | 2375 | \n",
" 0.285400 | \n",
"
\n",
" \n",
" | 2400 | \n",
" 0.297600 | \n",
"
\n",
" \n",
" | 2425 | \n",
" 0.302000 | \n",
"
\n",
" \n",
" | 2450 | \n",
" 0.324900 | \n",
"
\n",
" \n",
" | 2475 | \n",
" 0.329800 | \n",
"
\n",
" \n",
" | 2500 | \n",
" 0.282400 | \n",
"
\n",
" \n",
" | 2525 | \n",
" 0.339900 | \n",
"
\n",
" \n",
" | 2550 | \n",
" 0.272000 | \n",
"
\n",
" \n",
" | 2575 | \n",
" 0.298900 | \n",
"
\n",
" \n",
" | 2600 | \n",
" 0.277500 | \n",
"
\n",
" \n",
" | 2625 | \n",
" 0.281700 | \n",
"
\n",
" \n",
" | 2650 | \n",
" 0.282400 | \n",
"
\n",
" \n",
" | 2675 | \n",
" 0.292500 | \n",
"
\n",
" \n",
" | 2700 | \n",
" 0.303900 | \n",
"
\n",
" \n",
" | 2725 | \n",
" 0.252600 | \n",
"
\n",
" \n",
" | 2750 | \n",
" 0.259400 | \n",
"
\n",
" \n",
" | 2775 | \n",
" 0.280600 | \n",
"
\n",
" \n",
" | 2800 | \n",
" 0.261500 | \n",
"
\n",
" \n",
" | 2825 | \n",
" 0.292000 | \n",
"
\n",
" \n",
" | 2850 | \n",
" 0.278800 | \n",
"
\n",
" \n",
" | 2875 | \n",
" 0.256500 | \n",
"
\n",
" \n",
" | 2900 | \n",
" 0.240400 | \n",
"
\n",
" \n",
" | 2925 | \n",
" 0.255000 | \n",
"
\n",
" \n",
" | 2950 | \n",
" 0.264700 | \n",
"
\n",
" \n",
" | 2975 | \n",
" 0.266600 | \n",
"
\n",
" \n",
" | 3000 | \n",
" 0.270200 | \n",
"
\n",
" \n",
" | 3025 | \n",
" 0.283700 | \n",
"
\n",
" \n",
" | 3050 | \n",
" 0.248700 | \n",
"
\n",
" \n",
" | 3075 | \n",
" 0.235900 | \n",
"
\n",
" \n",
" | 3100 | \n",
" 0.259000 | \n",
"
\n",
" \n",
" | 3125 | \n",
" 0.246400 | \n",
"
\n",
" \n",
" | 3150 | \n",
" 0.274100 | \n",
"
\n",
" \n",
" | 3175 | \n",
" 0.251800 | \n",
"
\n",
" \n",
" | 3200 | \n",
" 0.233200 | \n",
"
\n",
" \n",
" | 3225 | \n",
" 0.227200 | \n",
"
\n",
" \n",
" | 3250 | \n",
" 0.239300 | \n",
"
\n",
" \n",
" | 3275 | \n",
" 0.262100 | \n",
"
\n",
" \n",
" | 3300 | \n",
" 0.242500 | \n",
"
\n",
" \n",
" | 3325 | \n",
" 0.255300 | \n",
"
\n",
" \n",
" | 3350 | \n",
" 0.249900 | \n",
"
\n",
" \n",
" | 3375 | \n",
" 0.241000 | \n",
"
\n",
" \n",
" | 3400 | \n",
" 0.221600 | \n",
"
\n",
" \n",
" | 3425 | \n",
" 0.246400 | \n",
"
\n",
" \n",
" | 3450 | \n",
" 0.224600 | \n",
"
\n",
" \n",
" | 3475 | \n",
" 0.239200 | \n",
"
\n",
" \n",
" | 3500 | \n",
" 0.247300 | \n",
"
\n",
" \n",
" | 3525 | \n",
" 0.241100 | \n",
"
\n",
" \n",
" | 3550 | \n",
" 0.223300 | \n",
"
\n",
" \n",
" | 3575 | \n",
" 0.212000 | \n",
"
\n",
" \n",
" | 3600 | \n",
" 0.229400 | \n",
"
\n",
" \n",
" | 3625 | \n",
" 0.248400 | \n",
"
\n",
" \n",
" | 3650 | \n",
" 0.228100 | \n",
"
\n",
" \n",
" | 3675 | \n",
" 0.231600 | \n",
"
\n",
" \n",
" | 3700 | \n",
" 0.238500 | \n",
"
\n",
" \n",
" | 3725 | \n",
" 0.221000 | \n",
"
\n",
" \n",
" | 3750 | \n",
" 0.246100 | \n",
"
\n",
" \n",
" | 3775 | \n",
" 0.218400 | \n",
"
\n",
" \n",
" | 3800 | \n",
" 0.210000 | \n",
"
\n",
" \n",
" | 3825 | \n",
" 0.229600 | \n",
"
\n",
" \n",
" | 3850 | \n",
" 0.232100 | \n",
"
\n",
" \n",
" | 3875 | \n",
" 0.214800 | \n",
"
\n",
" \n",
" | 3900 | \n",
" 0.208700 | \n",
"
\n",
" \n",
" | 3925 | \n",
" 0.208200 | \n",
"
\n",
" \n",
" | 3950 | \n",
" 0.231400 | \n",
"
\n",
" \n",
" | 3975 | \n",
" 0.223900 | \n",
"
\n",
" \n",
" | 4000 | \n",
" 0.229800 | \n",
"
\n",
" \n",
" | 4025 | \n",
" 0.230300 | \n",
"
\n",
" \n",
" | 4050 | \n",
" 0.216700 | \n",
"
\n",
" \n",
" | 4075 | \n",
" 0.204100 | \n",
"
\n",
" \n",
" | 4100 | \n",
" 0.211800 | \n",
"
\n",
" \n",
" | 4125 | \n",
" 0.217500 | \n",
"
\n",
" \n",
" | 4150 | \n",
" 0.213200 | \n",
"
\n",
" \n",
" | 4175 | \n",
" 0.240600 | \n",
"
\n",
" \n",
" | 4200 | \n",
" 0.220500 | \n",
"
\n",
" \n",
" | 4225 | \n",
" 0.206400 | \n",
"
\n",
" \n",
" | 4250 | \n",
" 0.220100 | \n",
"
\n",
" \n",
" | 4275 | \n",
" 0.215000 | \n",
"
\n",
" \n",
" | 4300 | \n",
" 0.212600 | \n",
"
\n",
" \n",
" | 4325 | \n",
" 0.200000 | \n",
"
\n",
" \n",
" | 4350 | \n",
" 0.214100 | \n",
"
\n",
" \n",
" | 4375 | \n",
" 0.226500 | \n",
"
\n",
" \n",
" | 4400 | \n",
" 0.216400 | \n",
"
\n",
" \n",
" | 4425 | \n",
" 0.218200 | \n",
"
\n",
" \n",
" | 4450 | \n",
" 0.215200 | \n",
"
\n",
" \n",
" | 4475 | \n",
" 0.203100 | \n",
"
\n",
" \n",
" | 4500 | \n",
" 0.218200 | \n",
"
\n",
" \n",
" | 4525 | \n",
" 0.201800 | \n",
"
\n",
" \n",
" | 4550 | \n",
" 0.213300 | \n",
"
\n",
" \n",
" | 4575 | \n",
" 0.199000 | \n",
"
\n",
" \n",
" | 4600 | \n",
" 0.208400 | \n",
"
\n",
" \n",
" | 4625 | \n",
" 0.215700 | \n",
"
\n",
" \n",
" | 4650 | \n",
" 0.219200 | \n",
"
\n",
" \n",
" | 4675 | \n",
" 0.183800 | \n",
"
\n",
" \n",
" | 4700 | \n",
" 0.226600 | \n",
"
\n",
" \n",
" | 4725 | \n",
" 0.204100 | \n",
"
\n",
" \n",
" | 4750 | \n",
" 0.198000 | \n",
"
\n",
" \n",
" | 4775 | \n",
" 0.206400 | \n",
"
\n",
" \n",
" | 4800 | \n",
" 0.204100 | \n",
"
\n",
" \n",
" | 4825 | \n",
" 0.204400 | \n",
"
\n",
" \n",
" | 4850 | \n",
" 0.206300 | \n",
"
\n",
" \n",
" | 4875 | \n",
" 0.215900 | \n",
"
\n",
" \n",
" | 4900 | \n",
" 0.201100 | \n",
"
\n",
" \n",
" | 4925 | \n",
" 0.199000 | \n",
"
\n",
" \n",
" | 4950 | \n",
" 0.200600 | \n",
"
\n",
" \n",
" | 4975 | \n",
" 0.198200 | \n",
"
\n",
" \n",
" | 5000 | \n",
" 0.223200 | \n",
"
\n",
" \n",
" | 5025 | \n",
" 0.211700 | \n",
"
\n",
" \n",
" | 5050 | \n",
" 0.190800 | \n",
"
\n",
" \n",
" | 5075 | \n",
" 0.199700 | \n",
"
\n",
" \n",
" | 5100 | \n",
" 0.189100 | \n",
"
\n",
" \n",
" | 5125 | \n",
" 0.205500 | \n",
"
\n",
" \n",
" | 5150 | \n",
" 0.210600 | \n",
"
\n",
" \n",
" | 5175 | \n",
" 0.211000 | \n",
"
\n",
" \n",
" | 5200 | \n",
" 0.208500 | \n",
"
\n",
" \n",
" | 5225 | \n",
" 0.198300 | \n",
"
\n",
" \n",
" | 5250 | \n",
" 0.207000 | \n",
"
\n",
" \n",
" | 5275 | \n",
" 0.208300 | \n",
"
\n",
" \n",
" | 5300 | \n",
" 0.181800 | \n",
"
\n",
" \n",
" | 5325 | \n",
" 0.197800 | \n",
"
\n",
" \n",
" | 5350 | \n",
" 0.205400 | \n",
"
\n",
" \n",
" | 5375 | \n",
" 0.208400 | \n",
"
\n",
" \n",
" | 5400 | \n",
" 0.185300 | \n",
"
\n",
" \n",
" | 5425 | \n",
" 0.174800 | \n",
"
\n",
" \n",
" | 5450 | \n",
" 0.199500 | \n",
"
\n",
" \n",
" | 5475 | \n",
" 0.199200 | \n",
"
\n",
" \n",
" | 5500 | \n",
" 0.215900 | \n",
"
\n",
" \n",
" | 5525 | \n",
" 0.212600 | \n",
"
\n",
" \n",
" | 5550 | \n",
" 0.209500 | \n",
"
\n",
" \n",
" | 5575 | \n",
" 0.199400 | \n",
"
\n",
" \n",
" | 5600 | \n",
" 0.186800 | \n",
"
\n",
" \n",
"
"
]
},
"metadata": {}
},
{
"output_type": "error",
"ename": "KeyboardInterrupt",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m| \u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Set to true if resuming\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1554\u001b[0m \u001b[0mhf_hub_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menable_progress_bars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1555\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1556\u001b[0;31m return inner_training_loop(\n\u001b[0m\u001b[1;32m 1557\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1558\u001b[0m \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/memory.py\u001b[0m in \u001b[0;36mdecorator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No executable batch size found, reached zero.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 136\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 137\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshould_reduce_batch_size\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1837\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccelerator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1838\u001b[0;31m \u001b[0mtr_loss_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1839\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1840\u001b[0m if (\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtraining_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2691\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2692\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_loss_context_manager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2693\u001b[0;31m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2695\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_gpu\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mcompute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 2716\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2717\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2718\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2719\u001b[0m \u001b[0;31m# Save past state if it exists\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2720\u001b[0m \u001b[0;31m# TODO: this needs to be fixed and made cleaner later.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1500\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1502\u001b[0m \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;31m# To act like a decorator so that it can be popped when doing `extract_model_from_parallel`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 622\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 624\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconvert_to_fp32\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 625\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getstate__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py\u001b[0m in \u001b[0;36mdecorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mautocast_instance\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__script_unsupported\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'@autocast() decorator is not supported in script mode'\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;31m# To act like a decorator so that it can be popped when doing `extract_model_from_parallel`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 622\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 624\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconvert_to_fp32\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 625\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getstate__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py\u001b[0m in \u001b[0;36mdecorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mautocast_instance\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__script_unsupported\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'@autocast() decorator is not supported in script mode'\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;31m# To act like a decorator so that it can be popped when doing `extract_model_from_parallel`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 622\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 624\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconvert_to_fp32\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 625\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getstate__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py\u001b[0m in \u001b[0;36mdecorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mautocast_instance\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__script_unsupported\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'@autocast() decorator is not supported in script mode'\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;31m# To act like a decorator so that it can be popped when doing `extract_model_from_parallel`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 622\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 624\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconvert_to_fp32\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 625\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getstate__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py\u001b[0m in \u001b[0;36mdecorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mautocast_instance\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__script_unsupported\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'@autocast() decorator is not supported in script mode'\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_autocast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/peft/peft_model.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)\u001b[0m\n\u001b[1;32m 944\u001b[0m )\n\u001b[1;32m 945\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 946\u001b[0;31m return self.base_model(\n\u001b[0m\u001b[1;32m 947\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 948\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1500\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1502\u001b[0m \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mabstractmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py\u001b[0m in \u001b[0;36mnew_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hf_hook\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)\u001b[0m\n\u001b[1;32m 751\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_return_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 752\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 753\u001b[0;31m transformer_outputs = self.transformer(\n\u001b[0m\u001b[1;32m 754\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[0mpast_key_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpast_key_values\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1500\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1502\u001b[0m \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py\u001b[0m in \u001b[0;36mnew_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hf_hook\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcustom_forward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 640\u001b[0;31m outputs = torch.utils.checkpoint.checkpoint(\n\u001b[0m\u001b[1;32m 641\u001b[0m \u001b[0mcreate_custom_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py\u001b[0m in \u001b[0;36mcheckpoint\u001b[0;34m(function, use_reentrant, *args, **kwargs)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0muse_reentrant\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mCheckpointFunction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunction\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpreserve\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m return _checkpoint_without_reentrant(\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;31m# See NOTE: [functorch vjp and autograd interaction]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_functorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munwrap_dead_wrappers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 506\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 508\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetup_context\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_SingleLevelFunction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetup_context\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(ctx, run_function, preserve_rng_state, *args)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 107\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrun_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 108\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mcustom_forward\u001b[0;34m(*inputs)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcustom_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;31m# None for past_key_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cache\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_cache\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcustom_forward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1500\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1502\u001b[0m \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py\u001b[0m in \u001b[0;36mnew_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hf_hook\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0;31m# Self attention.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m attn_outputs = self.self_attention(\n\u001b[0m\u001b[1;32m 386\u001b[0m \u001b[0mlayernorm_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 387\u001b[0m \u001b[0mlayer_past\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlayer_past\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1500\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1502\u001b[0m \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py\u001b[0m in \u001b[0;36mnew_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mold_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hf_hook\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mvalue_layer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue_layer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranspose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_kv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mq_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_layer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaybe_rotary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_layer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlayer_past\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1500\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1502\u001b[0m \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, q, k)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq_len\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhead_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0mcos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msin\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcos_sin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq_len\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mq\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mcos\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrotate_half\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0msin\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mcos\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrotate_half\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0msin\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/f7796529e36b2d49094450fb038cc7c4c86afa44/modelling_RW.py\u001b[0m in \u001b[0;36mrotate_half\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrotate_half\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mx1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m...\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m...\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mx2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mx1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# dim=-1 triggers a bug in torch < 1.8.0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
]
},
{
"cell_type": "code",
"source": [
"trainer.save_model(\"final_model\")"
],
"metadata": {
"id": "ymKzlz6cMGKP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"transformers.logging.set_verbosity_error()"
],
"metadata": {
"id": "2LoR2ryty8pi"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Storage\n",
"\n",
"The following section exports the trained model to my personal GDrive for use in another inference notebook."
],
"metadata": {
"id": "zFShzv-aKGr7"
}
},
{
"cell_type": "code",
"source": [
"import tarfile\n",
"import os.path\n",
"\n",
"def make_tarfile(output_filename, source_dir):\n",
" with tarfile.open(output_filename, \"w:gz\") as tar:\n",
" tar.add(source_dir, arcname=os.path.basename(source_dir))"
],
"metadata": {
"id": "zdxmwQpHMKfr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"make_tarfile(\"final_model.tar.gz\", \"final_model\")"
],
"metadata": {
"id": "8zWfJNJ9NxK5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import locale\n",
"locale.getpreferredencoding = lambda: \"UTF-8\""
],
"metadata": {
"id": "bUqXlo9FN2DJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cp final_model.tar.gz path_to_save_directory"
],
"metadata": {
"id": "W2VBFqKGN5oW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "_iu2wZIFHev1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Lyric Inference\n",
"\n",
"Now it's time to try our model out. Let's load the saved model weights and recreate the necessary helper functions.\n",
"\n",
"### Untar the Fine-Tuned Weights"
],
"metadata": {
"id": "hN6L5dg0Kxb2"
}
},
{
"cell_type": "code",
"source": [
"!cp ./path_to_save_directory/final_model.tar.gz .\n",
"import tarfile\n",
"import os\n",
"tar = tarfile.open(\"final_model.tar.gz\")\n",
"tar.extractall()\n",
"tar.close()"
],
"metadata": {
"id": "K8UWcfvhPiEz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Install the Prerequisites"
],
"metadata": {
"id": "9pG8OfKpLD66"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ynyh3i8APR3C",
"outputId": "f51389f2-fc6b-48aa-ba2f-bcdabf4bdc45"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
]
}
],
"source": [
"!pip install -q bitsandbytes==0.41.1 transformers==4.33.3 accelerate==0.23.0 datasets==2.14.5 einops==0.6.1\n",
"!pip install -q -U git+https://github.com/huggingface/peft.git@69665f24e98dc5f20a430637a31f196158b6e0da"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import re\n",
"import bitsandbytes as bnb\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import transformers\n",
"from datasets import load_dataset, Dataset, Value\n",
"from peft import (\n",
" LoraConfig,\n",
" PeftConfig,\n",
" get_peft_model,\n",
" PeftModel,\n",
" prepare_model_for_kbit_training,\n",
")\n",
"from transformers import (\n",
" AutoConfig,\n",
" AutoModelForCausalLM,\n",
" AutoTokenizer,\n",
" BitsAndBytesConfig,\n",
")\n"
],
"metadata": {
"id": "v5M81zMBPcTO"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Load the Base Model"
],
"metadata": {
"id": "2zKygQrQLIaq"
}
},
{
"cell_type": "code",
"source": [
"model_id = \"tiiuae/falcon-7b\"\n",
"adapters_name = \"final_model\"\n",
"\n",
"print(f\"Starting to load the model {model_id} into memory\")\n",
"\n",
"bnb_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" load_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_id,\n",
" device_map=\"auto\",\n",
" trust_remote_code=True,\n",
" quantization_config=bnb_config,\n",
")\n",
"model = prepare_model_for_kbit_training(model)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = PeftModel.from_pretrained(model, adapters_name)\n",
"model = model.merge_and_unload()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 173,
"referenced_widgets": [
"7a947d4533e14c6680124c659950732a",
"0b48734a358c4e298f1ae17acde4dd92",
"70e7d7a1072143489b537927cf67763f",
"aff0b650d7bb4114844e0c246906236c",
"30e4276fd6084ef29d78b8590f772081",
"a60b8b9c1b2446b3a6827015e4596b74",
"a4b68b1ef0dd4e9189dc3fdef2975c67",
"227ced1cf8d744d5aeb80e8ba0e247df",
"c1dd5835ce9445bc8dc3dee52af15534",
"a54af2e31ab347b5b906035683170788",
"e141c3013d5d43579590dd30b44aacea"
]
},
"id": "8u9Nkfz4QoAF",
"outputId": "1a530510-d1f0-4a59-896a-15267ca28e29"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Starting to load the model tiiuae/falcon-7b into memory\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"WARNING:transformers_modules.tiiuae.falcon-7b.898df1396f35e447d5fe44e0a3ccaaaa69f30d36.configuration_falcon:\n",
"WARNING: You are currently loading Falcon using legacy code contained in the model repository. Falcon has now been fully ported into the Hugging Face transformers library. For the most up-to-date and high-performance version of the Falcon model code, please update to the latest version of transformers and then load the model without the trust_remote_code=True argument.\n",
"\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "7a947d4533e14c6680124c659950732a"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/peft/tuners/lora/bnb.py:209: UserWarning: Merge lora module to 4-bit linear may get different generations due to rounding errors.\n",
" warnings.warn(\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### The Same Inference Function"
],
"metadata": {
"id": "dxiYjaKALP75"
}
},
{
"cell_type": "code",
"source": [
"def generate(prompt=\"[Intro]\") -> str:\n",
" inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors=\"pt\").to(\"cuda:0\")\n",
" # More info about generation options: https://huggingface.co/blog/how-to-generate\n",
" outputs = model.generate(\n",
" input_ids=inputs['input_ids'],\n",
" attention_mask=inputs['attention_mask'],\n",
" do_sample=True,\n",
" top_p=0.92,\n",
" top_k=0,\n",
" max_new_tokens=50)\n",
" return tokenizer.decode(outputs[0], skip_special_tokens=True)"
],
"metadata": {
"id": "HLTtJjHtR1rr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Lyric Generation\n",
"\n",
"Let's go! Note how I'm prompting the model using the keys in the training data. Always remember that LLMs simply \"predict\" the next word.\n",
"\n",
"Let's start with something generic and then try to engineer the prompt to produce something more relevant."
],
"metadata": {
"id": "gJhkKJX_LaZb"
}
},
{
"cell_type": "code",
"source": [
"transformers.logging.set_verbosity_error()\n",
"print(\"\\n[\" + generate(\"[Intro]\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Verse 1]\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Bridge]\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Chorus]\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Verse 2]\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Outro]\\n\").split('[')[1])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lMjMAtC4SvO1",
"outputId": "aa5b3f01-e25b-48d1-e798-cacfb93b7d33"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"[Intro]\n",
"One, two, three, four\n",
"One, two... (One, two, three, four)\n",
"\n",
"(Yahoo)\n",
"\n",
"(I wanna be your dog)\n",
"\n",
"(Yahoo)\n",
"\n",
"(I wanna be your dog)\n",
"\n",
"(\n",
"\n",
"[Verse 1]\n",
"And the band played on\n",
"And the people came, and they saw that it was good\n",
"And they were satisfied\n",
"They were satisfied\n",
"\n",
"(I say the word)\n",
"(She says the word)\n",
"\n",
"(And they'll understand)\n",
"\n",
"\n",
"[Bridge]\n",
"Oh how long will it take\n",
"Till she sees the mistake she has made\n",
"Till she sees the mistake she has made\n",
"\n",
"(One, two, three, four, five, six, seven, eight, nine, ten, eleven!)\n",
"\n",
"(\n",
"\n",
"[Chorus]\n",
"Come on (Come on), Come on (Come on)\n",
"Come on (Come on), Come on (Come on)\n",
"Please please me, whoa yeah, like I please you\n",
"Like I please you\n",
"\n",
"(Come\n",
"\n",
"[Verse 2]\n",
"Ring, my friend I said you'd call\n",
"Doctor Robert\n",
"Early morning raingt\n",
"Doctor Robert\n",
"Fool, you don't need him does he fool you does he?\n",
"Doctor Robert\n",
"Doctor Robert\n",
"Doctor Robert\n",
"\n",
"(Ring\n",
"\n",
"[Outro]\n",
"I don't want to leave her now\n",
"You know I believe and how\n",
"I hope she will forgive me somehow\n",
"When I see her, I start to sing\n",
"\n",
"(Sing it again)\n",
"\n",
"(Oh yeah, sing it again)\n",
"\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(\"\\n[\" + generate(\"[Intro]\\nThis is a song about language models\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Verse 1]\\nIn a deep dive, you learned how they work\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Bridge]\\nBut wait, the data\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Chorus]\\nThis is a language model\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Verse 2]\\nNext you want to deploy\\n\").split('[')[1])\n",
"print(\"\\n[\" + generate(\"[Outro]\\nI hoped you enjoyed this talk\\n\").split('[')[1])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OgILW9yBSxZ2",
"outputId": "0a0844fe-87ca-41b1-b0a6-4a1c2a4a1d72"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"[Intro]\n",
"This is a song about language models\n",
"And the structures that they contain\n",
"And the way that they repeat themselves\n",
"And the words that they surround\n",
"\n",
"\n",
"\n",
"[Verse 1]\n",
"In a deep dive, you learned how they work\n",
"And now you're part of the corporation\n",
"They'll take you in, screwed up or torn\n",
"Solve all your problems for a price or a song\n",
"\n",
"(Oh!)\n",
"\n",
"'Cause they'll be there, awaiting your call\n",
"\n",
"\n",
"[Bridge]\n",
"But wait, the data\n",
"Presents another approach\n",
"You may observe the people passing by\n",
"And you'll soon realize\n",
"That they're all living lives that are prescribed\n",
"And you'll see that they're all the same\n",
"And you know it's\n",
"\n",
"[Chorus]\n",
"This is a language model\n",
"It's called Smokey Tongue\n",
"It can help you if you let it\n",
"It can help you if you let it\n",
"It can help you if you let it\n",
"It can help you if you let it\n",
"\n",
"(Instrumental Break\n",
"\n",
"[Verse 2]\n",
"Next you want to deploy\n",
"The same old thing again\n",
"If I've said it once I've said it a hundred times\n",
"It's no use, you know, you'll never get it in your mind\n",
"If I've said it once I'\n",
"\n",
"[Outro]\n",
"I hoped you enjoyed this talk\n",
"And trust you will come again\n",
"The next time we'll talk\n",
"We'll have another tea and scone\n",
"But for now it's time to say good-bye\n",
"Good-bye\n",
"\n",
"(She's leaving home)\n",
"\n",
"\n"
]
}
]
}
]
} |