Created November 18, 2025 22:31
Unsloth GRPO datta script
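The gist captures only the console output of the run, not the script itself. From what the log shows (Unsloth 2025.11.3 patching 36 layers, a 1,126-row dataset tokenized on its "text" column, 87,293,952 of 8,278,029,312 parameters trainable, about 1.05%, and `unsloth/models/qwen3.py` in the traceback), the run looks like a LoRA fine-tune of a Qwen3-8B-class model. A minimal sketch of such a setup follows; the checkpoint name, sequence length, and LoRA hyperparameters are assumptions, not taken from the gist:

```python
# Hypothetical reconstruction of the model/LoRA setup behind the log below.
# Checkpoint name, max_seq_length, and lora_alpha are assumptions.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen3-8B",  # assumption: ~8.28B params, 36 layers, qwen3.py in the traceback
    max_seq_length=2048,          # assumption
    load_in_4bit=False,
)

# Attach LoRA adapters. With rank 32 on the attention and MLP projections of a
# Qwen3-8B-sized model, the trainable-parameter count works out to ~87.3M,
# which matches the "1.05% trained" line in the log.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=64,                # assumption
)
```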
| Unsloth 2025.11.3 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers. | |
| Performing substitution for additional_keys=set() | |
| Unsloth: Just some info: will skip parsing ['attention_norm', 'cross_attn_input_layernorm', 'cross_attn_post_attention_layernorm', 'post_layernorm', 'norm2', 'post_attention_layernorm', 'input_layernorm', 'pre_feedforward_layernorm', 'norm', 'post_feedforward_layernorm', 'k_norm', 'layer_norm1', 'q_norm', 'layer_norm2', 'ffn_norm', 'norm1'] | |
| Unsloth: Tokenizing ["text"] (num_proc=64): 100%|██████████| 1126/1126 [00:11<00:00, 95.40 examples/s] | |
| ==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1 | |
| \\ /| Num examples = 1,126 | Num Epochs = 1 | Total steps = 100 | |
| O^O/ \_/ \ Batch size per device = 4 | Gradient accumulation steps = 1 | |
| \ / Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4 | |
| "-____-" Trainable parameters = 87,293,952 of 8,278,029,312 (1.05% trained) | |
| {'loss': 0.5516, 'grad_norm': 0.5312474966049194, 'learning_rate': 0.0, 'epoch': 0.0} | |
| Unsloth: Will smartly offload gradients to save VRAM! | |
| {'loss': 0.5751, 'grad_norm': 0.4997809827327728, 'learning_rate': 4e-05, 'epoch': 0.01} | |
| {'loss': 0.6654, 'grad_norm': 0.5570040941238403, 'learning_rate': 8e-05, 'epoch': 0.01} | |
| {'loss': 0.621, 'grad_norm': 0.49220409989356995, 'learning_rate': 0.00012, 'epoch': 0.01} | |
| {'loss': 0.4791, 'grad_norm': 0.3357393741607666, 'learning_rate': 0.00016, 'epoch': 0.02} | |
| {'loss': 0.5139, 'grad_norm': 0.23720338940620422, 'learning_rate': 0.0002, 'epoch': 0.02} | |
| {'loss': 0.4249, 'grad_norm': 0.22417885065078735, 'learning_rate': 0.00019789473684210526, 'epoch': 0.02} | |
| {'loss': 0.4403, 'grad_norm': 0.22005413472652435, 'learning_rate': 0.00019578947368421054, 'epoch': 0.03} | |
| {'loss': 0.3843, 'grad_norm': 0.2089555263519287, 'learning_rate': 0.0001936842105263158, 'epoch': 0.03} | |
| {'loss': 0.3661, 'grad_norm': 0.1853521764278412, 'learning_rate': 0.00019157894736842104, 'epoch': 0.04} | |
| {'loss': 0.3368, 'grad_norm': 0.23009157180786133, 'learning_rate': 0.00018947368421052632, 'epoch': 0.04} | |
| {'loss': 0.344, 'grad_norm': 0.19669091701507568, 'learning_rate': 0.0001873684210526316, 'epoch': 0.04} | |
| {'loss': 0.3621, 'grad_norm': 0.18337242305278778, 'learning_rate': 0.00018526315789473685, 'epoch': 0.05} | |
| {'loss': 0.3082, 'grad_norm': 0.15389038622379303, 'learning_rate': 0.0001831578947368421, 'epoch': 0.05} | |
| {'loss': 0.4513, 'grad_norm': 0.19309887290000916, 'learning_rate': 0.00018105263157894739, 'epoch': 0.05} | |
| {'loss': 0.253, 'grad_norm': 0.1269475668668747, 'learning_rate': 0.00017894736842105264, 'epoch': 0.06} | |
| {'loss': 0.3247, 'grad_norm': 0.14577877521514893, 'learning_rate': 0.0001768421052631579, 'epoch': 0.06} | |
| {'loss': 0.2953, 'grad_norm': 0.15310949087142944, 'learning_rate': 0.00017473684210526317, 'epoch': 0.06} | |
| {'loss': 0.3271, 'grad_norm': 0.12097445875406265, 'learning_rate': 0.00017263157894736842, 'epoch': 0.07} | |
| {'loss': 0.3143, 'grad_norm': 0.17251430451869965, 'learning_rate': 0.0001705263157894737, 'epoch': 0.07} | |
| {'loss': 0.3824, 'grad_norm': 0.16023662686347961, 'learning_rate': 0.00016842105263157895, 'epoch': 0.07} | |
| {'loss': 0.3827, 'grad_norm': 0.12935973703861237, 'learning_rate': 0.00016631578947368423, 'epoch': 0.08} | |
| {'loss': 0.3651, 'grad_norm': 0.14019690454006195, 'learning_rate': 0.00016421052631578948, 'epoch': 0.08} | |
| {'loss': 0.314, 'grad_norm': 0.12898799777030945, 'learning_rate': 0.00016210526315789473, 'epoch': 0.09} | |
| {'loss': 0.3622, 'grad_norm': 0.141513392329216, 'learning_rate': 0.00016, 'epoch': 0.09} | |
| {'loss': 0.2899, 'grad_norm': 0.12062196433544159, 'learning_rate': 0.00015789473684210527, 'epoch': 0.09} | |
| {'loss': 0.3448, 'grad_norm': 0.12003778666257858, 'learning_rate': 0.00015578947368421052, 'epoch': 0.1} | |
| {'loss': 0.3252, 'grad_norm': 0.12177414447069168, 'learning_rate': 0.0001536842105263158, 'epoch': 0.1} | |
| {'loss': 0.2917, 'grad_norm': 0.11223629862070084, 'learning_rate': 0.00015157894736842108, 'epoch': 0.1} | |
| {'loss': 0.389, 'grad_norm': 0.12914486229419708, 'learning_rate': 0.00014947368421052633, 'epoch': 0.11} | |
| {'loss': 0.3577, 'grad_norm': 0.12752953171730042, 'learning_rate': 0.00014736842105263158, 'epoch': 0.11} | |
| {'loss': 0.3119, 'grad_norm': 0.12948615849018097, 'learning_rate': 0.00014526315789473686, 'epoch': 0.11} | |
| {'loss': 0.2777, 'grad_norm': 0.11508759111166, 'learning_rate': 0.0001431578947368421, 'epoch': 0.12} | |
| {'loss': 0.3316, 'grad_norm': 0.12426915019750595, 'learning_rate': 0.00014105263157894736, 'epoch': 0.12} | |
| {'loss': 0.3592, 'grad_norm': 0.13112099468708038, 'learning_rate': 0.00013894736842105264, 'epoch': 0.12} | |
| {'loss': 0.3531, 'grad_norm': 0.12065130472183228, 'learning_rate': 0.0001368421052631579, 'epoch': 0.13} | |
| {'loss': 0.2714, 'grad_norm': 0.11886139959096909, 'learning_rate': 0.00013473684210526317, 'epoch': 0.13} | |
| {'loss': 0.3196, 'grad_norm': 0.13051238656044006, 'learning_rate': 0.00013263157894736842, 'epoch': 0.13} | |
| {'loss': 0.2932, 'grad_norm': 0.11254164576530457, 'learning_rate': 0.0001305263157894737, 'epoch': 0.14} | |
| {'loss': 0.3292, 'grad_norm': 0.12476981431245804, 'learning_rate': 0.00012842105263157895, 'epoch': 0.14} | |
| {'loss': 0.3437, 'grad_norm': 0.14819224178791046, 'learning_rate': 0.0001263157894736842, 'epoch': 0.15} | |
| {'loss': 0.3994, 'grad_norm': 0.1342942863702774, 'learning_rate': 0.00012421052631578949, 'epoch': 0.15} | |
| {'loss': 0.3108, 'grad_norm': 0.11615261435508728, 'learning_rate': 0.00012210526315789474, 'epoch': 0.15} | |
| {'loss': 0.287, 'grad_norm': 0.10614672303199768, 'learning_rate': 0.00012, 'epoch': 0.16} | |
| {'loss': 0.3777, 'grad_norm': 0.1580209881067276, 'learning_rate': 0.00011789473684210525, 'epoch': 0.16} | |
| {'loss': 0.2382, 'grad_norm': 0.11780590564012527, 'learning_rate': 0.00011578947368421053, 'epoch': 0.16} | |
| {'loss': 0.345, 'grad_norm': 0.12982822954654694, 'learning_rate': 0.0001136842105263158, 'epoch': 0.17} | |
| {'loss': 0.2807, 'grad_norm': 0.11291559785604477, 'learning_rate': 0.00011157894736842105, 'epoch': 0.17} | |
| {'loss': 0.3361, 'grad_norm': 0.1301521211862564, 'learning_rate': 0.00010947368421052633, 'epoch': 0.17} | |
| {'loss': 0.3255, 'grad_norm': 0.12927880883216858, 'learning_rate': 0.00010736842105263158, 'epoch': 0.18} | |
| {'loss': 0.3107, 'grad_norm': 0.12102185189723969, 'learning_rate': 0.00010526315789473685, 'epoch': 0.18} | |
| {'loss': 0.2714, 'grad_norm': 0.1021929681301117, 'learning_rate': 0.00010315789473684211, 'epoch': 0.18} | |
| {'loss': 0.3161, 'grad_norm': 0.1435796022415161, 'learning_rate': 0.00010105263157894738, 'epoch': 0.19} | |
| {'loss': 0.3703, 'grad_norm': 0.14022934436798096, 'learning_rate': 9.894736842105263e-05, 'epoch': 0.19} | |
| {'loss': 0.3592, 'grad_norm': 0.12194594740867615, 'learning_rate': 9.68421052631579e-05, 'epoch': 0.2} | |
| {'loss': 0.3411, 'grad_norm': 0.11623945087194443, 'learning_rate': 9.473684210526316e-05, 'epoch': 0.2} | |
| {'loss': 0.2982, 'grad_norm': 0.11288086324930191, 'learning_rate': 9.263157894736843e-05, 'epoch': 0.2} | |
| {'loss': 0.3636, 'grad_norm': 0.19941461086273193, 'learning_rate': 9.052631578947369e-05, 'epoch': 0.21} | |
| {'loss': 0.327, 'grad_norm': 0.11614833027124405, 'learning_rate': 8.842105263157894e-05, 'epoch': 0.21} | |
| {'loss': 0.2447, 'grad_norm': 0.10045632719993591, 'learning_rate': 8.631578947368421e-05, 'epoch': 0.21} | |
| {'loss': 0.3494, 'grad_norm': 0.11216063797473907, 'learning_rate': 8.421052631578948e-05, 'epoch': 0.22} | |
| {'loss': 0.3233, 'grad_norm': 0.13398653268814087, 'learning_rate': 8.210526315789474e-05, 'epoch': 0.22} | |
| {'loss': 0.2874, 'grad_norm': 0.11401170492172241, 'learning_rate': 8e-05, 'epoch': 0.22} | |
| {'loss': 0.3704, 'grad_norm': 0.12382341921329498, 'learning_rate': 7.789473684210526e-05, 'epoch': 0.23} | |
| {'loss': 0.3196, 'grad_norm': 0.1083361878991127, 'learning_rate': 7.578947368421054e-05, 'epoch': 0.23} | |
| {'loss': 0.3459, 'grad_norm': 0.19833165407180786, 'learning_rate': 7.368421052631579e-05, 'epoch': 0.23} | |
| {'loss': 0.3015, 'grad_norm': 0.10992985963821411, 'learning_rate': 7.157894736842105e-05, 'epoch': 0.24} | |
| {'loss': 0.328, 'grad_norm': 0.11520015448331833, 'learning_rate': 6.947368421052632e-05, 'epoch': 0.24} | |
| {'loss': 0.3837, 'grad_norm': 0.13615329563617706, 'learning_rate': 6.736842105263159e-05, 'epoch': 0.24} | |
| {'loss': 0.3063, 'grad_norm': 0.12049426138401031, 'learning_rate': 6.526315789473685e-05, 'epoch': 0.25} | |
| {'loss': 0.3201, 'grad_norm': 0.1054273247718811, 'learning_rate': 6.31578947368421e-05, 'epoch': 0.25} | |
| {'loss': 0.2946, 'grad_norm': 0.11126892268657684, 'learning_rate': 6.105263157894737e-05, 'epoch': 0.26} | |
| {'loss': 0.2455, 'grad_norm': 0.10475530475378036, 'learning_rate': 5.894736842105263e-05, 'epoch': 0.26} | |
| {'loss': 0.335, 'grad_norm': 0.13587339222431183, 'learning_rate': 5.68421052631579e-05, 'epoch': 0.26} | |
| {'loss': 0.3251, 'grad_norm': 0.10262706875801086, 'learning_rate': 5.4736842105263165e-05, 'epoch': 0.27} | |
| {'loss': 0.2922, 'grad_norm': 0.1111547127366066, 'learning_rate': 5.2631578947368424e-05, 'epoch': 0.27} | |
| {'loss': 0.3437, 'grad_norm': 0.12631115317344666, 'learning_rate': 5.052631578947369e-05, 'epoch': 0.27} | |
| {'loss': 0.3183, 'grad_norm': 0.11674674600362778, 'learning_rate': 4.842105263157895e-05, 'epoch': 0.28} | |
| 100%|██████████| 100/100 [01:44<00:00, 1.04s/it] | |
| {'loss': 0.3041, 'grad_norm': 0.1114904135465622, 'learning_rate': 4.6315789473684214e-05, 'epoch': 0.28} | |
| {'loss': 0.3681, 'grad_norm': 0.11916472762823105, 'learning_rate': 4.421052631578947e-05, 'epoch': 0.28} | |
| {'loss': 0.3591, 'grad_norm': 0.15139518678188324, 'learning_rate': 4.210526315789474e-05, 'epoch': 0.29} | |
| {'loss': 0.2658, 'grad_norm': 0.11994612962007523, 'learning_rate': 4e-05, 'epoch': 0.29} | |
| {'loss': 0.3452, 'grad_norm': 0.14532621204853058, 'learning_rate': 3.789473684210527e-05, 'epoch': 0.29} | |
| {'loss': 0.3322, 'grad_norm': 0.12806951999664307, 'learning_rate': 3.578947368421053e-05, 'epoch': 0.3} | |
| {'loss': 0.2921, 'grad_norm': 0.11126401275396347, 'learning_rate': 3.368421052631579e-05, 'epoch': 0.3} | |
| {'loss': 0.2626, 'grad_norm': 0.10746820271015167, 'learning_rate': 3.157894736842105e-05, 'epoch': 0.3} | |
| {'loss': 0.3157, 'grad_norm': 0.11800961941480637, 'learning_rate': 2.9473684210526314e-05, 'epoch': 0.31} | |
| {'loss': 0.3579, 'grad_norm': 0.10837972909212112, 'learning_rate': 2.7368421052631583e-05, 'epoch': 0.31} | |
| {'loss': 0.3326, 'grad_norm': 0.11892487853765488, 'learning_rate': 2.5263157894736845e-05, 'epoch': 0.32} | |
| {'loss': 0.3138, 'grad_norm': 0.11013548076152802, 'learning_rate': 2.3157894736842107e-05, 'epoch': 0.32} | |
| {'loss': 0.3079, 'grad_norm': 0.11282574385404587, 'learning_rate': 2.105263157894737e-05, 'epoch': 0.32} | |
| {'loss': 0.3413, 'grad_norm': 0.11510809510946274, 'learning_rate': 1.8947368421052634e-05, 'epoch': 0.33} | |
| {'loss': 0.3502, 'grad_norm': 0.12397557497024536, 'learning_rate': 1.6842105263157896e-05, 'epoch': 0.33} | |
| {'loss': 0.4151, 'grad_norm': 0.12421295046806335, 'learning_rate': 1.4736842105263157e-05, 'epoch': 0.33} | |
| {'loss': 0.3161, 'grad_norm': 0.12127997726202011, 'learning_rate': 1.2631578947368422e-05, 'epoch': 0.34} | |
| {'loss': 0.2764, 'grad_norm': 0.1132940873503685, 'learning_rate': 1.0526315789473684e-05, 'epoch': 0.34} | |
| {'loss': 0.3861, 'grad_norm': 0.12642629444599152, 'learning_rate': 8.421052631578948e-06, 'epoch': 0.34} | |
| {'loss': 0.2874, 'grad_norm': 0.10199450701475143, 'learning_rate': 6.315789473684211e-06, 'epoch': 0.35} | |
| {'loss': 0.269, 'grad_norm': 0.12433204054832458, 'learning_rate': 4.210526315789474e-06, 'epoch': 0.35} | |
| {'loss': 0.3061, 'grad_norm': 0.11193855851888657, 'learning_rate': 2.105263157894737e-06, 'epoch': 0.35} | |
| {'train_runtime': 104.4302, 'train_samples_per_second': 3.83, 'train_steps_per_second': 0.958, 'train_loss': 0.3433119258284569, 'epoch': 0.35} | |
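The schedule visible above is a plain linear one: the learning rate ramps to 2e-4 over the first few steps, then decays linearly to zero across the remaining steps of the 100-step run, with per-device batch size 4, no gradient accumulation, and a log line every step. A sketch of trainer arguments that would reproduce it, continuing from the loading sketch earlier; the dataset variable, trainer choice, and output directory are assumptions (the `Tokenizing ["text"]` line suggests trl's SFTTrainer with a "text" field):

```python
# Hypothetical trainer configuration inferred from the logged schedule.
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,         # "tokenizer=" on older trl versions
    train_dataset=dataset,              # assumption: 1,126 rows with a "text" column
    args=SFTConfig(
        dataset_text_field="text",      # matches 'Tokenizing ["text"]' in the log
        per_device_train_batch_size=4,  # "Batch size per device = 4"
        gradient_accumulation_steps=1,
        max_steps=100,                  # "Total steps = 100"
        learning_rate=2e-4,             # peak LR in the log
        warmup_steps=5,                 # assumption; LR climbs to 2e-4 over the first few steps
        lr_scheduler_type="linear",
        logging_steps=1,
        output_dir="outputs",           # assumption
    ),
)
trainer.train()
```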
| |===========================================================================| | |
| | PyTorch CUDA memory summary, device ID 0 | | |
| |---------------------------------------------------------------------------| | |
| | CUDA OOMs: 0 | cudaMalloc retries: 0 | | |
| |===========================================================================| | |
| | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | | |
| |---------------------------------------------------------------------------| | |
| | Allocated memory | 19993 MiB | 27090 MiB | 21744 GiB | 21725 GiB | | |
| | from large pool | 19480 MiB | 26746 MiB | 21552 GiB | 21533 GiB | | |
| | from small pool | 513 MiB | 688 MiB | 192 GiB | 191 GiB | | |
| |---------------------------------------------------------------------------| | |
| | Active memory | 19993 MiB | 27090 MiB | 21744 GiB | 21725 GiB | | |
| | from large pool | 19480 MiB | 26746 MiB | 21552 GiB | 21533 GiB | | |
| | from small pool | 513 MiB | 688 MiB | 192 GiB | 191 GiB | | |
| |---------------------------------------------------------------------------| | |
| | Requested memory | 19930 MiB | 26912 MiB | 21730 GiB | 21711 GiB | | |
| | from large pool | 19417 MiB | 26567 MiB | 21538 GiB | 21519 GiB | | |
| | from small pool | 513 MiB | 688 MiB | 191 GiB | 191 GiB | | |
| |---------------------------------------------------------------------------| | |
| | GPU reserved memory | 25634 MiB | 27320 MiB | 35158 MiB | 9524 MiB | | |
| | from large pool | 24920 MiB | 26960 MiB | 34220 MiB | 9300 MiB | | |
| | from small pool | 714 MiB | 714 MiB | 938 MiB | 224 MiB | | |
| |---------------------------------------------------------------------------| | |
| | Non-releasable memory | 0 B | 0 B | 0 B | 0 B | | |
| | from large pool | 0 B | 0 B | 0 B | 0 B | | |
| | from small pool | 0 B | 0 B | 0 B | 0 B | | |
| |---------------------------------------------------------------------------| | |
| | Allocations | 4232 | 5249 | 1053 K | 1049 K | | |
| | from large pool | 332 | 457 | 393 K | 392 K | | |
| | from small pool | 3900 | 4809 | 660 K | 656 K | | |
| |---------------------------------------------------------------------------| | |
| | Active allocs | 4232 | 5249 | 1053 K | 1049 K | | |
| | from large pool | 332 | 457 | 393 K | 392 K | | |
| | from small pool | 3900 | 4809 | 660 K | 656 K | | |
| |---------------------------------------------------------------------------| | |
| | GPU reserved segments | 0 | 0 | 0 | 0 | | |
| | from large pool | 0 | 0 | 0 | 0 | | |
| | from small pool | 0 | 0 | 0 | 0 | | |
| |---------------------------------------------------------------------------| | |
| | Non-releasable allocs | 0 | 0 | 0 | 0 | | |
| | from large pool | 0 | 0 | 0 | 0 | | |
| | from small pool | 0 | 0 | 0 | 0 | | |
| |---------------------------------------------------------------------------| | |
| | Oversize allocations | 0 | 0 | 0 | 0 | | |
| |---------------------------------------------------------------------------| | |
| | Oversize GPU segments | 0 | 0 | 0 | 0 | | |
| |===========================================================================| | |
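The table above is PyTorch's built-in CUDA caching-allocator report; it is the output of `torch.cuda.memory_summary()`. A minimal sketch of emitting the same report at the end of a run:

```python
import torch

# Print the same allocator report as in the log above (device 0).
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=0, abbreviated=False))
```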
| You are given a problem. | |
| Think about the problem and provide your working out. | |
| Place it between <start_working_out> and <end_working_out>. | |
| Then, provide your solution between <start_working_out> and <end_working_out>, then your answer between <SOLUTION></SOLUTION><|im_end|>Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$ is positive, find all possible values of | |
| [rank0]: Traceback (most recent call last): | |
| [rank0]: File "/home/andrewor/scratch/datta_test_script.py", line 183, in <module> | |
| [rank0]: _ = model.generate( | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/peft/peft_model.py", line 1973, in generate | |
| [rank0]: outputs = self.base_model.generate(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/models/llama.py", line 2036, in unsloth_fast_generate | |
| [rank0]: output = self._old_generate(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context | |
| [rank0]: return func(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/generation/utils.py", line 2539, in generate | |
| [rank0]: result = self._sample( | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/generation/utils.py", line 2867, in _sample | |
| [rank0]: outputs = self(**model_inputs, return_dict=True) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| [rank0]: return self._call_impl(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| [rank0]: return forward_call(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/models/llama.py", line 1327, in _CausalLM_fast_forward | |
| [rank0]: outputs = self.model( | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| [rank0]: return self._call_impl(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| [rank0]: return forward_call(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/models/llama.py", line 1105, in LlamaModel_fast_forward | |
| [rank0]: layer_outputs = decoder_layer( | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in __call__ | |
| [rank0]: return super().__call__(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| [rank0]: return self._call_impl(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| [rank0]: return forward_call(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/models/llama.py", line 728, in LlamaDecoderLayer_fast_forward | |
| [rank0]: hidden_states, self_attn_weights, present_key_value = self.self_attn( | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| [rank0]: return self._call_impl(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| [rank0]: return forward_call(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/models/qwen3.py", line 90, in Qwen3Attention_fast_forward | |
| [rank0]: Q, K, V = self.apply_qkv(self, hidden_states) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/kernels/fast_lora.py", line 527, in apply_lora_qkv | |
| [rank0]: Q, K, V = LoRA_QKV.apply( | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/autograd/function.py", line 581, in apply | |
| [rank0]: return super().apply(*args, **kwargs) # type: ignore[misc] | |
| [rank0]: File "/home/andrewor/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 527, in decorate_fwd | |
| [rank0]: return fwd(*args, **kwargs) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/kernels/fast_lora.py", line 382, in forward | |
| [rank0]: Q = matmul_lora(X, QW, QW_quant, QA, QB, QS) | |
| [rank0]: File "/home/andrewor/local/unsloth/unsloth/kernels/utils.py", line 1019, in matmul_lora | |
| [rank0]: out = torch_matmul(X, W.t(), out = out) | |
| [rank0]: File "/home/andrewor/local/ao/torchao/utils.py", line 609, in _dispatch__torch_function__ | |
| [rank0]: return func(*args, **kwargs) | |
| [rank0]: RuntimeError: Cannot set version_counter for inference tensor | |
| [rank0]:[W1118 14:15:29.020192022 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) |
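The run therefore crashes after training, during the post-training model.generate() call, inside torchao's `__torch_function__` dispatch at the LoRA matmul. "Cannot set version_counter for inference tensor" generally means a tensor that was created under `torch.inference_mode()` is later handled in a context that needs to manage its autograd version counter. The script itself is not included in the gist, so the following is only an illustrative check for that class of problem, not a confirmed fix for this crash (the helper name is made up):

```python
import torch

# Hypothetical diagnostic: list any inference-mode tensors among a model's
# parameters and buffers, since those cannot have their version counter set
# outside torch.inference_mode().
def find_inference_tensors(model: torch.nn.Module) -> list[str]:
    named = list(model.named_parameters()) + list(model.named_buffers())
    return [name for name, t in named if t.is_inference()]

# A regular (non-inference) copy of an offending tensor can be made outside
# inference mode with .clone():
#     fixed = t.clone() if t.is_inference() else t
```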