_process_group(backend, File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 602, in init_process_group File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 727, in _new_process_group_helper self.init_process_group(backend, timeout, init_method, rank, world_size) File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/deepspeed/comm/torch.py", line 40, in init_process_group train_result = trainer.train(resume_from_checkpoint=checkpoint) File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/transformers/trainer.py", line 1633, in train File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/transformers/trainer.py", line 1702, in _inner_training_loop deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs) File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/deepspeed/__init__.py", line 125, in initialize File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 257, in __init__ dist.init_distributed(dist_backend=self.dist_backend, File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 656, in init_distributed cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size) File "/home/user/miniconda3/envs/gptenv/lib/python3.8/site-packages/deepspeed/comm/torch.py", line 36, in __init__ However, when I run my script to train the model, I get the following error: File "protGPT_trainer.py", line 475, in <module> File "protGPT_trainer.py", line 438, in main The cluster also has multiple GPUs and CUDA v11.7.
I am running my scripts on a cluster that uses SLURM as its workload manager and Lmod as its environment module system. I have also created a conda environment and installed all the dependencies I need from Hugging Face Transformers. I am trying to fine-tune a ProtGPT-2 model using the following libraries and packages:
0 Comments
Leave a Reply. |
Author: Write something about yourself. No need to be fancy, just an overview. Archives | Categories |