diff --git "a/train.log" "b/train.log" new file mode 100644--- /dev/null +++ "b/train.log" @@ -0,0 +1,13172 @@ +[2025-06-30 06:36:55,989] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:36:58,463] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +Detected VISIBLE_DEVICES=0,1,2,3,4,5,6,7: setting --include=localhost:0,1,2,3,4,5,6,7 +[2025-06-30 06:36:58,464] [INFO] [runner.py:607:main] cmd = /root/miniconda3/envs/openrlhf/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --module --enable_each_rank_log=None openrlhf.cli.train_sft --max_len 4096 --dataset /apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json --input_key query --output_key output --apply_chat_template --train_batch_size 128 --micro_train_batch_size 4 --pretrain /apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct --save_path /apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k --save_steps -1 --logging_steps 1 --eval_steps -1 --zero_stage 3 --max_epochs 1 --bf16 --flash_attn --learning_rate 5e-6 --packing_samples --use_wandb true --wandb_project magic_spell --wandb_run_name RLVR_sft_augmented_data_180k_20250630-0636 +[2025-06-30 06:36:59,956] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:02,414] [INFO] [launch.py:139:main] 0 NCCL_HOME=/usr/local/tccl +[2025-06-30 06:37:02,414] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2025-06-30 06:37:02,414] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0 +[2025-06-30 06:37:02,414] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2025-06-30 06:37:02,414] [INFO] [launch.py:164:main] dist_world_size=8 +[2025-06-30 06:37:02,414] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2025-06-30 06:37:02,415] [INFO] [launch.py:256:main] process 856597 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=0', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,416] [INFO] [launch.py:256:main] process 856598 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=1', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,417] [INFO] [launch.py:256:main] process 856599 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=2', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,418] [INFO] [launch.py:256:main] process 856600 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=3', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,418] [INFO] [launch.py:256:main] process 856601 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=4', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,419] [INFO] [launch.py:256:main] process 856602 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=5', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,420] [INFO] [launch.py:256:main] process 856603 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=6', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:02,420] [INFO] [launch.py:256:main] process 856604 spawned with command: ['/root/miniconda3/envs/openrlhf/bin/python', '-u', '-m', 'openrlhf.cli.train_sft', '--local_rank=7', '--max_len', '4096', '--dataset', '/apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/reward_data_180k_augmented.json', '--input_key', 'query', '--output_key', 'output', '--apply_chat_template', '--train_batch_size', '128', '--micro_train_batch_size', '4', '--pretrain', '/apdcephfs/share_300000800/user/dyu/share/model/pretrain/Qwen/Qwen2.5-7B-Instruct', '--save_path', '/apdcephfs/share_300000800/user/yulaizhao/rlhf/checkpoints/RLVR_sft_augmented_data_180k', '--save_steps', '-1', '--logging_steps', '1', '--eval_steps', '-1', '--zero_stage', '3', '--max_epochs', '1', '--bf16', '--flash_attn', '--learning_rate', '5e-6', '--packing_samples', '--use_wandb', 'true', '--wandb_project', 'magic_spell', '--wandb_run_name', 'RLVR_sft_augmented_data_180k_20250630-0636'] +[2025-06-30 06:37:06,795] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,848] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,906] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,925] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,945] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,946] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,955] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:06,955] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-06-30 06:37:08,074] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,207] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,207] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-06-30 06:37:08,452] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,489] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-06-30 06:37:08,692] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-06-30 06:37:08,751] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,753] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,754] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,754] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,755] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-06-30 06:37:08,791] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-06-30 06:37:08,794] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-06-30 06:37:08,795] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-06-30 06:37:08,795] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-06-30 06:37:08,796] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-06-30 06:37:08,796] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +TENCENT64:856597:856597 [0] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856597:856597 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856597:856597 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856597:856597 [0] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856597:856597 [0] NCCL INFO cudaDriverVersion 12020 +NCCL version 2.21.5+cuda12.4 +TENCENT64:856601:856601 [4] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856598:856598 [1] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856604:856604 [7] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856599:856599 [2] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856602:856602 [5] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856603:856603 [6] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856600:856600 [3] NCCL INFO cudaDriverVersion 12020 +TENCENT64:856601:856601 [4] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856604:856604 [7] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856599:856599 [2] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856602:856602 [5] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856598:856598 [1] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856603:856603 [6] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856600:856600 [3] NCCL INFO Bootstrap : Using bond1:30.159.162.93<0> +TENCENT64:856601:856601 [4] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856601:856601 [4] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856601:856601 [4] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856604:856604 [7] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856604:856604 [7] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856604:856604 [7] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856599:856599 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856599:856599 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856599:856599 [2] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856600:856600 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856600:856600 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856600:856600 [3] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856602:856602 [5] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856602:856602 [5] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856602:856602 [5] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856603:856603 [6] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856603:856603 [6] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856603:856603 [6] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856598:856598 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +TENCENT64:856598:856598 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +TENCENT64:856598:856598 [1] NCCL INFO NET/Plugin: Using internal network plugin. +TENCENT64:856597:857871 [0] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856597:857871 [0] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856597:857871 [0] NCCL INFO Using network IB +TENCENT64:856604:857873 [7] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856604:857873 [7] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856604:857873 [7] NCCL INFO Using network IB +TENCENT64:856603:857878 [6] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856603:857878 [6] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856603:857878 [6] NCCL INFO Using network IB +TENCENT64:856602:857876 [5] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856602:857876 [5] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856602:857876 [5] NCCL INFO Using network IB +TENCENT64:856599:857874 [2] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856601:857872 [4] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856599:857874 [2] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856599:857874 [2] NCCL INFO Using network IB +TENCENT64:856601:857872 [4] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856601:857872 [4] NCCL INFO Using network IB +TENCENT64:856598:857877 [1] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856600:857875 [3] NCCL INFO NET/IB : Using [0]mlx5_bond_1:1/RoCE [1]mlx5_bond_2:1/RoCE [2]mlx5_bond_3:1/RoCE [3]mlx5_bond_4:1/RoCE [4]mlx5_bond_5:1/RoCE [5]mlx5_bond_6:1/RoCE [6]mlx5_bond_7:1/RoCE [7]mlx5_bond_8:1/RoCE [8]={[8] mlx5_0:1/RoCE, [9] mlx5_1:1/RoCE} [RO]; OOB bond1:30.159.162.93<0> +TENCENT64:856600:857875 [3] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856600:857875 [3] NCCL INFO Using network IB +TENCENT64:856598:857877 [1] NCCL INFO Using non-device net plugin version 0 +TENCENT64:856598:857877 [1] NCCL INFO Using network IB +TENCENT64:856601:857872 [4] NCCL INFO ncclCommInitRank comm 0xbc11380 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId a2000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856602:857876 [5] NCCL INFO ncclCommInitRank comm 0xaf5f740 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId a7000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856600:857875 [3] NCCL INFO ncclCommInitRank comm 0xb785090 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 69000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856598:857877 [1] NCCL INFO ncclCommInitRank comm 0xac3d6d0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 2b000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856604:857873 [7] NCCL INFO ncclCommInitRank comm 0xa8a21a0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e7000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856603:857878 [6] NCCL INFO ncclCommInitRank comm 0xada66d0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e1000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856599:857874 [2] NCCL INFO ncclCommInitRank comm 0xb7cd040 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 64000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856597:857871 [0] NCCL INFO ncclCommInitRank comm 0xa71bd30 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 25000 commId 0x4b26f6ae90d09515 - Init START +TENCENT64:856597:857871 [0] NCCL INFO Setting affinity for GPU 0 to ff,00000000,00000000,00000000,000000ff,00000000 +TENCENT64:856597:857871 [0] NCCL INFO NVLS multicast support is not available on dev 0 +TENCENT64:856598:857877 [1] NCCL INFO Setting affinity for GPU 1 to ff,00000000,00000000,00000000,000000ff,00000000 +TENCENT64:856598:857877 [1] NCCL INFO NVLS multicast support is not available on dev 1 +TENCENT64:856604:857873 [7] NCCL INFO Setting affinity for GPU 7 to ff,00000000,00000000,00000000,000000ff,00000000,00000000 +TENCENT64:856604:857873 [7] NCCL INFO NVLS multicast support is not available on dev 7 +TENCENT64:856599:857874 [2] NCCL INFO Setting affinity for GPU 2 to ff,00000000,00000000,00000000,000000ff +TENCENT64:856599:857874 [2] NCCL INFO NVLS multicast support is not available on dev 2 +TENCENT64:856600:857875 [3] NCCL INFO Setting affinity for GPU 3 to ff,00000000,00000000,00000000,000000ff +TENCENT64:856600:857875 [3] NCCL INFO NVLS multicast support is not available on dev 3 +TENCENT64:856601:857872 [4] NCCL INFO Setting affinity for GPU 4 to ff,00000000,00000000,00000000,000000ff,00000000,00000000,00000000 +TENCENT64:856601:857872 [4] NCCL INFO NVLS multicast support is not available on dev 4 +TENCENT64:856603:857878 [6] NCCL INFO Setting affinity for GPU 6 to ff,00000000,00000000,00000000,000000ff,00000000,00000000 +TENCENT64:856603:857878 [6] NCCL INFO NVLS multicast support is not available on dev 6 +TENCENT64:856602:857876 [5] NCCL INFO Setting affinity for GPU 5 to ff,00000000,00000000,00000000,000000ff,00000000,00000000,00000000 +TENCENT64:856602:857876 [5] NCCL INFO NVLS multicast support is not available on dev 5 +TENCENT64:856598:857877 [1] NCCL INFO comm 0xac3d6d0 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 +TENCENT64:856599:857874 [2] NCCL INFO comm 0xb7cd040 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 +TENCENT64:856597:857871 [0] NCCL INFO comm 0xa71bd30 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 +TENCENT64:856602:857876 [5] NCCL INFO comm 0xaf5f740 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 +TENCENT64:856603:857878 [6] NCCL INFO comm 0xada66d0 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0 +TENCENT64:856601:857872 [4] NCCL INFO comm 0xbc11380 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 +TENCENT64:856600:857875 [3] NCCL INFO comm 0xb785090 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 +TENCENT64:856604:857873 [7] NCCL INFO comm 0xa8a21a0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 +TENCENT64:856597:857871 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 01/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 02/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 03/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856598:857877 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 +TENCENT64:856597:857871 [0] NCCL INFO Channel 06/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856602:857876 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 +TENCENT64:856599:857874 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 +TENCENT64:856603:857878 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 +TENCENT64:856598:857877 [1] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856604:857873 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 +TENCENT64:856597:857871 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856601:857872 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 +TENCENT64:856600:857875 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 +TENCENT64:856601:857872 [4] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856602:857876 [5] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856599:857874 [2] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856603:857878 [6] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856604:857873 [7] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856597:857871 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856600:857875 [3] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856597:857871 [0] NCCL INFO Channel 09/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 10/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 11/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 14/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:857871 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 +TENCENT64:856597:857871 [0] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856598:857877 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Connected all rings +TENCENT64:856597:857871 [0] NCCL INFO Connected all rings +TENCENT64:856601:857872 [4] NCCL INFO Connected all rings +TENCENT64:856599:857874 [2] NCCL INFO Connected all rings +TENCENT64:856600:857875 [3] NCCL INFO Connected all rings +TENCENT64:856598:857877 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Connected all rings +TENCENT64:856604:857873 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Connected all rings +TENCENT64:856603:857878 [6] NCCL INFO Connected all rings +TENCENT64:856598:857877 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:857877 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:857873 [7] NCCL INFO Channel 15/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:857872 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856599:857874 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:857875 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:857878 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 13/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856602:857876 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:857871 [0] NCCL INFO Connected all trees +TENCENT64:856597:857871 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856597:857871 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856598:857877 [1] NCCL INFO Connected all trees +TENCENT64:856598:857877 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856598:857877 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856599:857874 [2] NCCL INFO Connected all trees +TENCENT64:856599:857874 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856599:857874 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856600:857875 [3] NCCL INFO Connected all trees +TENCENT64:856600:857875 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856600:857875 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856601:857872 [4] NCCL INFO Connected all trees +TENCENT64:856601:857872 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856601:857872 [4] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856602:857876 [5] NCCL INFO Connected all trees +TENCENT64:856602:857876 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856602:857876 [5] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856604:857873 [7] NCCL INFO Connected all trees +TENCENT64:856604:857873 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856604:857873 [7] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856603:857878 [6] NCCL INFO Connected all trees +TENCENT64:856603:857878 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856603:857878 [6] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856603:857878 [6] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856603:857878 [6] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856603:857878 [6] NCCL INFO ncclCommInitRank comm 0xada66d0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e1000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856597:857871 [0] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856597:857871 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856597:857871 [0] NCCL INFO ncclCommInitRank comm 0xa71bd30 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 25000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856599:857874 [2] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856599:857874 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856599:857874 [2] NCCL INFO ncclCommInitRank comm 0xb7cd040 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 64000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856600:857875 [3] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856600:857875 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856600:857875 [3] NCCL INFO ncclCommInitRank comm 0xb785090 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 69000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856602:857876 [5] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856602:857876 [5] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856602:857876 [5] NCCL INFO ncclCommInitRank comm 0xaf5f740 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId a7000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856601:857872 [4] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856601:857872 [4] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856601:857872 [4] NCCL INFO ncclCommInitRank comm 0xbc11380 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId a2000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856604:857873 [7] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856604:857873 [7] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856604:857873 [7] NCCL INFO ncclCommInitRank comm 0xa8a21a0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e7000 commId 0x4b26f6ae90d09515 - Init COMPLETE +TENCENT64:856598:857877 [1] NCCL INFO TUNER/Plugin: Plugin load returned 11 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +TENCENT64:856598:857877 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +TENCENT64:856598:857877 [1] NCCL INFO ncclCommInitRank comm 0xac3d6d0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 2b000 commId 0x4b26f6ae90d09515 - Init COMPLETE +[2025-06-30 06:37:11,243] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B + Loading checkpoint shards: 0%| | 0/4 [00:00 +[2025-06-30 06:37:57,993] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2025-06-30 06:37:57,993] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer + Map (num_proc=8): 79%|███████▉ | 4255/5391 [00:01<00:00, 5101.35 examples/s] Map (num_proc=8): 90%|█████████ | 4878/5391 [00:01<00:00, 5197.53 examples/s] Map (num_proc=8): 0%| | 21/5391 [00:00<02:35, 34.43 examples/s] Map (num_proc=8): 0%| | 0/5391 [00:00 +[2025-06-30 06:38:04,356] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-06-30 06:38:04,357] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] amp_params ................... False +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] comms_config ................. +[2025-06-30 06:38:04,358] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] dump_state ................... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2025-06-30 06:38:04,359] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 4 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-06-30 06:38:04,360] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] optimizer_params ............. None +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] pld_params ................... False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] steps_per_print .............. 100 +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] train_batch_size ............. 128 +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 4 +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] world_size ................... 8 +[2025-06-30 06:38:04,361] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-06-30 06:38:04,362] [INFO] [config.py:1003:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=True, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-06-30 06:38:04,362] [INFO] [config.py:1003:print] zero_enabled ................. True +[2025-06-30 06:38:04,362] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-06-30 06:38:04,362] [INFO] [config.py:1003:print] zero_optimization_stage ...... 3 +[2025-06-30 06:38:04,362] [INFO] [config.py:989:print_user_config] json = { + "steps_per_print": 100, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "sub_group_size": "auto", + "stage3_max_live_parameters": "auto", + "stage3_max_reuse_distance": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_prefetch_bucket_size": "auto", + "reduce_bucket_size": "auto", + "zero_hpz_partition_size": 1, + "zero_quantized_weights": false, + "zero_quantized_gradients": false + }, + "bf16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "data_types": { + "grad_accum_dtype": null + }, + "train_micro_batch_size_per_gpu": 4, + "train_batch_size": 128 +} +wandb: Currently logged in as: sarosavo to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: - Waiting for wandb.init()... wandb: \ Waiting for wandb.init()... wandb: | Waiting for wandb.init()... wandb: Tracking run with wandb version 0.19.6 +wandb: Run data is saved locally in /apdcephfs/share_300000800/user/yulaizhao/codes/rl_grounding/0304/make_reward_sft/wandb/run-20250630_063805-afb04fb4-d0ce-4a7e-9a3c-547fe30cc0d8 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run RLVR_sft_augmented_data_180k_20250630-0636 +wandb: ⭐️ View project at https://wandb.ai/sarosavo/magic_spell +wandb: 🚀 View run at https://wandb.ai/sarosavo/magic_spell/runs/afb04fb4-d0ce-4a7e-9a3c-547fe30cc0d8 + Train epoch: 0%| | 0/1 [00:005->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 +TENCENT64:856604:861125 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 +TENCENT64:856599:861126 [2] NCCL INFO comm 0x7fcd04067390 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 +TENCENT64:856602:861129 [5] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856601:861124 [4] NCCL INFO comm 0x7fae74067050 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 +TENCENT64:856600:861127 [3] NCCL INFO comm 0x7f18e8068360 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 +TENCENT64:856597:861128 [0] NCCL INFO comm 0x7f0bec068160 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 +TENCENT64:856604:861125 [7] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856603:861123 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 +TENCENT64:856603:861123 [6] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856597:861128 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856601:861124 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 +TENCENT64:856599:861126 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 +TENCENT64:856597:861128 [0] NCCL INFO Channel 01/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856601:861124 [4] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856598:861130 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 +TENCENT64:856599:861126 [2] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856600:861127 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 +TENCENT64:856597:861128 [0] NCCL INFO Channel 02/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856598:861130 [1] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856600:861127 [3] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856597:861128 [0] NCCL INFO Channel 03/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 06/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 09/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 10/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 11/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 14/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 7 +TENCENT64:856597:861128 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 +TENCENT64:856597:861128 [0] NCCL INFO P2P Chunksize set to 524288 +TENCENT64:856602:861129 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Connected all rings +TENCENT64:856598:861130 [1] NCCL INFO Connected all rings +TENCENT64:856597:861128 [0] NCCL INFO Connected all rings +TENCENT64:856599:861126 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Connected all rings +TENCENT64:856598:861130 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Connected all rings +TENCENT64:856598:861130 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Connected all rings +TENCENT64:856604:861125 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Connected all rings +TENCENT64:856603:861123 [6] NCCL INFO Connected all rings +TENCENT64:856599:861126 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856599:861126 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM/read +TENCENT64:856598:861130 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856604:861125 [7] NCCL INFO Channel 15/0 : 7[7] -> 6[6] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856601:861124 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856600:861127 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 13/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856602:861129 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856603:861123 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM/read +TENCENT64:856597:861128 [0] NCCL INFO Connected all trees +TENCENT64:856597:861128 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856597:861128 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856598:861130 [1] NCCL INFO Connected all trees +TENCENT64:856598:861130 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856598:861130 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856599:861126 [2] NCCL INFO Connected all trees +TENCENT64:856599:861126 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856599:861126 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856600:861127 [3] NCCL INFO Connected all trees +TENCENT64:856600:861127 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856600:861127 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856604:861125 [7] NCCL INFO Connected all trees +TENCENT64:856604:861125 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856604:861125 [7] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856601:861124 [4] NCCL INFO Connected all trees +TENCENT64:856601:861124 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856601:861124 [4] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856603:861123 [6] NCCL INFO Connected all trees +TENCENT64:856602:861129 [5] NCCL INFO Connected all trees +TENCENT64:856603:861123 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856603:861123 [6] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856602:861129 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +TENCENT64:856602:861129 [5] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +TENCENT64:856603:861123 [6] NCCL INFO ncclCommSplit comm 0x7f66fc066f50 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e1000 parent 0xada66d0 color -934961569 key 6 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856597:861128 [0] NCCL INFO ncclCommSplit comm 0x7f0bec068160 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 25000 parent 0xa71bd30 color -934961569 key 0 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856601:861124 [4] NCCL INFO ncclCommSplit comm 0x7fae74067050 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId a2000 parent 0xbc11380 color -934961569 key 4 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856602:861129 [5] NCCL INFO ncclCommSplit comm 0x7f99a0067130 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId a7000 parent 0xaf5f740 color -934961569 key 5 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856598:861130 [1] NCCL INFO ncclCommSplit comm 0x7f625c066fc0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 2b000 parent 0xac3d6d0 color -934961569 key 1 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856600:861127 [3] NCCL INFO ncclCommSplit comm 0x7f18e8068360 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 69000 parent 0xb785090 color -934961569 key 3 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856599:861126 [2] NCCL INFO ncclCommSplit comm 0x7fcd04067390 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 64000 parent 0xb7cd040 color -934961569 key 2 commId 0xde142fdc93a738ae - Init COMPLETE +TENCENT64:856604:861125 [7] NCCL INFO ncclCommSplit comm 0x7f7a18066fa0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e7000 parent 0xa8a21a0 color -934961569 key 7 commId 0xde142fdc93a738ae - Init COMPLETE + + Train step of epoch 0: 0%| | 0/5616 [00:03