diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors index 0c72a2885d475b19313210f447b817a4e1e227c9..106d31e3de550072b23008b0cc439fb032935924 100644 --- a/model-00001-of-00004.safetensors +++ b/model-00001-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5248e85868cfadf673f21aff3553bda95fc725be514fcb518f8605431cda1370 +oid sha256:44f5977251fb7a78e7eba90933c06e4d9fdafb7c482f49341ad839dcb135d7fe size 4874843752 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors index 734851392eb19af41a5948ff22900a083d320070..31f2c2b0031e4359f33715aa246fa8b94a18f368 100644 --- a/model-00002-of-00004.safetensors +++ b/model-00002-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adaaf154d498e47b508bd058351d88fae666bd6923ef97752791cecf35f1a5ad +oid sha256:f69ded7665946b680ec2e95710390f0e81ccf697dfdce3fab0d3510896110f46 size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors index 695c1cae4e6e5323ccb2e6e736d1d07c32f82c1d..93a1538e75dc32d9c1e3c169d2321908d742aa13 100644 --- a/model-00003-of-00004.safetensors +++ b/model-00003-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:deaded19ac2798595b85aa83a5b121de1ccb598159e52f793d41caeb5ee6bee0 +oid sha256:db3bec760b17695fd96cb40d1488940e3ad73f9a01e5510ed76aeb4ef6b1078a size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors index cf22bf82b1aceb022f52251115c6eee5353ead32..0a34d4255c31a8f80a759f64f2bd78cebbad1b66 100644 --- a/model-00004-of-00004.safetensors +++ b/model-00004-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:813e96187ef9e2dec23fdf90b62ac1ae750a434b9a0e73cd0847ce4b79035f8a +oid sha256:99f67e9a3557ea5d22a0edb4f723768e5002f064a8a279081df9c3a2faa80834 size 1087177856 diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log index bad7b0e2c783bdea166c52adc1c5c5a2b4b0845e..2b6fdd216525649f3628c81512f79937ffac83a0 100644 --- a/wandb/debug-internal.log +++ b/wandb/debug-internal.log @@ -371,3 +371,263 @@ 2024-11-13 17:19:10,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status 2024-11-13 17:19:10,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status 2024-11-13 17:19:11,531 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:16,532 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:21,533 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:23,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:19:24,981 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:19:25,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:19:25,426 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:19:27,525 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:32,526 DEBUG 
HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:37,526 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:37,986 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:19:38,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:19:40,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:19:40,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:19:43,503 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:46,989 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:19:49,064 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:53,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:19:54,065 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:55,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:19:55,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:19:59,572 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:04,573 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:07,011 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 17:20:07,012 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 17:20:07,013 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:20:07,013 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:20:07,988 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:20:07,998 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +2024-11-13 17:20:07,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:08,654 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 17:20:08,655 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 17:20:08,656 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:20:08,657 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:20:08,908 DEBUG HandlerThread:1939 
[handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:08,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +2024-11-13 17:20:08,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:09,658 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:09,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:10,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:10,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:10,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:15,532 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:20,533 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:23,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:25,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:25,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:25,533 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:30,534 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:33,008 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:35,587 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:37,991 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:20:38,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:40,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:40,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:41,574 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:46,575 DEBUG 
HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:51,575 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:53,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:55,021 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:55,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:55,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:56,595 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:01,596 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:06,597 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:07,994 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:21:08,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:21:10,428 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:10,428 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:12,570 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:17,036 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:21:17,796 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:22,796 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:23,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:21:25,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:25,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:28,520 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:33,520 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:37,996 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:21:38,553 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:38,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:21:40,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:40,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:41,048 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: 
/opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:21:44,512 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:49,513 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:54,514 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:58,921 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:58,921 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:59,233 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:00,115 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:05,116 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:07,999 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:22:11,001 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:13,345 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:13,345 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:13,345 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:16,483 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:21,484 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:26,485 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:29,071 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:29,261 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:29,261 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:29,593 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:32,181 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:37,181 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:38,002 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:22:43,004 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:43,839 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:43,839 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:43,840 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:45,079 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: 
/opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:47,080 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:48,290 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:49,081 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:51,082 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:53,083 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:53,501 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:55,084 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:57,085 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:59,086 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:59,339 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:59,339 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:59,486 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:59,961 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:04,808 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:08,005 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:23:10,007 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: 
status_report +2024-11-13 17:23:15,008 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:16,870 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:17,878 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:17,878 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:23:20,977 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:25,978 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:30,117 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:31,167 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:31,167 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:23:31,334 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:33,101 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:23:36,842 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:38,008 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:23:42,009 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:43,918 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:44,229 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:44,229 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:23:47,330 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:52,331 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:57,331 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:58,927 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:59,229 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:59,229 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:02,381 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:07,382 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:08,010 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:24:13,012 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:18,013 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:20,080 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:24:20,081 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: 
stop_status +2024-11-13 17:24:20,425 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:24:23,197 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:28,198 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:33,199 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:34,975 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:24:34,975 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:24:34,976 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:38,013 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:24:39,015 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:43,133 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:24:44,952 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:49,952 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:50,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:24:50,909 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:51,584 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:24:55,782 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:00,783 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:05,028 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:05,056 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:05,057 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:05,144 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:06,319 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:07,145 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:08,016 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:25:09,146 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: 
/opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:11,173 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:11,566 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:13,229 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:16,231 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:16,767 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:18,232 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:20,233 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:22,557 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:24,219 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:24,220 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:24,289 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:28,289 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:33,289 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:38,019 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:25:38,279 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:38,279 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:38,373 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:39,360 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:44,045 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:49,045 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: 
status_report +2024-11-13 17:25:54,046 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:55,068 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:55,069 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:55,314 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:59,148 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:01,619 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,620 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 17:26:01,622 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,622 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,622 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,622 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,623 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,623 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,623 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,623 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,624 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,624 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 17:26:01,624 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,624 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:02,253 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +2024-11-13 17:26:02,254 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:26:04,255 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:26:04,625 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:08,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:08,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:08,130 
DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:26:08,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:10,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:15,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:20,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:23,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:23,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:23,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:26,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:31,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:36,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:38,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:38,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:38,164 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:26:38,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:42,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:47,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:52,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:53,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:53,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:53,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:58,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:03,303 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:08,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:27:08,015 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:27:08,168 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:27:08,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:27:09,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:14,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:19,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:23,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:27:23,015 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:27:23,300 DEBUG HandlerThread:1939 
[handler.py:handle_request():146] handle_request: internal_messages diff --git a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log index d8d48e13f68a73c59210a1f10182f8ba9d7e3c9f..7ab5b3c0623500f2bf89eb03cac1e6bd72257748 100644 --- a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +++ b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log @@ -1423,3 +1423,2516 @@ I1113 17:16:56.904300 140013302830912 fsdp_utils.py:195] Optimizer state saved i + + + 79%|███████▉ | 19/24 [09:22<02:16, 27.37s/it] + 83%|████████▎ | 20/24 [09:44<01:42, 25.74s/it] + 0%| | 0/2 [00:00, {'preprocessing': 0.034162452999225934, 'preprocessing_with_comm': 0.0033938859996851534, : 0.1119009219983127, : 0.08641748499576352, : 12.482803792002414, : 12.728741018000619, 'state_converting': 12.730824801999916, : 12.77104181300001}) +I1113 17:22:57.499208 140013302830912 fsdp_utils.py:193] Saving Optimizer state to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/checkpoint-24/optimizer.bin +I1113 17:23:31.839883 140013302830912 fsdp_utils.py:195] Optimizer state saved in /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/checkpoint-24/optimizer.bin +I1113 17:24:41.950384 140013302830912 fsdp_utils.py:89] Saving model to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/checkpoint-24/pytorch_model_fsdp.bin +I1113 17:25:04.388763 140013302830912 fsdp_utils.py:91] Model saved to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/checkpoint-24/pytorch_model_fsdp.bin +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103609 GiB | 103604 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103569 GiB | 103563 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | 
+[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103609 GiB | 103604 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103569 GiB | 103563 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103603 GiB | 103597 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103562 GiB | 103557 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456849 KiB | 18699 MiB | 121853 GiB | 121852 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121812 GiB | 121811 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3300 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 17:25:05,318] 
torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462688 | 462668 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373920 | 373911 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88768 | 88757 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:05,318] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103612 GiB | 103606 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103571 GiB | 103566 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] 
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103612 GiB | 103606 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103571 GiB | 103566 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103605 GiB | 103600 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103565 GiB | 103559 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121857 GiB | 121857 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121816 GiB | 121816 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] 
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462699 | 462679 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373921 | 373912 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88778 | 88767 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:06,824] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103612 GiB | 103607 GiB | +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103572 GiB | 103566 GiB | +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:07,074] 
torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103612 GiB | 103607 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103572 GiB | 103566 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103606 GiB | 103600 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103565 GiB | 103560 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121863 GiB | 121863 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121822 GiB | 121822 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462710 | 462690 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373923 | 373914 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88787 | 88776 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 |
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================|
+[rank0]:[2024-11-13 17:25:07,074] torch.distributed.fsdp._optim_utils: [WARNING]
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121932 GiB | 121932 GiB | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121891 GiB | 121891 GiB | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462843 | 462823 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373947 | 373938 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88896 | 88885 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: 
[WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:11,565] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103619 GiB | 103614 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103579 GiB | 103573 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103619 GiB | 103614 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103579 GiB | 103573 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103613 GiB | 103607 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103572 GiB | 103567 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | 
GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121938 GiB | 121938 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121897 GiB | 121897 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462854 | 462834 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373949 | 373940 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88905 | 88894 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] 
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:11,970] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103620 GiB | 103614 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103579 GiB | 103574 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103620 GiB | 103614 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103579 GiB | 103574 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103613 GiB | 103608 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103573 GiB | 103567 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:12,366] 
torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121944 GiB | 121944 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121903 GiB | 121903 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462865 | 462845 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373951 | 373942 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88914 | 88903 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,366] 
torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:12,366] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103620 GiB | 103615 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103580 GiB | 103574 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103620 GiB | 103615 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103580 GiB | 103574 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103614 GiB | 103608 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103573 GiB | 103568 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 
17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121950 GiB | 121949 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121909 GiB | 121908 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462876 | 462856 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373953 | 373944 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88923 | 88912 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: 
[WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:12,765] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103622 GiB | 103617 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103581 GiB | 103576 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103622 GiB | 103617 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103581 GiB | 103576 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121967 GiB | 121967 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121926 GiB | 121926 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from 
small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462909 | 462889 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373959 | 373950 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88950 | 88939 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:13,961] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] 
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103623 GiB | 103617 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103582 GiB | 103577 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103623 GiB | 103617 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103582 GiB | 103577 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103616 GiB | 103611 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121973 GiB | 121972 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121932 GiB | 121931 GiB | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,355] 
torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462920 | 462900 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373961 | 373952 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88959 | 88948 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:14,355] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] 
| CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103623 GiB | 103618 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103583 GiB | 103577 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103623 GiB | 103618 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103583 GiB | 103577 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103617 GiB | 103611 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103576 GiB | 103571 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121979 GiB | 121978 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121938 GiB | 121937 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] 
torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462931 | 462911 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373963 | 373954 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88968 | 88957 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:14,761] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103625 GiB | 103619 GiB | 
+[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103584 GiB | 103579 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103625 GiB | 103619 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103584 GiB | 103579 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103618 GiB | 103613 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103578 GiB | 103572 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121996 GiB | 121996 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121955 GiB | 121955 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 
17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462964 | 462944 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373969 | 373960 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88995 | 88984 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103625 GiB | 103620 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103585 GiB | 103579 GiB | 
+[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103625 GiB | 103620 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103585 GiB | 103579 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103619 GiB | 103613 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103578 GiB | 103573 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 122002 GiB | 122001 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121961 GiB | 121960 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:16,361] 
torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462975 | 462955 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373971 | 373962 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89004 | 88993 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:16,361] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103626 GiB | 103620 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103585 GiB | 103580 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 
17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103626 GiB | 103620 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103585 GiB | 103580 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103619 GiB | 103614 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103579 GiB | 103573 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 122008 GiB | 122007 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121967 GiB | 121966 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:16,767] 
torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462986 | 462966 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373973 | 373964 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89013 | 89002 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:16,767] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103626 GiB | 103621 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103586 GiB | 103580 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] 
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103626 GiB | 103621 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103586 GiB | 103580 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103620 GiB | 103614 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103579 GiB | 103574 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 122013 GiB | 122013 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121972 GiB | 121972 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] 
|---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462997 | 462977 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373975 | 373966 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89022 | 89011 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 17:25:17,164] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 17:25:15,963] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +{'train_runtime': 955.0891, 'train_samples_per_second': 13.306, 'train_steps_per_second': 0.025, 'train_loss': 0.6483509813745817, 'epoch': 1.94} diff --git a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json index 52b9bd38684591fed2fb554f68f5ade6030a145e..18124dbbdc56ab980c42de23781522f68167dbb6 100644 --- a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +++ b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json @@ -1 +1 @@ -{"train/loss": 0.7172, "train/grad_norm": 0.341796875, "train/learning_rate": 3.0865828381745515e-06, "train/epoch": 1.22, "train/global_step": 15, "_timestamp": 1731518293.2375202, "_runtime": 485.8045001029968, "_step": 6, "eval/loss": 0.6777312159538269, "eval/runtime": 1.6459, "eval/samples_per_second": 61.97, "eval/steps_per_second": 1.215} \ No newline at end of file +{"train/loss": 0.6062, "train/grad_norm": 0.318359375, "train/learning_rate": 6.698729810778065e-07, "train/epoch": 1.94, "train/global_step": 24, "_timestamp": 1731518761.6185403, "_runtime": 954.1855201721191, "_step": 9, "eval/loss": 0.677400529384613, "eval/runtime": 
1.6389, "eval/samples_per_second": 62.238, "eval/steps_per_second": 1.22, "train_runtime": 955.0891, "train_samples_per_second": 13.306, "train_steps_per_second": 0.025, "total_flos": 9.400248301309133e+16, "train_loss": 0.6483509813745817} \ No newline at end of file diff --git a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/logs/debug-internal.log b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/logs/debug-internal.log index bad7b0e2c783bdea166c52adc1c5c5a2b4b0845e..2b6fdd216525649f3628c81512f79937ffac83a0 100644 --- a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/logs/debug-internal.log +++ b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/logs/debug-internal.log @@ -371,3 +371,263 @@ 2024-11-13 17:19:10,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status 2024-11-13 17:19:10,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status 2024-11-13 17:19:11,531 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:16,532 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:21,533 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:23,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:19:24,981 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:19:25,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:19:25,426 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:19:27,525 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:32,526 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:37,526 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:37,986 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:19:38,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:19:40,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:19:40,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:19:43,503 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:46,989 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:19:49,064 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:53,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:19:54,065 DEBUG 
HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:19:55,426 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:19:55,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:19:59,572 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:04,573 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:07,011 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 17:20:07,012 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 17:20:07,013 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:20:07,013 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:20:07,988 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:20:07,998 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +2024-11-13 17:20:07,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:08,654 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 17:20:08,655 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 17:20:08,656 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:20:08,657 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:20:08,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:08,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +2024-11-13 17:20:08,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:09,658 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:09,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:10,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:10,427 DEBUG 
SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:10,999 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:15,532 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:20,533 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:23,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:25,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:25,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:25,533 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:30,534 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:33,008 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:35,587 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:37,991 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:20:38,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:40,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:40,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:41,574 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:46,575 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:51,575 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:20:53,908 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:20:55,021 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:20:55,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:20:55,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:20:56,595 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:01,596 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:06,597 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:07,994 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:21:08,909 DEBUG 
HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:21:10,428 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:10,428 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:12,570 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:17,036 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:21:17,796 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:22,796 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:23,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:21:25,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:25,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:28,520 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:33,520 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:37,996 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:21:38,553 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:38,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:21:40,427 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:40,427 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:41,048 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:21:44,512 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:49,513 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:54,514 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:21:58,921 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:21:58,921 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:21:59,233 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:00,115 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:05,116 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:07,999 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:22:11,001 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:13,345 DEBUG 
HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:13,345 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:13,345 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:16,483 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:21,484 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:26,485 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:29,071 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:29,261 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:29,261 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:29,593 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:32,181 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:37,181 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:38,002 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:22:43,004 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:43,839 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:22:43,839 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:43,840 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:45,079 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:47,080 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:48,290 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:49,081 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:51,082 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:53,083 
INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:53,501 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:55,084 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:57,085 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:59,086 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:22:59,339 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:22:59,339 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:22:59,486 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:22:59,961 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:04,808 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:08,005 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:23:10,007 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:15,008 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:16,870 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:17,878 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:17,878 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:23:20,977 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:25,978 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:30,117 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:31,167 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:31,167 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:23:31,334 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:33,101 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: 
/opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:23:36,842 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:38,008 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:23:42,009 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:43,918 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:44,229 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:44,229 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:23:47,330 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:52,331 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:57,331 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:23:58,927 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:23:59,229 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:23:59,229 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:02,381 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:07,382 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:08,010 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:24:13,012 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:18,013 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:20,080 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:24:20,081 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:20,425 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:24:23,197 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:28,198 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:33,199 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:34,975 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:24:34,975 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:24:34,976 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:38,013 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:24:39,015 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:43,133 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: 
/opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:24:44,952 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:49,952 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:24:50,909 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:24:50,909 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:24:51,584 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:24:55,782 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:00,783 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:05,028 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:05,056 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:05,057 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:05,144 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:06,319 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:07,145 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:08,016 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:25:09,146 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:11,173 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:11,566 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:13,229 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:16,231 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: 
/opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:16,767 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:18,232 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:20,233 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:25:22,557 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:24,219 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:24,220 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:24,289 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:28,289 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:33,289 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:38,019 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:25:38,279 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:38,279 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:38,373 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:39,360 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:44,045 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:49,045 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:54,046 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:25:55,068 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:25:55,069 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:25:55,314 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:25:59,148 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:01,619 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,620 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 17:26:01,622 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,622 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,622 INFO SenderThread:1939 [sender.py:_save_file():1403] 
saving file wandb-summary.json with policy end +2024-11-13 17:26:01,622 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,623 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,623 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,623 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,623 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,624 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:01,624 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 17:26:01,624 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 17:26:01,624 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 17:26:02,253 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/wandb-summary.json +2024-11-13 17:26:02,254 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:26:04,255 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-5_bs-64_acc-8_len-2048/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/files/output.log +2024-11-13 17:26:04,625 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:08,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:08,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:08,130 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:26:08,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:10,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:15,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:20,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:23,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:23,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:23,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:26,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:31,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:36,302 DEBUG 
HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:38,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:38,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:38,164 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:26:38,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:42,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:47,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:52,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:26:53,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:26:53,014 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:26:53,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:26:58,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:03,303 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:08,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:27:08,015 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:27:08,168 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 17:27:08,299 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 17:27:09,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:14,301 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:19,302 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 17:27:23,014 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 17:27:23,015 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 17:27:23,300 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages diff --git a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/run-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1.wandb b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/run-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1.wandb index 3ff9ee9d54e86414b19a49bbc3881634376f47fb..70956212c3effa57c85a47208192bffc8c4c5800 100644 Binary files a/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/run-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1.wandb and b/wandb/run-20241113_171007-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1/run-kushalarora-rvv-main-2024-11-13-16-43-09-915-0exw0n-algo-1.wandb differ