[feat] support Megatron-LM training via mcore_adapter (#9237)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
This commit is contained in:
Kingsley
2025-10-26 16:21:30 +08:00
committed by GitHub
parent 129e918106
commit 13170577b2
14 changed files with 671 additions and 8 deletions

View File

@@ -54,6 +54,10 @@ def launch():
)
command = sys.argv.pop(1) if len(sys.argv) > 1 else "help"
if is_env_enabled("USE_MCA"):
# force use torchrun
os.environ["FORCE_TORCHRUN"] = "1"
if command == "train" and (is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray())):
# launch distributed training
nnodes = os.getenv("NNODES", "1")