[v1] add init on rank0 for fsdp2 (#10264)

2026-03-30 15:27:44 +08:00 · 2026-03-27 14:54:03 +08:00
parent d02fcd3588
commit df2e6edb7e
9 changed files with 84 additions and 12 deletions
--- a/examples/v1/train_freeze/train_freeze_sft.yaml
+++ b/examples/v1/train_freeze/train_freeze_sft.yaml
@@ -1,5 +1,4 @@
 model: Qwen/Qwen3-4B
 trust_remote_code: true
 model_class: llm
 template: qwen3_nothink
--- a/examples/v1/train_full/train_full_deepspeed.yaml
+++ b/examples/v1/train_full/train_full_deepspeed.yaml
@@ -1,5 +1,4 @@
 model: Qwen/Qwen3-0.6B
 model_class: llm
 template: qwen3_nothink
--- a/examples/v1/train_full/train_full_fsdp2.yaml
+++ b/examples/v1/train_full/train_full_fsdp2.yaml
@@ -1,5 +1,4 @@
 model: Qwen/Qwen3-0.6B
 trust_remote_code: true
 model_class: llm
 template: qwen3_nothink
--- a/examples/v1/train_lora/train_lora_sft.yaml
+++ b/examples/v1/train_lora/train_lora_sft.yaml
@@ -1,5 +1,4 @@
 model: Qwen/Qwen3-4B
 trust_remote_code: true
 model_class: llm
 template: qwen3_nothink
@@ -28,7 +27,6 @@ train_dataset: data/v1_sft_demo.yaml
 ### training
 output_dir: ./outputs/test_lora
 micro_batch_size: 1
 global_batch_size: 4
 cutoff_len: 2048
 learning_rate: 1.0e-4
 bf16: true
--- a/examples/v1/train_lora/train_lora_sft_rank0.yaml
+++ b/examples/v1/train_lora/train_lora_sft_rank0.yaml
@@ -0,0 +1,40 @@
 model: Qwen/Qwen3-4B
 model_class: llm
 template: qwen3_nothink
 # PEFT Configuration
 peft_config:
  name: lora
  r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  target_modules: all
 # Kernel Config
 kernel_config:
  name: auto
  include_kernels: auto
 # FSDP Config
 dist_config:
  name: fsdp2
  dcp_path: null
 init_config:
  name: init_on_rank0
 ### data
 train_dataset: data/v1_sft_demo.yaml
 ### training
 output_dir: ./outputs/test_lora
 micro_batch_size: 1
 cutoff_len: 2048
 learning_rate: 1.0e-4
 bf16: true
 max_steps: 10
 ### sample
 sample_backend: hf
 max_new_tokens: 128
--- a/examples/v1/train_qlora/quantization.yaml
+++ b/examples/v1/train_qlora/quantization.yaml
@@ -1,5 +1,4 @@
 model: Qwen/Qwen3-0.6B
 trust_remote_code: true
 model_class: llm
 template: qwen3_nothink
--- a/src/llamafactory/v1/core/model_engine.py
+++ b/src/llamafactory/v1/core/model_engine.py
@@ -140,6 +140,9 @@ class ModelEngine:
                **init_kwargs,
            )
        init_mode = self.args.init_config.name if self.args.init_config is not None else "init_on_default"
        model._init_mode = init_mode
        if self.args.peft_config is None:
            if self.is_train:
                logger.info_rank0("Fine-tuning mode: full tuning")
@@ -147,6 +150,9 @@ class ModelEngine:
            else:
                logger.info_rank0("Inference the original model")
        else:
            if self.args.peft_config.name == "lora" and init_mode == "init_on_meta":
                raise ValueError("Currently lora stage does not support loading model by meta.")
            from ..plugins.model_plugins.peft import PeftPlugin
            model = PeftPlugin(self.args.peft_config.name)(model, self.args.peft_config, self.is_train)
--- a/src/llamafactory/v1/plugins/model_plugins/peft.py
+++ b/src/llamafactory/v1/plugins/model_plugins/peft.py
@@ -150,9 +150,6 @@ def load_adapter(model: HFModel, adapter_name_or_path: Union[list[str], str], is
@PeftPlugin("lora").register()
 def get_lora_model(model: HFModel, config: LoraConfigDict, is_train: bool = False) -> HFModel:
    if model.device.type == "meta":
        raise ValueError("Currently lora stage does not support loading model by meta.")
    adapter_name_or_path = config.get("adapter_name_or_path")
    if adapter_name_or_path:
--- a/src/llamafactory/v1/plugins/trainer_plugins/distributed/fsdp2.py
+++ b/src/llamafactory/v1/plugins/trainer_plugins/distributed/fsdp2.py
@@ -17,6 +17,7 @@ import gc
 import os
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from peft.tuners.lora import LoraLayer
 from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict, set_model_state_dict
@@ -244,23 +245,57 @@ class FSDP2Engine:
            logger.info(f"Restored {len(saved_buffers)} non-persistent buffers")
    # Shard `model` with FSDP2 and return it. How weights get populated depends on the
    # `_init_mode` attribute read from the model ("init_on_default" when it is absent):
    #   * "init_on_rank0": rank 0 supplies the full state dict, which is broadcast into the shards.
    #   * "init_on_meta": the sharded model is materialized and loaded via `materialize_and_load`.
    #   * anything else: the model is only passed through `prepare_model`.
    def shard_model(self, model: HFModel) -> HFModel:
-        if model.device.type == "meta":
+        init_mode = getattr(model, "_init_mode", "init_on_default")
        if init_mode == "init_on_rank0":
            if getattr(model.config, "tie_word_embeddings", False):
                model.tie_weights()
            if self.rank == 0:
                logger.info("init_on_rank0 detected: sharding then scattering Rank 0 CPU weights.")
                # Snapshot every entry before sharding; presumably the clone keeps the values
                # alive across `prepare_model` / `to_empty` below -- TODO confirm it is required.
                full_sd = {k: v.clone() for k, v in model.state_dict().items()}
            else:
                # Other ranks pass an empty dict and rely on the broadcast from rank 0.
                full_sd = {}
            # Save persistent=False buffers (e.g. inv_freq) on rank 0 before sharding, using the existing helper.
            saved_buffers = self._save_non_persistent_buffers(model) if self.rank == 0 else {}
            model = self.prepare_model(model)
            device = get_current_accelerator()
            # Move parameters/buffers to the accelerator without copying their contents;
            # the values are filled in by `set_model_state_dict` below.
            model.to_empty(device=device)
            # Load rank 0's full state dict into the sharded model. With `broadcast_from_rank0=True`
            # only rank 0 has to provide the full state dict; the other ranks receive their tensors from it.
            options = StateDictOptions(full_state_dict=True, cpu_offload=True, broadcast_from_rank0=True)
            set_model_state_dict(model, full_sd, options=options)
            # Broadcast rank 0's saved non-persistent buffers, then restore them on every rank.
            # NOTE(review): `src=0` is a global rank in torch.distributed, and the buffers are only
            # saved where `self.rank == 0` -- confirm the fsdp_mesh group always contains that rank.
            buffers_to_sync = [saved_buffers]
            dist.broadcast_object_list(buffers_to_sync, src=0, group=self.fsdp_mesh.get_group())
            self._restore_non_persistent_buffers(model, buffers_to_sync[0])
            # NOTE(review): unlike the "init_on_meta" branch, weights are not re-tied after loading
            # here -- confirm this is intentional.
            if self.rank == 0:
                logger.info("init_on_rank0 sync complete.")
        elif init_mode == "init_on_meta":
            # Every rank saves its own non-persistent buffers before sharding.
            non_persistent_buffers = self._save_non_persistent_buffers(model)
-            if getattr(model.config, "tie_word_embeddings", None):
+            if getattr(model.config, "tie_word_embeddings", False):
                model.tie_weights()
            model = self.prepare_model(model)
            model = self.materialize_and_load(model, hf_model_path=model.config.name_or_path, dcp_path=self.dcp_path)
            # Re-tie the weights after loading: the tie can be broken in the no-fsdp-wrap case.
-            if getattr(model.config, "tie_word_embeddings", None):
+            if getattr(model.config, "tie_word_embeddings", False):
                model.tie_weights()
            self._restore_non_persistent_buffers(model, non_persistent_buffers)
        else:
            # Default init mode: only the sharding/wrapping step is applied.
            model = self.prepare_model(model)
        return model
    def _load_from_dcp(self, model: HFModel, dcp_path: str):