From 6ff3e8695be704b6dc92b30aae19abc7647997d2 Mon Sep 17 00:00:00 2001
From: Yan Bai
Date: Wed, 11 Jun 2025 02:27:10 -0700
Subject: [PATCH] qwen2.5vl 7b report and guide

---
 docs/algo/baseline.md                    |  1 +
 .../run_qwen2_5_vl-7b-megatron.sh        | 45 ++++++++++++++++---
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/docs/algo/baseline.md b/docs/algo/baseline.md
index 4d23a9c15a8..c85e8602551 100644
--- a/docs/algo/baseline.md
+++ b/docs/algo/baseline.md
@@ -28,6 +28,7 @@ Refer to the table below to reproduce RL training from different pre-trained che
 | NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
 | NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) |
 | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl/tree/main/recipe/spin/README.md) |
+| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) |
 | AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) |
 | AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) |
 
diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
index 63791481657..1ad5141f943 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
@@ -5,11 +5,33 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation ov
 HF_MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct
 DIST_CKPT_PATH=${DIST_CKPT_PATH}
 
-# convert HF model to verl format
-# python scripts/converter_hf_to_verl.py --hf_model_path $HF_MODEL_PATH --output_dir $DIST_CKPT_PATH
+# convert the HF model to Megatron format offline
+# python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
 
-train_path=/data/geo3k/train.parquet
-test_path=/data/geo3k/test.parquet
+
+# megatron tuning guide:
+# 1. it is recommended to offload all states by setting ALL_OFFLOAD=True
+# 2. enable dynamic batch size by setting actor_rollout_ref.actor.use_dynamic_bsz=True, ref.log_prob_use_dynamic_bsz=True and rollout.log_prob_use_dynamic_bsz=True
+# 3. set ppo_max_token_len_per_gpu and log_prob_max_token_len_per_gpu as large as possible for better MFU (limited by GPU memory); make sure ppo_max_token_len_per_gpu > max_prompt_length + max_response_length; if the sequence length is too long, increase the TP/PP size
+# 4. if memory is very limited, enable full recompute, but MFU will be about 30% lower
+# full recompute settings:
+# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+
+ALL_OFFLOAD=${ALL_OFFLOAD:-True}
+COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
+COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
+COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}
+
+ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
+ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
+REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+
+
+train_path=$HOME/data/geo3k/train.parquet
+test_path=$HOME/data/geo3k/test.parquet
 
 python3 -m verl.trainer.main_ppo --config-path=config \
     --config-name='ppo_megatron_trainer.yaml'\
@@ -31,11 +53,16 @@ python3 -m verl.trainer.main_ppo --config-path=config \
     actor_rollout_ref.actor.kl_loss_coef=0.01 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.actor.use_dynamic_bsz=True \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5120 \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=20480 \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
     actor_rollout_ref.rollout.n=5 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1 \
@@ -44,6 +71,10 @@ python3 -m verl.trainer.main_ppo --config-path=config \
     actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
     actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
     actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+    actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
+    actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
+    actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
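
Usage sketch: a minimal end-to-end invocation assembled from the converter comment and the offload switches introduced in this patch. Only flags quoted in the script comment above are used; the checkpoint output directory is a placeholder, not a path taken from the patch.

# one-time offline conversion of the HF checkpoint to Megatron dist-checkpoint format
# (flags mirror the comment in the script; the output directory below is a placeholder)
export DIST_CKPT_PATH=/path/to/qwen2_5_vl_7b_mcore
python scripts/converter_hf_to_mcore.py \
    --hf_model_path Qwen/Qwen2.5-VL-7B-Instruct \
    --output_path $DIST_CKPT_PATH

# launch the example with all Megatron states offloaded; ALL_OFFLOAD=True cascades into the
# actor/ref param, grad and optimizer offload variables defined at the top of the script
ALL_OFFLOAD=True bash examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh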