Commit fe76101

Add a thin DAPO adapter on top of DeepGym rewards
Introduce a first-class DAPO integration module that exposes sync/async reward callbacks, shaped reward helpers, and small config/module generators for verl-style DAPO recipes, without embedding a separate DAPO trainer into DeepGym. This keeps DeepGym at the reward/execution boundary while making DAPO usage explicit and documented alongside the existing TRL, verl, OpenRLHF, and Axolotl integrations.

Constraint: DeepGym should remain a verifiable reward layer, not a second RL framework implementation
Rejected: Implement a standalone DAPO trainer in DeepGym | duplicates upstream training infrastructure and materially increases maintenance scope
Confidence: high
Scope-risk: narrow
Reversibility: clean
Directive: Keep future DAPO work thin and recipe-oriented unless DeepGym intentionally expands beyond the reward layer boundary
Tested: ruff check src/
Tested: ruff format --check src/
Tested: python3 -m pytest -q
Not-tested: Live end-to-end DAPO/verl training run against a real cluster config
1 parent 88de371 commit fe76101

4 files changed

Lines changed: 258 additions & 1 deletion


deepgym/README.md

Lines changed: 25 additions & 0 deletions
@@ -98,6 +98,31 @@ trainer = GRPOTrainer(model=model, reward_funcs=[reward_fn])
```python
trainer.train()
```

### Drop into DAPO

```python
from deepgym.integrations.dapo import make_dapo_reward_fn

reward_fn = make_dapo_reward_fn(env)
scores = reward_fn(completions=['def solve(x): return x'])
```

For verl-style DAPO recipes, DeepGym also exposes thin helpers to generate a
reward module and a minimal config snippet:

```python
from deepgym.integrations.dapo import (
    generate_dapo_reward_module,
    generate_dapo_verl_config,
)

reward_module = generate_dapo_reward_module('coin_change')
config_yaml = generate_dapo_verl_config(
    train_files='data/train.parquet',
    reward_module_path='reward_module.py',
)
```
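Not shown in this hunk: the commit also adds `make_dapo_shaped_reward_fn`. A minimal sketch of how it could slot into the same README flow, reusing `env` from the example above; the `'correctness'` component name is an illustrative assumption:

```python
from deepgym.integrations.dapo import make_dapo_shaped_reward_fn

# Score completions by a single shaped component instead of the aggregate
# score; results missing that component fall back to their aggregate score.
shaped_fn = make_dapo_shaped_reward_fn(env, component='correctness')
shaped_scores = shaped_fn(completions=['def solve(x): return x'])
```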
### Train on repo patches with SWE-bench Pro

deepgym/src/deepgym/integrations/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
```
Available integrations:
- axolotl: Axolotl GRPO reward functions, PRM dataset generation, config helpers
- dapo: Thin DAPO reward/config helpers for verl-style training recipes
- trl: HuggingFace TRL GRPOTrainer reward functions
- verl: ByteDance verl compute_score and batch reward functions
- openrlhf: OpenRLHF reward server FastAPI router
```
deepgym/src/deepgym/integrations/dapo.py (new file; the path follows from the imports and the src layout above)

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
```python
"""DAPO integration built on top of DeepGym's reward engine.

DAPO is commonly run through ``verl`` recipes rather than a standalone trainer
API. DeepGym keeps the integration thin: expose reward callbacks and helper
config/module generators that fit naturally into an external DAPO training run.
"""

from __future__ import annotations

import asyncio
from collections.abc import Awaitable, Callable

from deepgym.core import DeepGym
from deepgym.models import Environment


def make_dapo_reward_fn(
    env: Environment,
    dg: DeepGym | None = None,
    max_parallel: int = 32,
) -> Callable[..., list[float]]:
    """Create a sync reward function for DAPO-style training loops.

    The callable accepts decoded completions plus arbitrary per-sample metadata
    and returns one scalar reward per completion.
    """
    _dg = dg or DeepGym(mode='auto')

    def reward_fn(completions: list[str], **kwargs: object) -> list[float]:
        if not completions:
            return []
        batch = _dg.run_batch(
            env,
            completions,
            max_parallel=min(len(completions), max_parallel),
            **kwargs,
        )
        return [result.score for result in batch.results]

    return reward_fn


def make_dapo_async_reward_fn(
    env: Environment,
    dg: DeepGym | None = None,
    max_parallel: int = 32,
) -> Callable[..., Awaitable[list[float]]]:
    """Create an async reward function for DAPO-style training loops."""
    _dg = dg or DeepGym(mode='auto')

    async def reward_fn(completions: list[str], **kwargs: object) -> list[float]:
        if not completions:
            return []
        loop = asyncio.get_running_loop()
        batch = await loop.run_in_executor(
            None,
            lambda: _dg.run_batch(
                env,
                completions,
                max_parallel=min(len(completions), max_parallel),
                **kwargs,
            ),
        )
        return [result.score for result in batch.results]

    return reward_fn


def make_dapo_shaped_reward_fn(
    env: Environment,
    dg: DeepGym | None = None,
    *,
    component: str | None = None,
    max_parallel: int = 32,
) -> Callable[..., list[float]]:
    """Return a specific shaped reward component for each completion.

    If ``component`` is ``None``, the aggregate DeepGym score is returned.
    """
    _dg = dg or DeepGym(mode='auto')

    def reward_fn(completions: list[str], **kwargs: object) -> list[float]:
        if not completions:
            return []
        batch = _dg.run_batch(
            env,
            completions,
            max_parallel=min(len(completions), max_parallel),
            **kwargs,
        )
        if component is None:
            return [result.score for result in batch.results]
        return [
            (result.reward_components or {}).get(component, result.score)
            for result in batch.results
        ]

    return reward_fn


_DAPO_VERL_CONFIG_TEMPLATE = """\
# verl DAPO config with DeepGym reward scoring
# Generated by deepgym.integrations.dapo
#
# Usage:
#   1. Write the generated reward module next to your training entrypoint
#   2. Point verl's custom reward hook at that module

algorithm:
  adv_estimator: dapo

data:
  train_files: {train_files}
  val_files: {val_files}
  prompt_key: {prompt_key}
  response_key: {response_key}

custom_reward_function:
  path: {reward_module_path}
  name: {reward_fn_name}

trainer:
  default_local_dir: {output_dir}
  total_epochs: {epochs}

rollout:
  n: {samples_per_prompt}

logging:
  project: {project_name}
"""


def generate_dapo_verl_config(
    *,
    train_files: str,
    reward_module_path: str,
    reward_fn_name: str = 'reward_fn',
    output_dir: str = 'outputs/dapo',
    val_files: str = '',
    prompt_key: str = 'prompt',
    response_key: str = 'response',
    epochs: int = 1,
    samples_per_prompt: int = 8,
    project_name: str = 'deepgym-dapo',
) -> str:
    """Generate a minimal verl-flavored DAPO config snippet.

    This is intentionally thin and focuses on the reward bridge fields DeepGym
    can own directly.
    """
    return _DAPO_VERL_CONFIG_TEMPLATE.format(
        train_files=train_files,
        val_files=val_files or train_files,
        prompt_key=prompt_key,
        response_key=response_key,
        reward_module_path=reward_module_path,
        reward_fn_name=reward_fn_name,
        output_dir=output_dir,
        epochs=epochs,
        samples_per_prompt=samples_per_prompt,
        project_name=project_name,
    )


_DAPO_REWARD_MODULE_TEMPLATE = """\
from deepgym import load_environment
from deepgym.integrations.dapo import make_dapo_reward_fn

env = load_environment({env_name!r})
reward_fn = make_dapo_reward_fn(env)
"""


def generate_dapo_reward_module(env_name: str) -> str:
    """Generate a tiny reward module for external DAPO/verl jobs."""
    return _DAPO_REWARD_MODULE_TEMPLATE.format(env_name=env_name)
```
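For quick smoke-testing outside a trainer, the async factory can be driven directly with `asyncio.run`. A minimal sketch: `load_environment` and the `'coin_change'` name follow the reward-module template above, while the standalone driver itself is illustrative:

```python
import asyncio

from deepgym import load_environment
from deepgym.integrations.dapo import make_dapo_async_reward_fn

# Build the async reward callback and score one completion. The scoring work
# runs on a default-executor thread via run_in_executor, so asyncio.run is a
# reasonable standalone driver for a smoke test.
env = load_environment('coin_change')
reward_fn = make_dapo_async_reward_fn(env)
scores = asyncio.run(reward_fn(completions=['def solve(x): return x']))
print(scores)  # one scalar reward per completion
```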
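Putting the two generators together for an external verl DAPO job could look like the sketch below; the file names and data path are illustrative assumptions that mirror the template's own usage comment:

```python
from pathlib import Path

from deepgym.integrations.dapo import (
    generate_dapo_reward_module,
    generate_dapo_verl_config,
)

# Step 1 of the template's usage comment: write the generated reward module
# next to the training entrypoint.
Path('reward_module.py').write_text(generate_dapo_reward_module('coin_change'))

# Step 2: emit a config whose custom_reward_function.path points at it.
Path('dapo_config.yaml').write_text(
    generate_dapo_verl_config(
        train_files='data/train.parquet',
        reward_module_path='reward_module.py',
    )
)
```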

deepgym/tests/test_integrations.py

Lines changed: 55 additions & 1 deletion
@@ -7,12 +7,19 @@
```python
from fastapi.testclient import TestClient

from deepgym.core import DeepGym
from deepgym.integrations.dapo import (
    generate_dapo_reward_module,
    generate_dapo_verl_config,
    make_dapo_async_reward_fn,
    make_dapo_reward_fn,
    make_dapo_shaped_reward_fn,
)
from deepgym.integrations.openrlhf import (
    OpenRLHFRewardRequest,
    OpenRLHFRewardResponse,
    create_openrlhf_router,
)
from deepgym.integrations.reward import RewardFunction
from deepgym.integrations.trl import make_trl_async_reward_fn, make_trl_reward_fn
from deepgym.integrations.verl import make_verl_compute_score, make_verl_reward_fn
from deepgym.models import Environment
```

The single deletion replaces `from deepgym.integrations.reward import AsyncRewardFunction, RewardFunction` with the `RewardFunction`-only import, since `AsyncRewardFunction` is no longer referenced in this file.
@@ -95,6 +102,53 @@ def test_verl_reward_fn_with_responses_key(self, env: Environment, local_dg: Dee
```python
        assert scores[0] >= 0.5


class TestDAPOIntegration:
    """Verify thin DAPO integration helpers."""

    def test_make_dapo_reward_fn_returns_callable(self, env: Environment) -> None:
        fn = make_dapo_reward_fn(env=env)
        assert callable(fn)

    def test_dapo_reward_fn_scores_good_solution(self, env: Environment, local_dg: DeepGym) -> None:
        fn = make_dapo_reward_fn(env=env, dg=local_dg)
        scores = fn(completions=[GOOD_SOLUTION])
        assert len(scores) == 1
        assert scores[0] >= 0.9

    def test_make_dapo_async_reward_fn_returns_callable(self, env: Environment) -> None:
        fn = make_dapo_async_reward_fn(env=env)
        assert callable(fn)

    def test_dapo_shaped_reward_fn_returns_component(
        self,
        local_dg: DeepGym,
    ) -> None:
        shaped_env = Environment(
            task='Return anything',
            verifier_code=(
                'return {"score": 0.4, "passed": False, '
                '"reward_components": {"correctness": 0.8, "style": 0.2}}\n'
            ),
        )
        fn = make_dapo_shaped_reward_fn(env=shaped_env, dg=local_dg, component='correctness')
        scores = fn(completions=['print("hi")\n'])
        assert scores == [0.8]

    def test_generate_dapo_verl_config_contains_expected_fields(self) -> None:
        config = generate_dapo_verl_config(
            train_files='data/train.parquet',
            reward_module_path='reward_module.py',
        )
        assert 'adv_estimator: dapo' in config
        assert 'custom_reward_function:' in config
        assert 'reward_module.py' in config

    def test_generate_dapo_reward_module_uses_dapo_reward_fn(self) -> None:
        module_text = generate_dapo_reward_module('coin_change')
        assert "load_environment('coin_change')" in module_text
        assert 'make_dapo_reward_fn' in module_text


class TestRewardFunction:
    """Verify the universal RewardFunction class."""
```

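The new tests lean on `env`, `local_dg`, and `GOOD_SOLUTION` defined earlier in this file, outside the hunks shown. A hypothetical sketch of their likely shape, inferred only from how they are used above; the real definitions may differ, and `mode='local'` in particular is an assumption:

```python
import pytest

from deepgym.core import DeepGym
from deepgym.models import Environment

# Hypothetical fixture shapes; inferred from usage, not taken from this diff.

@pytest.fixture
def env() -> Environment:
    # An environment whose verifier scores completions, as in shaped_env above.
    return Environment(
        task='Write a solver',
        verifier_code='return {"score": 1.0, "passed": True}\n',
    )

@pytest.fixture
def local_dg() -> DeepGym:
    # A DeepGym instance pinned to local execution for deterministic tests;
    # the 'local' mode name is an assumption based on the fixture's name.
    return DeepGym(mode='local')
```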