Merged
Commits
27 commits
71b2370
calendar, structured_outputs, workplace_assistant
Jan 28, 2026
0601e2b
Add missing mcqa hf validation
Jan 28, 2026
704849b
Fix crashing from null option values
Jan 28, 2026
153811c
equivalence llm judge changes
Feb 10, 2026
aa6d02c
Add preprocess script for rl
Feb 12, 2026
b777ae7
Remove stale run configs
Feb 14, 2026
8de002d
Prevent individual http call failure from crashing rest of rollout co…
Feb 14, 2026
e63a1be
More stale removals
Feb 14, 2026
c7a7300
Revert metrics
Feb 14, 2026
ecb3f87
Revert debugging logic
Feb 19, 2026
3c703a6
Improve equivalence_llm_judge robustness: truncation, validation erro…
Feb 19, 2026
5f3ffce
Fix math_with_judge answer parsing: strip math delimiters, prefer ext…
Feb 19, 2026
0a43a0c
Fix math_with_code answer extraction: brace-depth boxed parsing, answ…
Feb 19, 2026
c3fc3d9
revert math_with_judge warmup and reward inflation logic
Feb 20, 2026
fea274a
add explicit warning print for judge response validation fallback
Feb 20, 2026
93c6ce4
update equivalence judge prompt output format wording
Feb 20, 2026
c958f10
tighten math_with_code agent max_steps to reduce stragglers
Feb 20, 2026
b7c22b4
Linting
fsiino-nvidia Feb 21, 2026
2522428
Small comment revert
fsiino-nvidia Feb 23, 2026
9f433a2
Add per-row timeouts to prevent cascading crash
fsiino-nvidia Feb 23, 2026
a555a86
Refactor judge envs and configs, small revert
fsiino-nvidia Feb 24, 2026
95562d7
Fix missing import
fsiino-nvidia Feb 26, 2026
9270c18
Fix tests
fsiino-nvidia Feb 26, 2026
35b9927
Linting
fsiino-nvidia Feb 26, 2026
fdce63d
Revert judge model truncation
fsiino-nvidia Mar 3, 2026
936cd97
Merge remote-tracking branch 'github/main' into fsiino/pipecleaning-r…
fsiino-nvidia Mar 3, 2026
ab2cd12
Revert rollout_collection timeout
fsiino-nvidia Mar 3, 2026
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -180,7 +180,7 @@ Purpose: Training-ready environments with curated datasets.
| Math With Judge | <a href='resources_servers/math_with_judge/configs/math_stack_overflow.yaml'>math_stack_overflow.yaml</a> | math | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-stack_overflow'>Nemotron-RL-math-stack_overflow</a> | - | - | ✓ | ✓ | Creative Commons Attribution-ShareAlike 4.0 International |
| Math With Judge | <a href='resources_servers/math_with_judge/configs/math_with_judge.yaml'>math_with_judge.yaml</a> | math | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-OpenMathReasoning'>Nemotron-RL-math-OpenMathReasoning</a> | Math dataset with math-verify and LLM-as-a-judge | Improve math capabilities including AIME 24 / 25 | ✓ | ✓ | Creative Commons Attribution 4.0 International |
| Math With Judge | <a href='resources_servers/math_with_judge/configs/math_with_local_judge.yaml'>math_with_local_judge.yaml</a> | math | - | - | - | - | - | - |
| Mcqa | <a href='resources_servers/mcqa/configs/mcqa.yaml'>mcqa.yaml</a> | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-mcqa'>Nemotron-RL-knowledge-mcqa</a> | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | ✓ | - | Apache 2.0 |
| Mcqa | <a href='resources_servers/mcqa/configs/mcqa.yaml'>mcqa.yaml</a> | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-mcqa'>Nemotron-RL-knowledge-mcqa</a> | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | ✓ | | Apache 2.0 |
| Mini Swe Agent | <a href='resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml'>mini_swe_agent.yaml</a> | coding | <a href='https://huggingface.co/datasets/SWE-Gym/SWE-Gym'>SWE-Gym</a> | A software development environment with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | ✓ | ✓ | MIT |
| Multichallenge | <a href='resources_servers/multichallenge/configs/multichallenge.yaml'>multichallenge.yaml</a> | knowledge | - | MultiChallenge benchmark evaluation with LLM judge | - | ✓ | - | TBD |
| Multichallenge | <a href='resources_servers/multichallenge/configs/multichallenge_nrl.yaml'>multichallenge_nrl.yaml</a> | knowledge | - | MultiChallenge benchmark evaluation with LLM judge | - | ✓ | - | TBD |
@@ -27,7 +27,7 @@ equivalence_llm_judge:
response_extract_regex: null

# Swap check: Run second judge pass with swapped expected/generated to detect positional bias
check_twice_swap: true
check_twice_swap: false
# Reward when the second (swap) pass fails; default 0.0, can be -1.0
reward_if_swap_fails: 0.0

@@ -47,15 +47,15 @@ equivalence_llm_judge:
# [NEW] Skip regex extraction when expected_answer length exceeds this threshold.
# When skipped, the full generation is shown to judge instead of extracting.
# Only applies when per-record regex is present. Set to null to disable.
extraction_length_threshold: 120
extraction_length_threshold: null

# [NEW] If true, when first pass fails, retry with full generation (no regex) for partial credit.
# Helps recover from regex extraction failures. Only activates when per-record regex exists.
check_full_generation_on_fail: true
check_full_generation_on_fail: false

# [NEW] Reward when full generation check succeeds after first pass fails.
# Default 0.5 (partial credit). Set to 1.0 for full credit or 0.0 to ignore.
reward_if_full_generation_succeeds: 0.5
reward_if_full_generation_succeeds: 0.0
domain: knowledge
verified: false
description: Short answer questions with LLM-as-a-judge
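The swap check and fallback rewards configured above can be sketched as a small scoring routine. This is an illustration, not the server's actual code: `judge_equivalent` is a hypothetical stand-in for the LLM judge call, and the parameter names mirror the config keys.

```python
def score_with_swap_check(judge_equivalent, expected, generated,
                          check_twice_swap=False, reward_if_swap_fails=0.0):
    """Score a generation, optionally re-judging with swapped operands."""
    # First pass: expected vs. generated.
    if not judge_equivalent(expected, generated):
        return 0.0
    if check_twice_swap:
        # Second pass with operands swapped, to catch positional bias:
        # a judge that agrees only in one direction is unreliable.
        if not judge_equivalent(generated, expected):
            return reward_if_swap_fails
    return 1.0
```

With `check_twice_swap: false` (the value this PR settles on), only the first pass runs.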
92 changes: 86 additions & 6 deletions resources_servers/math_with_code/app.py
@@ -14,8 +14,8 @@
# limitations under the License.
import asyncio
import io
import json
import multiprocessing
import re
import signal
import time
from contextlib import redirect_stderr, redirect_stdout
@@ -224,13 +224,25 @@ async def verify(self, body: PythonMathVerifyRequest) -> PythonMathVerifyResponse
if content.type == "output_text":
text_content += content.text

# Extract boxed answer
match = re.search(r"\\boxed\{([^}]+)\}", text_content)
if match:
actual = match.group(1).strip()
actual = _extract_boxed_answer(text_content)
if actual is not None:
break

accuracy = str(actual) == str(expected)
# Fallback: the model may print the boxed answer inside executed code,
# so search tool output (stdout) as well.
if actual is None:
for output in reversed(body.response.output):
if output.type == "function_call_output":
try:
tool_resp = json.loads(output.output)
stdout_text = tool_resp.get("stdout", "")
except (json.JSONDecodeError, AttributeError):
stdout_text = output.output
actual = _extract_boxed_answer(stdout_text)
if actual is not None:
break

accuracy = _answers_match(actual, expected)
reward = 1.0 if accuracy else 0.0

return PythonMathVerifyResponse(
@@ -241,6 +253,74 @@ async def verify(self, body: PythonMathVerifyRequest) -> PythonMathVerifyResponse
)


def _normalize_answer(s: str) -> str:
"""Strip common math delimiters and whitespace for fair comparison.

Many expected_result values are wrapped in \\(...\\) or $...$,
but model answers inside \\boxed{} never include those wrappers.
Without this normalization, ~67% of answers can never match.
"""
s = s.strip()
if s.startswith("\\(") and s.endswith("\\)"):
s = s[2:-2].strip()
if s.startswith("$") and s.endswith("$") and len(s) > 1:
s = s[1:-1].strip()
if s.startswith("\\text{") and s.endswith("}"):
s = s[6:-1].strip()
s = " ".join(s.split())
return s


def _answers_match(actual: Optional[str], expected: str) -> bool:
"""Compare answers after normalization, with numeric fallback."""
if actual is None:
return False

norm_actual = _normalize_answer(actual)
norm_expected = _normalize_answer(expected)

# Exact match after normalization
if norm_actual == norm_expected:
return True

# Numeric fallback (handles "42" vs "42.0", "0.5" vs ".5", etc.)
try:
fa = float(norm_actual)
fe = float(norm_expected)
if abs(fa - fe) < 1e-6 * max(1.0, abs(fe)):
return True
except (ValueError, OverflowError):
pass

return False
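The numeric fallback in `_answers_match` tolerates formatting differences between numerically equal answers. A standalone sketch of just that comparison (the function name here is ours, not from the diff):

```python
def numeric_match(actual: str, expected: str, rel_tol: float = 1e-6) -> bool:
    """Return True when both strings parse as floats within a relative tolerance."""
    try:
        fa, fe = float(actual), float(expected)
    except (ValueError, OverflowError):
        return False
    # The tolerance scales with |expected| but never drops below rel_tol itself,
    # so answers near zero are not held to an impossibly tight bound.
    return abs(fa - fe) < rel_tol * max(1.0, abs(fe))
```

Non-numeric strings simply fail the `float()` parse and fall through to `False`, leaving the exact-match path as the only way symbolic answers can score.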


def _extract_boxed_answer(text: str) -> Optional[str]:
"""Extract the content inside the last \\boxed{...} in text.

Uses brace-depth tracking so nested braces are handled correctly
(e.g. \\boxed{\\frac{1}{2}} correctly returns '\\frac{1}{2}').
"""
marker = "\\boxed{"
# Use the last occurrence so chain-of-thought intermediate boxes are skipped.
idx = text.rfind(marker)
if idx == -1:
return None
start = idx + len(marker)
depth = 1
i = start
while i < len(text) and depth > 0:
if text[i] == "{":
depth += 1
elif text[i] == "}":
depth -= 1
i += 1
if depth != 0:
return None
result = text[start : i - 1].strip()
return result if result else None
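The brace-depth parsing behavior can be checked in isolation; this is a compact standalone copy of the same logic for illustration:

```python
from typing import Optional

def extract_boxed(text: str) -> Optional[str]:
    """Return the content of the last \\boxed{...}, tracking brace depth."""
    marker = "\\boxed{"
    idx = text.rfind(marker)          # last occurrence skips intermediate boxes
    if idx == -1:
        return None
    start = idx + len(marker)
    depth, i = 1, start
    while i < len(text) and depth > 0:
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
        i += 1
    if depth != 0:                    # unbalanced braces: treat as no answer
        return None
    return text[start : i - 1].strip() or None
```

Unlike the previous regex `\\boxed\{([^}]+)\}`, which stopped at the first `}`, this correctly recovers nested expressions such as `\frac{1}{2}`.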


def _get_last_expr_value(code: str, globals_dict: dict, locals_dict: dict):
"""
Replicates the behaviour that used to live inside execute_python:
@@ -27,3 +27,4 @@ math_with_code_simple_agent:
- name: example
type: example
jsonl_fpath: resources_servers/math_with_code/data/example.jsonl
max_steps: 3 # Tighter cap to reduce long-tail rollout stragglers
26 changes: 21 additions & 5 deletions resources_servers/math_with_judge/app.py
@@ -157,9 +157,8 @@ async def _verify_answer(
if not self.config.should_use_judge or library_reward > 0.5:
return library_reward, extracted_answer, library_reward, None

judge_reward, judge_evaluations = await self._verify_answer_with_judge(
question, expected_answer, generated_answer
)
judge_answer = extracted_answer if extracted_answer else generated_answer
judge_reward, judge_evaluations = await self._verify_answer_with_judge(question, expected_answer, judge_answer)
return judge_reward, extracted_answer, library_reward, judge_evaluations

@classmethod
@@ -172,11 +171,27 @@ def _mute_output(cls):
):
yield

@staticmethod
def _strip_math_delimiters(s: str) -> str:
"""Strip outer math delimiters from expected answers.

Many expected_answer values are wrapped in \\(...\\) or $...$,
which causes the math_verify parser to fail when we wrap them
in \\boxed{}. Removing these outer delimiters fixes parsing.
"""
s = s.strip()
if s.startswith("\\(") and s.endswith("\\)"):
s = s[2:-2].strip()
if s.startswith("$") and s.endswith("$") and len(s) > 1:
s = s[1:-1].strip()
return s
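A standalone copy of the stripping logic shows why the `\boxed{}` wrapping now parses; this mirrors the staticmethod above for illustration:

```python
def strip_math_delimiters(s: str) -> str:
    """Drop one layer of outer \\(...\\) or $...$ math delimiters."""
    s = s.strip()
    if s.startswith("\\(") and s.endswith("\\)"):
        s = s[2:-2].strip()
    if s.startswith("$") and s.endswith("$") and len(s) > 1:
        s = s[1:-1].strip()
    return s

# The ground truth handed to the library verifier then becomes e.g.
# "\\boxed{\\frac{1}{2}}" rather than "\\boxed{\\( \\frac{1}{2} \\)}",
# which the math_verify parser rejected.
```

The `len(s) > 1` guard keeps a lone `$` (a literal dollar-sign answer) from being stripped to an empty string.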

def _verify_answer_with_library(self, expected_answer: str, generated_answer: str) -> tuple[float, Optional[str]]:
# This functionality is migrated from Nemo RL.
# https://github.com/NVIDIA-NeMo/RL/blob/e1f56c42ae175d3863ccaf4e21b7de7e9c46c2e1/nemo_rl/environments/math_environment.py
try:
ground_truth_parsable = "\\boxed{" + expected_answer + "}"
stripped = self._strip_math_delimiters(expected_answer)
ground_truth_parsable = "\\boxed{" + stripped + "}"
with self._mute_output():
ret_score, extracted_answer = self._library_verifier([ground_truth_parsable], [generated_answer])

@@ -197,7 +212,7 @@ def _verify_answer_with_library(self, expected_answer: str, generated_answer: str
# If no match is found, that means all the answers are
# incorrect. The first prediction is used as the extracted
# answer.
extracted_answer = extracted_prediction[0]
extracted_answer = extracted_prediction[0] if extracted_prediction else None

return reward, extracted_answer

@@ -235,6 +250,7 @@ async def _generate_judge_evaluation(
) -> tuple[bool, JudgeEvaluation]:
config = self.config
responses_create_params = config.judge_responses_create_params.model_copy(deep=True)

judge_prompt = self.JUDGE_PROMPT_TEMPLATE.format(
question=question, first_answer=first_answer, second_answer=second_answer
)
6 changes: 3 additions & 3 deletions resources_servers/math_with_judge/tests/test_app.py
@@ -159,7 +159,7 @@ async def test_verify(self, config: LibraryJudgeMathResourcesServerConfig) -> None
judge_evaluations[0],
question,
expected_answer,
first_part,
not_equal_verify_response.extracted_answer,
{},
"verify_not_equal_id",
not_equal_item,
@@ -293,15 +293,15 @@ async def test_verify_answer(self, config: LibraryJudgeMathResourcesServerConfig
judge_equal_judge_evaluations[0],
judge_equal_question,
judge_equal_expected_answer,
judge_equal_generated_answer,
judge_equal_extracted_answer,
{},
"verify_answer_first_judge_equal_id",
first_judge_equal_item,
)
self._check_judge_evaluation(
judge_equal_judge_evaluations[1],
judge_equal_question,
judge_equal_generated_answer,
judge_equal_extracted_answer,
judge_equal_expected_answer,
{},
"verify_answer_second_judge_equal_id",
14 changes: 7 additions & 7 deletions resources_servers/mcqa/app.py
@@ -35,7 +35,7 @@ class MCQARunRequest(BaseRunRequest):
# Preferred dataset format: top-level `metadata` carries arbitrary data and
# is not interpreted by the verifier. Only the fields below are used for
# grading.
options: Optional[list[dict[str, str]]] = None
options: Optional[list[dict[str, Optional[str]]]] = None
expected_answer: Optional[str] = None
# Optional additional metadata for the request; if provided, may contain
# fields like options/expected_answer as an alternative location.
@@ -140,9 +140,9 @@ def _match_option_text(text: str, options: list[dict[str, str]], allowed_letters
normalized_options: list[tuple[str, str]] = []
for entry in options or []:
for k, v in entry.items():
if isinstance(k, str) and len(k) == 1 and k.upper() in allowed_letters:
# Skip null values and only include valid letter keys with string values
if v is not None and isinstance(k, str) and len(k) == 1 and k.upper() in allowed_letters:
normalized_options.append((k.upper(), _normalize_for_match(v)))
break

matched_letters: set[str] = set()
for cand in normalized_candidates:
@@ -193,7 +193,7 @@ def _parse_answer_with_custom_regex(
normalized_captured = _normalize_for_match(captured)
for entry in options or []:
for k, v in entry.items():
if k.upper() in allowed_letters and _normalize_for_match(v) == normalized_captured:
if v is not None and k.upper() in allowed_letters and _normalize_for_match(v) == normalized_captured:
return k.upper()

return None
@@ -276,10 +276,10 @@ def _get_allowed_letters_from_options(
letters: set[str] = set()
if options:
for entry in options:
for k in entry.keys():
if isinstance(k, str) and len(k) == 1 and k.isalpha():
# Exclude null values
for k, v in entry.items():
if isinstance(k, str) and len(k) == 1 and k.isalpha() and v is not None:
letters.add(k.upper())
break
return letters
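The null-filtering above can be exercised in isolation; this sketch reproduces the letter-collection loop from the new version of the function:

```python
def allowed_letters(options):
    """Collect option letters, skipping entries whose value is null."""
    letters = set()
    for entry in options or []:
        for k, v in entry.items():
            # Only single alphabetic keys with non-null values count.
            if isinstance(k, str) and len(k) == 1 and k.isalpha() and v is not None:
                letters.add(k.upper())
    return letters
```

Dropping the old `break` also means every key in a multi-key entry is now inspected, not just the first.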


6 changes: 6 additions & 0 deletions resources_servers/mcqa/configs/mcqa.yaml
@@ -27,6 +27,12 @@ mcqa_simple_agent:
huggingface_identifier:
repo_id: nvidia/Nemotron-RL-knowledge-mcqa
license: Apache 2.0
- name: validation
type: validation
jsonl_fpath: resources_servers/mcqa/data/validation.jsonl
huggingface_identifier:
repo_id: nvidia/Nemotron-RL-knowledge-mcqa
license: Apache 2.0
- name: example
type: example
jsonl_fpath: resources_servers/mcqa/data/example.jsonl