From 89d4ff08e2ce0945b6d2b215c0bb9ca8facd2fed Mon Sep 17 00:00:00 2001
From: Carl Hvarfner
Date: Fri, 19 Dec 2025 03:25:48 -0800
Subject: [PATCH] optimization_trace to return noiseless function_values
 instead of noisy

Summary: Change the optimization trace to return the underlying noiseless
values instead of the noisy observed values.

Differential Revision: D89407340

Privacy Context Container: L1307644
---
 ax/benchmark/benchmark.py                   | 21 +++++++++++----------
 ax/benchmark/benchmark_runner.py            |  1 -
 ax/benchmark/benchmark_trial_metadata.py    | 14 ++++++++------
 ax/benchmark/tests/test_benchmark_metric.py |  2 ++
 ax/benchmark/tests/test_benchmark_runner.py |  1 +
 ax/service/utils/best_point.py              |  5 ++++-
 6 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py
index 959d2f07390..2436bdc381a 100644
--- a/ax/benchmark/benchmark.py
+++ b/ax/benchmark/benchmark.py
@@ -806,7 +806,7 @@ def get_opt_trace_by_steps(experiment: Experiment) -> npt.NDArray:
             "Cumulative epochs not supported for problems with outcome constraints."
         )
 
-    objective_name = optimization_config.objective.metric.name
+    objective_name: str = optimization_config.objective.metric.name
     data = assert_is_instance(experiment.lookup_data(), MapData)
     map_df = data.map_df
 
@@ -815,39 +815,40 @@ def get_opt_trace_by_steps(experiment: Experiment) -> npt.NDArray:
     # to know which actually ran
     def _get_df(trial: Trial) -> pd.DataFrame:
         """
-        Get the (virtual) time each epoch finished at.
+        Get the (virtual) time each epoch finished at, along with the ground
+        truth values (Y_true).
         """
         metadata = trial.run_metadata["benchmark_metadata"]
         backend_simulator = none_throws(metadata.backend_simulator)
-        # Data for the first metric, which is the only metric
-        df = next(iter(metadata.dfs.values()))
+        # Get the DataFrame for the objective metric
+        df = metadata.dfs[objective_name].copy()
         start_time = backend_simulator.get_sim_trial_by_index(
             trial.index
         ).sim_start_time
         df["time"] = df["virtual runtime"] + start_time
         return df
 
-    with_timestamps = pd.concat(
+    with_timestamps_and_y_true = pd.concat(
         (
             _get_df(trial=assert_is_instance(trial, Trial))
             for trial in experiment.trials.values()
         ),
         axis=0,
         ignore_index=True,
-    )[["trial_index", MAP_KEY, "time"]]
+    )[["trial_index", MAP_KEY, "time", "Y_true"]]
 
     df = (
         map_df.loc[
             map_df["metric_name"] == objective_name,
-            ["trial_index", "arm_name", "mean", MAP_KEY],
+            ["trial_index", "arm_name", MAP_KEY],
         ]
-        .merge(with_timestamps, how="left")
+        .merge(with_timestamps_and_y_true, how="left")
         .sort_values("time", ignore_index=True)
     )
     return (
-        df["mean"].cummin()
+        df["Y_true"].cummin()
         if optimization_config.objective.minimize
-        else df["mean"].cummax()
+        else df["Y_true"].cummax()
     ).to_numpy()
 
 
diff --git a/ax/benchmark/benchmark_runner.py b/ax/benchmark/benchmark_runner.py
index 765d413452f..9a0c4152309 100644
--- a/ax/benchmark/benchmark_runner.py
+++ b/ax/benchmark/benchmark_runner.py
@@ -303,7 +303,6 @@ def run(self, trial: BaseTrial) -> dict[str, BenchmarkTrialMetadata]:
             df=df, noise_stds=self.get_noise_stds(), arm_weights=arm_weights
         )
         df["trial_index"] = trial.index
-        df.drop(columns=["Y_true"], inplace=True)
         df["metric_signature"] = df["metric_name"]
 
         if self.simulated_backend_runner is not None:
diff --git a/ax/benchmark/benchmark_trial_metadata.py b/ax/benchmark/benchmark_trial_metadata.py
index 52b3db216d7..db5b1898d0a 100644
--- a/ax/benchmark/benchmark_trial_metadata.py
+++ b/ax/benchmark/benchmark_trial_metadata.py
@@ -20,12 +20,14 @@ class BenchmarkTrialMetadata:
 
     Args:
         df: A dict mapping each metric name to a Pandas DataFrame with columns
-            ["metric_name", "arm_name", "mean", "sem", and "step"]. The "sem" is
-            always present in this df even if noise levels are unobserved;
-            ``BenchmarkMetric`` and ``BenchmarkMapMetric`` hide that data if it
-            should not be observed, and ``BenchmarkMapMetric``s drop data from
-            time periods that that are not observed based on the (simulated)
-            trial progression.
+            ["metric_name", "arm_name", "mean", "sem", "Y_true", and "step"]. The
+            "sem" is always present in this df even if noise levels are
+            unobserved; ``BenchmarkMetric`` and ``BenchmarkMapMetric`` hide that
+            data if it should not be observed, and ``BenchmarkMapMetric``s drop
+            data from time periods that are not observed based on the
+            (simulated) trial progression. The "Y_true" column contains the
+            ground-truth (noiseless) values, which are used for computing the
+            optimization trace.
         backend_simulator: Optionally, the backend simulator that is tracking
             the trial's status.
     """
diff --git a/ax/benchmark/tests/test_benchmark_metric.py b/ax/benchmark/tests/test_benchmark_metric.py
index 4d4bedf8ceb..e6521c99d9a 100644
--- a/ax/benchmark/tests/test_benchmark_metric.py
+++ b/ax/benchmark/tests/test_benchmark_metric.py
@@ -47,6 +47,7 @@ def _get_one_step_df(
             "metric_name": metric_name,
             "mean": [1.0, 2.5] if batch else [1.0],
             "sem": sem,
+            "Y_true": [0.9, 2.4] if batch else [0.9],
             "trial_index": 0,
             "step": step,
             "virtual runtime": step,
@@ -59,6 +60,7 @@ def _get_one_step_df(
             "metric_name": metric_name,
             "mean": [0.5, 1.5] if batch else [0.5],
             "sem": sem,
+            "Y_true": [0.4, 1.4] if batch else [0.4],
             "trial_index": 0,
             "step": step,
             "virtual runtime": step,
diff --git a/ax/benchmark/tests/test_benchmark_runner.py b/ax/benchmark/tests/test_benchmark_runner.py
index add8ae20e39..81d3247b09f 100644
--- a/ax/benchmark/tests/test_benchmark_runner.py
+++ b/ax/benchmark/tests/test_benchmark_runner.py
@@ -367,6 +367,7 @@ def test_heterogeneous_noise(self) -> None:
                     "metric_signature",
                     "mean",
                     "sem",
+                    "Y_true",
                     "trial_index",
                     "step",
                     "virtual runtime",
diff --git a/ax/service/utils/best_point.py b/ax/service/utils/best_point.py
index 11c2394fc45..2733089e19b 100644
--- a/ax/service/utils/best_point.py
+++ b/ax/service/utils/best_point.py
@@ -844,9 +844,12 @@ def _prepare_data_for_trace(
 
     # Transform to a DataFrame with columns ["trial_index", "arm_name"] +
     # relevant metric names, and values being means.
+    # Use "Y_true" (ground truth) if available (benchmarking context);
+    # otherwise fall back to "mean" (production context).
+    value_col = "Y_true" if "Y_true" in df.columns else "mean"
     df_wide = (
         df[df["metric_name"].isin(metrics)]
-        .set_index(["trial_index", "arm_name", "metric_name"])["mean"]
+        .set_index(["trial_index", "arm_name", "metric_name"])[value_col]
         .unstack(level="metric_name")
     )
     missing_metrics = [
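
Reviewer note (illustrative, not part of the patch): below is a minimal
sketch of the new trace semantics on a hypothetical toy DataFrame. The
column names match the patch; the values, arm names, and the `minimize`
flag are made up for illustration.

    import pandas as pd

    # Toy long-format results; "mean" is the noisy observation and
    # "Y_true" the noiseless ground truth (values are hypothetical).
    df = pd.DataFrame(
        {
            "trial_index": [0, 1, 2],
            "arm_name": ["0_0", "1_0", "2_0"],
            "metric_name": ["objective"] * 3,
            "mean": [1.1, 0.4, 0.7],
            "Y_true": [1.0, 0.5, 0.6],
        }
    )

    # Mirrors the fallback added in best_point.py: prefer the noiseless
    # column when it exists (benchmarking), else the observed means.
    value_col = "Y_true" if "Y_true" in df.columns else "mean"

    # Mirrors the trace computation in benchmark.py: running best-so-far
    # value, cummin when minimizing, cummax when maximizing.
    minimize = True
    trace = (
        df[value_col].cummin() if minimize else df[value_col].cummax()
    ).to_numpy()
    print(trace)  # [1.  0.5 0.5]: the noisy 0.4 at trial 1 no longer leaks in

Under the old behavior the trace would have been the cummin of "mean"
([1.1, 0.4, 0.4]), so a lucky noise draw could make an optimizer look
better than it is; tracking "Y_true" removes that bias from benchmark
results while leaving production (no "Y_true" column) unchanged.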