
Commit f00a538

fix: add a switch for ensemble_time_upper_bound and fix some bug in main (#1226)
* change runner prompts
* v1
* ensemble_time_upper_bound
* lint
* fix inf bug in function prob_dis_torch() and ensemble prompts
* lint
* lint

Co-authored-by: amstrongzyf <[email protected]>
1 parent c4b8baa commit f00a538

File tree

4 files changed: +19 / -16 lines changed


rdagent/app/data_science/conf.py

Lines changed: 2 additions & 0 deletions
@@ -176,6 +176,8 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     #### Task Generate related
     fix_seed_and_data_split: bool = False

+    ensemble_time_upper_bound: bool = False
+

 DS_RD_SETTING = DataScienceBasePropSetting()

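The new option is a plain boolean on the settings class, defaulting to off so existing behavior is unchanged. A stdlib-only sketch of the same feature-switch pattern (the real project uses a pydantic-style settings base class, and the `ENSEMBLE_TIME_UPPER_BOUND` environment variable name here is hypothetical):

```python
import os
from dataclasses import dataclass, field


def _env_bool(name: str, default: bool) -> bool:
    """Parse a boolean feature flag from the environment (illustrative helper only)."""
    raw = os.environ.get(name)
    return default if raw is None else raw.strip().lower() in {"1", "true", "yes"}


@dataclass
class DataScienceSettingsSketch:
    # Mirrors the new switch: defaults to False so nothing changes
    # unless the flag is explicitly enabled.
    ensemble_time_upper_bound: bool = field(
        default_factory=lambda: _env_bool("ENSEMBLE_TIME_UPPER_BOUND", False)
    )
```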
rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 10 additions & 7 deletions
@@ -539,28 +539,31 @@ hypothesis_select:
 - **Time Limit Guidance**
 {% if time_max < 0 %}
 - Initial Case: runtime info unavailable, keep most hypotheses if component is Ensemble.
-- Remove only those clearly excessive (e.g., > {{ full_time }} hours) or overly complex.
 {% elif time_max >= full_time * 0.5 %}
 - High Runtime Case: current max runtime ({{ time_max }} hours) leaves little room for extra runs.
 - Avoid high-fold or heavy ensembles.
 - Maximum recommended folds: {{ (full_time // time_max) | int }}
-- Remove hypotheses clearly excessive (> {{ full_time }} hours)
 {% else %}
 - Low Runtime Case: current max runtime ({{ time_max }} hours) is far from the time limit.
 - Prefer hypotheses with runtimes ≤ {{ full_time }} hours.
 - Hypotheses slightly above {{ time_max }} hours can be retained only with strong justification.
 {% endif %}

 ### Ensemble Model Core Principle in Low Runtime Case
-Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
-Please note: you are operating under a time budget dedicated to ensemble training of {{res_time}} seconds, and the maximum allowed time is {{full_time}} seconds.
-
-Please take the remaining {{res_time}} seconds to carefully consider and design the most reasonable and optimal ensemble models based on your current progress.
+Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
+These are examples:
+
+Example 1:
 Assume training a single model takes about 1 hour. For example, if you have roughly twice that time left, you can try training multiple models with different random seeds or data splits to reuse time effectively.
 If you have more time, you might consider training a multi-fold ensemble. Use your judgment to decide how many folds or seeds fit within your remaining time budget.
+
+Example 2:
+Assume training a single fold of a model takes at most {{ time_max }} hours. Within your remaining time budget, prioritize training multiple folds of the same model rather than trying many different models.
+For instance, if you have roughly 2 × {{ time_max }} hours left, you could train 2 folds of the same model with different data splits or random seeds.
+If more time is available, you might consider increasing the number of folds further. Use your judgment to decide how many folds fit within the remaining time budget while respecting the time_max constraint for a single fold.

 ### 2. Training-Time Resource Allocation
-- You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within runtime limits**.
+- You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within remaining time budget**.
 - Avoid reducing base model quality just to save time. For example:
   - Freezing large parts of the model (e.g., embeddings)
   - Using only embedding-level regression instead of full modeling
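The fold recommendation in the prompt template is plain integer division of the overall time budget by the slowest single run. A small sketch of that arithmetic (the function name is ours, not part of the template):

```python
def max_recommended_folds(full_time: float, time_max: float) -> int:
    # Mirrors the template expression {{ (full_time // time_max) | int }}:
    # how many runs of length time_max fit inside the full time budget.
    return int(full_time // time_max)
```

For instance, a 12-hour budget with a 5-hour maximum single runtime yields a recommendation of at most 2 folds.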

rdagent/scenarios/data_science/proposal/exp_gen/proposal.py

Lines changed: 1 addition & 0 deletions
@@ -963,6 +963,7 @@ def _prob_dis_torch(
     alpha, beta = 1.0, 0
     gamma = math.log(2) / 30
     logits = alpha * sim_matrix * math.exp(-gamma * path_length) + beta * torch.tanh(score_diff_matrix)
+    logits = torch.clamp(logits, min=-2, max=2)
     probs = torch.softmax(logits, dim=1)

     num_candidates = probs.size(-1)
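The one-line fix bounds the logits before the softmax. A plain-Python sketch (our illustration, not the project's code) of why this matters: a single non-finite logit poisons the entire softmax row with NaN, while clamping first keeps every probability finite.

```python
import math


def softmax(row):
    # Numerically standard softmax: shift by the max, exponentiate, normalize.
    # If the row contains inf, the shift produces inf - inf = nan, and that
    # NaN propagates through exp() and the normalizing sum to every entry.
    m = max(row)
    exps = [math.exp(x - m) for x in row]
    total = sum(exps)
    return [e / total for e in exps]


def clamp(x, lo=-2.0, hi=2.0):
    # Same bounds as the fix: torch.clamp(logits, min=-2, max=2).
    return max(lo, min(hi, x))


bad_logits = [1.0, float("inf"), -1.0]
poisoned = softmax(bad_logits)                      # every entry is NaN
repaired = softmax([clamp(x) for x in bad_logits])  # finite, sums to 1
```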

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 6 additions & 9 deletions
@@ -156,16 +156,13 @@ def recommend_debug_timeout(self):
         return DS_RD_SETTING.debug_recommend_timeout

     def real_full_timeout(self):
-        remain_time = RD_Agent_TIMER_wrapper.timer.remain_time()
-        all_duration = RD_Agent_TIMER_wrapper.timer.all_duration
-        remain_percent = remain_time / all_duration
+        if DS_RD_SETTING.ensemble_time_upper_bound:
+            remain_time = RD_Agent_TIMER_wrapper.timer.remain_time()
+            all_duration = RD_Agent_TIMER_wrapper.timer.all_duration
+            remain_percent = remain_time / all_duration
+            if remain_percent * 100 < 100 - DS_RD_SETTING.ratio_merge_or_ensemble:
+                return DS_RD_SETTING.full_timeout * DS_RD_SETTING.runner_longer_timeout_multiplier_upper

-        if remain_percent * 100 < 100 - DS_RD_SETTING.ratio_merge_or_ensemble:
-            return DS_RD_SETTING.full_timeout * DS_RD_SETTING.runner_longer_timeout_multiplier_upper
-
-        # Every 'patience' failures, move to next timeout level
-        # Each level adds 'runner_timeout_increase_stage' multiplier to timeout
-        # Capped by upper limit to prevent infinite growth
         return (
             DS_RD_SETTING.full_timeout
             * min(
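Pulled out of its class, the gated control flow can be sketched as a pure function. Parameter names are simplified here; the real method reads `DS_RD_SETTING` and a global timer, and since the tail of the `min(...)` expression is truncated in this diff, the staged-multiplier fallback below is an assumption based on the deleted comments:

```python
def real_full_timeout_sketch(
    full_timeout: float,
    remain_time: float,
    all_duration: float,
    ensemble_time_upper_bound: bool,
    ratio_merge_or_ensemble: float,
    upper_multiplier: float,
    staged_multiplier: float,
) -> float:
    # Only when the new switch is on: once the run is past the
    # merge/ensemble cutoff, grant the longer upper-bound timeout.
    if ensemble_time_upper_bound:
        remain_percent = remain_time / all_duration
        if remain_percent * 100 < 100 - ratio_merge_or_ensemble:
            return full_timeout * upper_multiplier
    # Fallback: staged timeout growth, capped by the upper multiplier
    # (assumed from the comments deleted in this diff).
    return full_timeout * min(staged_multiplier, upper_multiplier)
```

With the switch off, the function always takes the fallback path, which matches the commit's intent of making the upper-bound behavior opt-in.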
