microsoft
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎rdagent/app/data_science/conf.py‎
Lines changed: 5 additions & 1 deletion b/‎rdagent/app/data_science/conf.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎rdagent/scenarios/data_science/dev/runner/prompts.yaml‎
Lines changed: 4 additions & 2 deletions b/‎rdagent/scenarios/data_science/dev/runner/prompts.yaml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml‎
Lines changed: 13 additions & 19 deletions b/‎rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml‎
Lines changed: 13 additions & 19 deletions
diff --git a/‎rdagent/scenarios/data_science/proposal/exp_gen/proposal.py‎
Lines changed: 35 additions & 52 deletions b/‎rdagent/scenarios/data_science/proposal/exp_gen/proposal.py‎
Lines changed: 35 additions & 52 deletions
@@ -112,6 +112,7 @@ docs = {file = ["requirements/docs.txt"]}
 lint = {file = ["requirements/lint.txt"]}
 package = {file = ["requirements/package.txt"]}
 test = {file = ["requirements/test.txt"]}
+torch = {file = ["requirements/torch.txt"]} # Some agent algorithms need torch; install it with: pip install "rdagent[torch]"
 
 [tool.setuptools_scm]
 local_scheme = "no-local-version"
 
@@ -145,7 +145,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     coder_longer_timeout_multiplier_upper: int = 3
     runner_longer_timeout_multiplier_upper: int = 2
     coder_timeout_increase_stage: float = 0.3
-    runner_timeout_increase_stage: float = 0.15
+    runner_timeout_increase_stage: float = 0.3
+    runner_timeout_increase_stage_patience: int = 2
+    """Number of failures tolerated before escalating to next timeout level (stage width). Every 'patience' failures, timeout increases by 'runner_timeout_increase_stage'"""
     show_hard_limit: bool = True
 
     #### enable runner code change summary
@@ -174,6 +176,8 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     #### Task Generate related
     fix_seed_and_data_split: bool = False
 
+    ensemble_time_upper_bound: bool = False
+
 
 DS_RD_SETTING = DataScienceBasePropSetting()
 
 
@@ -34,16 +34,18 @@ DSCoSTEER_eval:
     For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
     You should also notice other resources utilization hyper-parameters.
     For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
-
+    When choosing what to tune, prioritize adjustments to the batch size and the number of epochs. If further tuning is needed, consider parameters with a significant impact on performance, such as the learning rate and the number of model folds. For CV competitions, also consider the image size (imgsize); for NLP competitions, consider the maximum sequence length (maxlen), as these can have a substantial impact on results.
     ## Evaluation Guidelines
     1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
     2. The code must apply early stopping strategy already (in order to prevent overfitting).
     3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence.  If there are no obvious and impactful opportunities and the code runs well, please accept it.
     4. Only include the suggestions in your response without leak any time limit information because the user might over-fit the model to the time limit.
     5. Never make your judgment only based on the time spent, you should also consider the code and the stdout.
+
     If the code satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to true.
-    - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
+    - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] Early stopping halted training while the validation loss was still decreasing. This suggests the patience parameter may be too small.
+      [Suggestion] Increase the early stopping patience to allow more training epochs before stopping, which can further improve model performance."
     If the code does not satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to false.
     - Set "hyperparameter_tuning_suggestion" to an empty string.
 
@@ -539,28 +539,31 @@ hypothesis_select:
     - **Time Limit Guidance**
       {% if time_max < 0 %}
       - Initial Case: runtime info unavailable, keep most hypotheses if component is Ensemble.
-      - Remove only those clearly excessive (e.g., > {{ full_time }} hours) or overly complex.
       {% elif time_max >= full_time * 0.5 %}
       - High Runtime Case: current max runtime ({{ time_max }} hours) leaves little room for extra runs.
       - Avoid high-fold or heavy ensembles.
       - Maximum recommended folds: {{ (full_time // time_max) | int }}
-      - Remove hypotheses clearly excessive (> {{ full_time }} hours)
       {% else %}
       - Low Runtime Case: current max runtime ({{ time_max }} hours) is far from the time limit.
       - Prefer hypotheses with runtimes ≤ {{ full_time }} hours.
       - Hypotheses slightly above {{ time_max }} hours can be retained only with strong justification.
       {% endif %}
     
     ### Ensemble Model Core Principle in Low Runtime Case
-    Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
-    Please note: you are operating under a time budget dedicated to ensemble training of {{res_time}} seconds, and the maximum allowed time is {{full_time}} seconds.
-  
-    Please take the remaining {{res_time}} seconds to carefully consider and design the most reasonable and optimal ensemble models based on your current progress.
+    Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
+    The following two examples illustrate how to budget the remaining time:
+    
+    Example 1:
     Assume training a single model takes about 1 hour. For example, if you have roughly twice that time left, you can try training multiple models with different random seeds or data splits to reuse time effectively.
     If you have more time, you might consider training a multi-fold ensemble. Use your judgment to decide how many folds or seeds fit within your remaining time budget.
+    
+    Example 2:
+    Assume training a single fold of a model takes at most {{ time_max }} hours. Within your remaining time budget, prioritize training multiple folds of the same model rather than trying many different models.
+    For instance, if you have roughly 2 × {{ time_max }} hours left, you could train 2 folds of the same model with different data splits or random seeds.
+    If more time is available, you might consider increasing the number of folds further. Use your judgment to decide how many folds fit within the remaining time budget, assuming each fold takes up to {{ time_max }} hours.
 
     ### 2. Training-Time Resource Allocation
-    - You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within runtime limits**.
+    - You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within the remaining time budget**.
     - Avoid reducing base model quality just to save time. For example:
       - Freezing large parts of the model (e.g., embeddings)
       - Using only embedding-level regression instead of full modeling
@@ -702,19 +705,10 @@ task_gen:
     10. File Handling & DataFrame Generation: Generate a pandas DataFrame with columns [“id”, “path”, “fold”].
       - id: a unique identifier for each sample.
       - path: the file path of the corresponding sample.
-      - split: indicates the assignment of each sample for data splitting. Two modes are supported:
-        - K-Fold (optional): assign integers 0, 1, …, K-1 for each fold.
-        - Train/Test Split (optional): assign "train" or "test" for each sample according to the split ratio (e.g., 8:2).
-        - Ensure reproducibility: the DataFrame must be generated exactly the same way every time the script runs, e.g., by fixing the random seed 42.
-      Data Splitting: use this DataFrame to perform dataset splitting, selecting samples for training and testing based on the fold column.
-    11. Random Seed for Model Training:
-      - If training neural networks, ensure the initial weights and all random operations use a fixed seed of 42 (e.g., torch.manual_seed(42), numpy.random.seed(42), random.seed(42)).
-      - If training machine learning models such as LightGBM, XGBoost, or scikit-learn estimators, absolutely ensure the random seed is fixed (e.g., `random_state=42`) to guarantee reproducibility.
-      - This is mandatory: all aspects of the experiment must be fully reproducible and aligned, including dataset splits and random seeds;
-      - For multi-fold training, use out-of-fold (OOF) predictions as validation scores and save them as an oof file.
-    12. Hypothesis Handling: At the initial stage, multiple hypotheses may be proposed simultaneously. If some hypotheses overlap, select the most promising one for implementation and ignore redundant overlapping hypotheses. Each implemented hypothesis should remain an independent task.
-    Ensure reproducibility: the DataFrame must be generated exactly the same way every time the script runs, regardless of system or runtime conditions (e.g., by fixing the random seed).
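+      - fold: the data-split assignment of each sample (e.g., the fold index), used to select samples when splitting the dataset.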
+    11. Hypothesis Handling: At the initial stage, multiple hypotheses may be proposed simultaneously. If some hypotheses overlap, select the most promising one for implementation and ignore redundant overlapping hypotheses. Each implemented hypothesis should remain an independent task.
     {% endif %}
+
     ## Package Declaration
     At the end of your design, **you MUST** provide a key `packages` in the final JSON output.  
     It should be an **array of PyPI package names** (strings) that you expect to `import` in the forthcoming implementation.  
 
@@ -606,7 +606,7 @@ def identify_problem(
                 all_problems[problem_name] = fb_problems[problem_name]
         return all_problems
 
-    @wait_retry(retry_n=5)
+    @wait_retry(retry_n=10)
     def hypothesis_gen(
         self,
         component_desc: str,
@@ -920,77 +920,56 @@ def select_hypothesis(
         )
         return index_to_pick_pool_list[reproducible_int]
 
-    # BEGIN: for support llm-based hypothesis selection  -----
-    def _cosine_similarity_matrix_numpy(self, A, B):
-        dot_products = np.matmul(A, B.T)
-        A_norms = np.linalg.norm(A, axis=1, keepdims=True)
-        B_norms = np.linalg.norm(B, axis=1, keepdims=True).T
-        return dot_products / (A_norms * B_norms)
-
-    def _gumbel_softmax_hard_sample(self, logits, tau=1.0, n_samples=1):
-
-        gumbel_noise = -np.log(-np.log(np.random.uniform(size=logits.shape) + 1e-20) + 1e-20)
-        y = (logits + gumbel_noise) / tau
-        # softmax
-        y_soft = np.exp(y - np.max(y, axis=1, keepdims=True))
-        y_soft = y_soft / np.sum(y_soft, axis=1, keepdims=True)
+    def _cosine_similarity_matrix_torch(self, A, B):
+        import torch
 
-        sampled_indices = []
-        for i in range(y_soft.shape[0]):
-            choices = np.arange(y_soft.shape[1])
-            idx = np.random.choice(choices, size=n_samples, replace=False, p=y_soft[i])
-            sampled_indices.append(idx)
-        sampled_indices = np.unique(np.concatenate(sampled_indices))
-        return sampled_indices.tolist()
+        dot_products = torch.matmul(A, B.T)
+        A_norms = torch.norm(A, dim=1, keepdim=True)
+        B_norms = torch.norm(B, dim=1, keepdim=True).T
+        return dot_products / (A_norms * B_norms)
 
-    def _prob_dis(
+    def _prob_dis_torch(
         self,
         current_sota_score_in_current_trace,
         extra_hypo_l: list[tuple[DSHypothesis, float]],
         hypothesis_candidates,
         competition,
         path_length,
     ):
-        # TODO: typing
+        import torch
+
         history_hypo_str, history_scores = [], []
         for hypo, score in extra_hypo_l:
             history_hypo_str.append(hypo.hypothesis)
             history_scores.append(score)
 
         target_texts = [v["hypothesis"] for v in hypothesis_candidates.values()]
-        target_embs = np.array(APIBackend().create_embedding(target_texts), dtype=np.float32)
+        target_embs = torch.tensor(APIBackend().create_embedding(target_texts), dtype=torch.float32)
 
         if not history_hypo_str:
             return []
-        history_embs = np.array(APIBackend().create_embedding(history_hypo_str), dtype=np.float32)
-        # TODO: Here is an example to help understand the code:(Please check the correctness of the comment
-        # history_embs: numpy.ndarray of shape (N, D) where N is the number of historical hypotheses
-        # and D is the embedding dimension returned by APIBackend().create_embedding.
-        # It contains vector representations of each hypothesis string in history_hypo_str,
-        # used for computing similarity with target embeddings.
-        # Example: if history_hypo_str = ["Try RandomForest with 200 estimators", "Use LightGBM with early stopping"]
-        # and embedding dimension D=3, history_embs might be:
-        # array([[ 0.123, -0.456,  0.789],
-        #        [ 0.234,  0.567, -0.890]], dtype=float32)
-        sim_matrix = self._cosine_similarity_matrix_numpy(target_embs, history_embs)
-        candidate_scores = np.full((len(target_texts), 1), current_sota_score_in_current_trace, dtype=np.float32)
-        history_scores = np.array(history_scores, dtype=np.float32).reshape(1, -1)
+        history_embs = torch.tensor(APIBackend().create_embedding(history_hypo_str), dtype=torch.float32)
+        sim_matrix = self._cosine_similarity_matrix_torch(target_embs, history_embs)
+        candidate_scores = [current_sota_score_in_current_trace for i in range(len(target_texts))]
+        candidate_scores = torch.tensor(candidate_scores, dtype=torch.float32).unsqueeze(1)
+        history_scores = torch.tensor(history_scores, dtype=torch.float32).unsqueeze(0)
         bigger_is_better = get_metric_direction(competition)
         if bigger_is_better:
             score_diff_matrix = history_scores - candidate_scores
         else:
             score_diff_matrix = candidate_scores - history_scores
         alpha, beta = 1.0, 1.0
-        if current_sota_score_in_current_trace == -1:  # FIXME: less magic number;
+        if current_sota_score_in_current_trace == -1:
             alpha, beta = 1.0, 0
         gamma = math.log(2) / 30
-        logits = alpha * sim_matrix * math.exp(-gamma * path_length) + beta * np.tanh(score_diff_matrix)
-        logits_max = np.max(logits, axis=1, keepdims=True)
-        exp_logits = np.exp(logits - logits_max)
-        probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
-        num_candidates = probs.shape[-1]
+        logits = alpha * sim_matrix * math.exp(-gamma * path_length) + beta * torch.tanh(score_diff_matrix)
+        logits = torch.clamp(logits, min=-2, max=2)
+        probs = torch.softmax(logits, dim=1)
+
+        num_candidates = probs.size(-1)
         n_samples = min(2, num_candidates)
-        flat_indices = self._gumbel_softmax_hard_sample(np.log(probs + 1e-20), tau=0.01, n_samples=n_samples)
+        sampled_indices = torch.multinomial(probs, num_samples=n_samples).squeeze(1)
+        flat_indices = sampled_indices.flatten().unique().tolist()
         if bigger_is_better:
             best_idx = history_scores[0].argmax().item()
             best_entry = (history_hypo_str[best_idx], history_scores[0, best_idx])
@@ -1097,14 +1076,18 @@ def hypothesis_select_with_llm(
             if getattr(tr[1], "decision", False)
         ]
         time_max = max(time_list_success) / 3600
-        # sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None)----> V10 CODE VERSION
-        bvs = BestValidSelector()  # ----> V14 CODE VERSION
-        sota_exp = bvs.get_sota_exp_to_submit(trace)  # ----> V14 CODE VERSION
-        sota_flag = sota_exp is not None and sota_exp.result is not None  # ----> V14 CODE VERSION
+        sota_flag = (
+            hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None
+        )  # ----> V10 CODE VERSION
+        # bvs = BestValidSelector()  # ----> V14 CODE VERSION
+        # sota_exp = bvs.get_sota_exp_to_submit(trace)  # ----> V14 CODE VERSION
+        # sota_flag = sota_exp is not None and sota_exp.result is not None  # ----> V14 CODE VERSION
 
         if sota_flag:
-            current_sota_score = sota_exp.result.loc["ensemble"].iloc[0].round(3)  # ----> V14 CODE VERSION
-            # trace.sota_exp_to_submit.result.loc["ensemble"].iloc[0].round(3) ----> V10 CODE VERSION
+            # current_sota_score = sota_exp.result.loc["ensemble"].iloc[0].round(3)  # ----> V14 CODE VERSION
+            current_sota_score = (
+                trace.sota_exp_to_submit.result.loc["ensemble"].iloc[0].round(3)
+            )  # ----> V10 CODE VERSION
         else:
             current_sota_score = -1
 
@@ -1120,7 +1103,7 @@ def hypothesis_select_with_llm(
         extra_hypo_l = self._llm_select_extra_hypo(trace)
         if len(extra_hypo_l) > 0:
             # TODO:
-            selected_extra_hypo_l = self._prob_dis(
+            selected_extra_hypo_l = self._prob_dis_torch(
                 current_sota_score_in_current_trace,
                 extra_hypo_l,
                 hypothesis_candidates,