Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 243 additions & 0 deletions agents/data_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
"""P1.7 — Data Simulation Agent.

Augments undersized datasets (after P1.5 cleaning) with synthetic rows so
downstream modeling / solver stages have enough data to work with.

Design notes
------------
- **Gaussian-perturbation bootstrap.** Re-samples cleaned rows and adds
column-scaled Gaussian noise to numeric columns; non-numeric columns are
bootstrap-copied verbatim. This keeps the joint distribution close to the
original (small KS-statistic) without pretending to discover new structure.
- **Never overwrites cleaned files.** Output goes to `augmented_{stem}.csv`
alongside the original; P3 solver can opt in.
- **`_sim_origin` column** (values: `"real"` / `"simulated"`) tags every row
so downstream code can filter if needed.
- **Runs as `on_error="skip"`**: if no eligible files, the agent records a
note in context and returns without raising.
- **No LLM dependency.** Pure statistical augmentation — reliable even when
the model router is unreachable.
"""

from __future__ import annotations

import os
from dataclasses import asdict, dataclass, field
from pathlib import Path

import numpy as np
import pandas as pd

from agents.orchestrator import load_context, save_context

try:
from scipy import stats as _scipy_stats
except ImportError:
_scipy_stats = None

BASE_DIR = Path(__file__).resolve().parent.parent
VOL_HOST = Path(os.getenv("VOL_HOST", str(BASE_DIR / "vol")))
DATA_DIR = VOL_HOST / "data"

# ── Tunables ────────────────────────────────────────────────────────────────
MIN_ROWS_FOR_MODELING = 30  # datasets with fewer rows than this trigger augmentation
TARGET_ROWS = 100  # expansion cap: total rows after augmentation (real + simulated)
PERTURBATION_SIGMA = 0.05  # relative noise std: per-column noise std = sigma * col_std
KS_WARNING_THRESHOLD = 0.30  # per-column two-sample KS stat above this gets flagged
SIM_ORIGIN_COL = "_sim_origin"  # tag column added to output: "real" / "simulated"


@dataclass
class SimulatedFile:
    """Per-file augmentation report, serialized (via ``asdict``) into context.

    Field order is part of the positional-constructor interface — do not
    reorder.
    """

    source: str  # path of the cleaned input CSV that was augmented
    output: str  # path of the written augmented_*.csv
    original_rows: int  # row count of the cleaned input
    simulated_rows: int  # number of synthetic rows appended
    method: str  # augmentation method name (e.g. "gaussian_bootstrap")
    preserved_cols: list[str] = field(default_factory=list)  # non-numeric cols copied verbatim
    numeric_cols: list[str] = field(default_factory=list)  # cols that received Gaussian noise
    ks_stats: dict[str, float] = field(default_factory=dict)  # column -> two-sample KS stat
    warnings: list[str] = field(default_factory=list)  # data-quality notes from augmentation


@dataclass
class SimulationResult:
    """Aggregate P1.7 outcome written to the pipeline context.

    Field order is part of the positional-constructor interface — do not
    reorder.
    """

    trigger_threshold: int = MIN_ROWS_FOR_MODELING  # row count below which augmentation fires
    target_rows: int = TARGET_ROWS  # total-row cap used for this run
    files: list[dict] = field(default_factory=list)  # asdict(SimulatedFile) per augmented file
    skipped: list[dict] = field(default_factory=list)  # {"source", "reason"} per skipped file
    simulated_files: list[str] = field(default_factory=list)  # output paths, for downstream opt-in
    total_rows_added: int = 0  # sum of simulated_rows across all files

    def to_dict(self) -> dict:
        """Return a plain-dict view suitable for JSON-serializable context."""
        return asdict(self)


# ── Helpers ─────────────────────────────────────────────────────────────────


def _ks_2samp(a: np.ndarray, b: np.ndarray) -> float:
"""Two-sample KS statistic. Uses scipy if available, else numpy-based fallback."""
if _scipy_stats is not None:
stat, _ = _scipy_stats.ks_2samp(a, b)
return float(stat)
# Fallback: compute empirical CDF difference manually.
combined = np.sort(np.concatenate([a, b]))
cdf_a = np.searchsorted(np.sort(a), combined, side="right") / a.size
cdf_b = np.searchsorted(np.sort(b), combined, side="right") / b.size
return float(np.max(np.abs(cdf_a - cdf_b)))


def _numeric_columns(df: pd.DataFrame) -> list[str]:
    """Names of numeric-dtype columns, excluding the simulation-origin tag."""
    names: list[str] = []
    for name in df.columns:
        if name == SIM_ORIGIN_COL:
            continue
        if pd.api.types.is_numeric_dtype(df[name]):
            names.append(name)
    return names


def _iter_cleaned_entries(ctx: dict):
    """Yield ``(filename, cleaned_path)`` for every successful P1.5 entry.

    Only entries with ``status == "success"``, a non-empty ``cleaned_file``
    and an existing file on disk are yielded; relative paths are resolved
    against the project base directory.
    """
    results = ctx.get("data_cleaning", {}).get("results", {}) or {}
    for fname, entry in results.items():
        if not isinstance(entry, dict) or entry.get("status") != "success":
            continue
        cleaned = entry.get("cleaned_file")
        if not cleaned:
            continue
        candidate = Path(cleaned)
        if not candidate.is_absolute():
            candidate = (BASE_DIR / cleaned).resolve()
        if candidate.exists():
            yield fname, candidate


def _gaussian_bootstrap(
    df: pd.DataFrame,
    target_rows: int,
    sigma: float,
    rng: np.random.Generator,
) -> tuple[pd.DataFrame, dict[str, float], list[str]]:
    """Generate synthetic rows via bootstrap resampling + Gaussian noise.

    Parameters
    ----------
    df : cleaned source frame (real rows only).
    target_rows : desired total row count (real + simulated); acts as a cap.
    sigma : relative noise scale; per-column noise std is ``sigma * col_std``.
    rng : seeded generator so augmentation is reproducible.

    Returns
    -------
    ``(augmented_df, ks_stats, warnings)`` — the augmented frame carries the
    ``_sim_origin`` tag column ("real" / "simulated"); ``ks_stats`` maps
    column name to the real-vs-synthetic two-sample KS statistic;
    ``warnings`` lists data-quality notes.
    """
    n_real = len(df)
    n_needed = max(0, target_rows - n_real)
    numeric_cols = _numeric_columns(df)
    warnings: list[str] = []

    if n_needed == 0:
        # Already large enough: tag everything as real, synthesize nothing.
        out = df.copy()
        out[SIM_ORIGIN_COL] = "real"
        return out, {}, warnings

    # Bootstrap-sample row indices; copy every column verbatim first.
    sampled_idx = rng.integers(0, n_real, size=n_needed)
    synth = df.iloc[sampled_idx].reset_index(drop=True).copy()

    # Perturb numeric columns: add N(0, sigma * col_std) scaled noise.
    ks_stats: dict[str, float] = {}
    for col in numeric_cols:
        col_values = df[col].to_numpy(dtype=float)
        finite = col_values[np.isfinite(col_values)]
        if finite.size < 2:
            warnings.append(f"{col}: <2 finite values, no noise added")
            continue
        std = float(np.std(finite, ddof=1))
        if std == 0.0:
            # Constant column: keep verbatim.
            continue
        noise = rng.normal(0.0, sigma * std, size=n_needed)
        synth[col] = synth[col].astype(float) + noise

        try:
            # Bootstrapped rows inherit NaNs from the source (NaN + noise is
            # still NaN), so compare only finite synthetic values; otherwise
            # the KS statistic degrades to NaN, silently bypassing the
            # threshold check and polluting the serialized context.
            synth_values = synth[col].to_numpy(dtype=float)
            synth_finite = synth_values[np.isfinite(synth_values)]
            if synth_finite.size == 0:
                continue
            stat = _ks_2samp(finite, synth_finite)
            ks_stats[col] = round(stat, 4)
            if stat > KS_WARNING_THRESHOLD:
                warnings.append(f"{col}: KS stat {stat:.3f} > {KS_WARNING_THRESHOLD}")
        except Exception:
            # The KS check is diagnostic only; never abort augmentation.
            pass

    real_part = df.copy()
    real_part[SIM_ORIGIN_COL] = "real"
    synth[SIM_ORIGIN_COL] = "simulated"
    out = pd.concat([real_part, synth], ignore_index=True)
    return out, ks_stats, warnings


def _augment_one(fname: str, cleaned_path: Path, rng: np.random.Generator) -> tuple[SimulatedFile | None, dict | None]:
    """Process a single cleaned file.

    Returns ``(simulated_file, None)`` on success, or ``(None, skip_entry)``
    where ``skip_entry`` is a ``{"source", "reason"}`` dict explaining why the
    file was not augmented (unreadable, empty, already large enough, or no
    numeric columns to perturb).
    """
    try:
        df = pd.read_csv(cleaned_path)
    except Exception as exc:
        return None, {"source": fname, "reason": f"read_csv failed: {exc}"}

    n_real = len(df)
    if n_real == 0:
        return None, {"source": fname, "reason": "empty dataframe"}

    if n_real >= MIN_ROWS_FOR_MODELING:
        return None, {"source": fname, "reason": f"sufficient rows ({n_real} >= {MIN_ROWS_FOR_MODELING})"}

    numeric_cols = _numeric_columns(df)
    if not numeric_cols:
        return None, {"source": fname, "reason": "no numeric columns to perturb"}

    out_df, ks_stats, warns = _gaussian_bootstrap(df, TARGET_ROWS, PERTURBATION_SIGMA, rng)

    # Strip "cleaned_" only when it is actually a prefix. The previous
    # stem.replace("cleaned_", "", 1) removed the first occurrence anywhere,
    # mangling names like "data_cleaned_v2" into "data_v2".
    stem = cleaned_path.stem
    if stem.startswith("cleaned_"):
        stem = stem[len("cleaned_"):]
    out_path = cleaned_path.with_name(f"augmented_{stem}.csv")
    out_df.to_csv(out_path, index=False, encoding="utf-8")

    preserved = [c for c in df.columns if c not in numeric_cols]
    simulated_added = len(out_df) - n_real
    return (
        SimulatedFile(
            source=str(cleaned_path),
            output=str(out_path),
            original_rows=n_real,
            simulated_rows=simulated_added,
            method="gaussian_bootstrap",
            preserved_cols=preserved,
            numeric_cols=numeric_cols,
            ks_stats=ks_stats,
            warnings=warns,
        ),
        None,
    )


class DataSimulationAgent:
    """P1.7 — augment undersized cleaned CSVs with Gaussian-perturbation bootstrap."""

    def __init__(self, seed: int = 42) -> None:
        # One seeded generator shared across files keeps runs reproducible.
        self._rng = np.random.default_rng(seed)

    def run(self) -> dict:
        """Scan P1.5 outputs, augment undersized files, persist to context."""
        ctx = load_context()
        result = SimulationResult()

        entries = list(_iter_cleaned_entries(ctx))
        if not entries:
            print(" [P1.7] 未发现 P1.5 清洗产物,跳过")
            return self._write(ctx, result, note="no cleaned files")

        for fname, path in entries:
            sim, skip = _augment_one(fname, path, self._rng)
            if sim is None:
                if skip is not None:
                    result.skipped.append(skip)
                continue
            result.files.append(asdict(sim))
            result.simulated_files.append(sim.output)
            result.total_rows_added += sim.simulated_rows

        print(
            f" [P1.7] 扫描 {len(entries)} 个清洗文件 → "
            f"增强 {len(result.files)} 个 (+{result.total_rows_added} 行),"
            f"跳过 {len(result.skipped)}"
        )
        return self._write(ctx, result)

    @staticmethod
    def _write(ctx: dict, result: SimulationResult, note: str = "") -> dict:
        """Store the payload under ``data_simulation``, advance phase, save."""
        summary = result.to_dict()
        if note:
            summary["note"] = note
        ctx["data_simulation"] = summary
        ctx["phase"] = "P1.7_complete"
        save_context(ctx)
        return ctx
26 changes: 26 additions & 0 deletions agents/experience_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,20 @@
}
""",

"P1.7": """你是数据增强专家。根据数据仿真结果,提炼供"下次处理小样本数据"时参考的经验。

输出严格 JSON(不含 markdown 代码块),结构:
{
"augmentation_triggered": true/false,
"small_sample_signals": ["触发仿真的信号1(例:原始<30行)", "信号2"],
"numeric_vs_categorical_ratio": "数值/非数值列比例说明",
"ks_quality": "KS统计量整体质量(<0.1优 / 0.1-0.3可接受 / >0.3需警惕)",
"warnings_encountered": ["出现的告警1", "告警2"],
"pitfalls": ["踩坑记录1(例:扰动 sigma 过大破坏相关结构)"],
"reuse_tips": ["复用建议1(例:对时序数据宜改用 block bootstrap)"]
}
""",

"P2": """你是数学建模专家。根据建模过程,提炼供"下次遇到相似建模任务"时参考的经验。

输出严格 JSON(不含 markdown 代码块),结构:
Expand Down Expand Up @@ -172,6 +186,17 @@ def _extract_phase_context(ctx: dict, phase: str) -> str:
summaries = dc.get("stdout_summaries", {})
snippets["stdout_summary_sample"] = list(summaries.values())[:2]

elif phase == "P1.7":
sim = ctx.get("data_simulation", {})
snippets["trigger_threshold"] = sim.get("trigger_threshold")
snippets["target_rows"] = sim.get("target_rows")
snippets["total_rows_added"] = sim.get("total_rows_added", 0)
snippets["files"] = [
{k: v for k, v in f.items() if k in ("source", "original_rows", "simulated_rows", "method", "ks_stats", "warnings")}
for f in sim.get("files", [])
][:4]
snippets["skipped"] = sim.get("skipped", [])[:4]

elif phase == "P2":
m = ctx.get("modeling", {})
snippets["model_type"] = m.get("model_type", "")
Expand Down Expand Up @@ -313,6 +338,7 @@ def record_experience(phase: str) -> dict | None:
_PHASE_NAME = {
"P1": "题目解析",
"P1.5": "数据清洗",
"P1.7": "数据仿真",
"P2": "数学建模",
"P3": "代码求解",
"P4": "论文撰写",
Expand Down
12 changes: 11 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from agents.modeling_agent import ModelingAgent
from agents.model_compare import ModelCompareAgent
from agents.paper_figures import PaperFiguresAgent
from agents.data_simulation import DataSimulationAgent
from agents.matlab_viz import MatlabVizAgent
from agents.viz3d import Viz3DAgent
from agents.code_agent import CodeAgent
Expand Down Expand Up @@ -85,6 +86,14 @@ def p1_5(ctx: dict) -> PhaseOutcome:
note = f"数据清洗: {success}/{len(results)} 文件成功" + (f",EDA 图片 {len(figs)} 张" if figs else "")
return PhaseOutcome(ctx=new_ctx, note=note)

def p1_7(ctx: dict) -> PhaseOutcome:
    """P1.7 phase runner: small-sample augmentation via DataSimulationAgent."""
    new_ctx = DataSimulationAgent().run()
    summary = new_ctx.get("data_simulation", {})
    rows_added = summary.get("total_rows_added", 0)
    if rows_added:
        file_count = len(summary.get("files", []))
        note = f"数据仿真: {file_count} 文件, +{rows_added} 行"
    else:
        note = "无需仿真(样本充足或无输入)"
    return PhaseOutcome(ctx=new_ctx, note=note)

def p2(ctx: dict) -> PhaseOutcome:
new_ctx = ModelingAgent().run()
model = new_ctx["modeling"].get("primary_model", {})
Expand Down Expand Up @@ -179,6 +188,7 @@ def p5_5(ctx: dict) -> PhaseOutcome:
PhaseSpec(name="P0b", run=p0b, on_error="skip", description="PDF → Markdown"),
PhaseSpec(name="P1", run=p1, record_experience=True, description="题目解析 + 三手分发"),
PhaseSpec(name="P1.5", run=p1_5, record_experience=True, description="数据清洗 + EDA"),
PhaseSpec(name="P1.7", run=p1_7, on_error="skip", record_experience=True, description="小样本数据仿真增强"),
PhaseSpec(name="P2", run=p2, record_experience=True, description="数学建模"),
PhaseSpec(name="P2.8", run=p2_8, on_error="skip", description="多模型对比(LLM + 指标)"),
PhaseSpec(name="P2.5", run=p2_5, on_error="skip", description="MATLAB 风格可视化"),
Expand Down Expand Up @@ -267,7 +277,7 @@ def run_pipeline(start_phase: str = "P0b", selected_problem: str | None = None)
parser.add_argument(
"--start",
default="P0b",
choices=["P0b", "P1", "P1.5", "P2", "P2.8", "P2.5", "P2.7", "P3", "P3.5", "P3.7", "P4", "P4.5", "P5", "P5.5"],
choices=["P0b", "P1", "P1.5", "P1.7", "P2", "P2.8", "P2.5", "P2.7", "P3", "P3.5", "P3.7", "P4", "P4.5", "P5", "P5.5"],
help="起始阶段,默认 P0b",
)
parser.add_argument(
Expand Down
20 changes: 11 additions & 9 deletions ui/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"P0b": {"name": "PDF 转译", "agent": "pdf_agent.py", "icon": "doc"},
"P1": {"name": "题目解析", "agent": "question_extractor.py", "icon": "search"},
"P1.5": {"name": "数据清洗", "agent": "data_cleaning_agent.py","icon": "data"},
"P1.7": {"name": "数据仿真", "agent": "data_simulation.py", "icon": "data"},
"P2": {"name": "数学建模", "agent": "modeling_agent.py", "icon": "model"},
"P2.5": {"name": "数学可视化", "agent": "matlab_viz.py", "icon": "model"},
"P3": {"name": "代码求解", "agent": "code_agent.py", "icon": "code"},
Expand All @@ -77,22 +78,23 @@
"P5.5": {"name": "数据审计", "agent": "data_validator.py", "icon": "audit"},
}

PHASE_ORDER = ["P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5", "P5.5"]
PHASE_ORDER = ["P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5", "P5.5"]

PHASE_COMPLETE_MAP = {
"init": set(),
"P0b_complete": {"P0b"},
"P1_extraction_complete": {"P0b", "P1"},
"P1.5_complete": {"P0b", "P1", "P1.5"},
"P1.5_skipped": {"P0b", "P1", "P1.5"},
"P2_complete": {"P0b", "P1", "P1.5", "P2"},
"P2.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5"},
"P3_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3"},
"P3_logic_err": {"P0b", "P1", "P1.5", "P2", "P2.5"},
"P3.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5"},
"P4_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4"},
"P4.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5"},
"P5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5"},
"P1.7_complete": {"P0b", "P1", "P1.5", "P1.7"},
"P2_complete": {"P0b", "P1", "P1.5", "P1.7", "P2"},
"P2.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5"},
"P3_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3"},
"P3_logic_err": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5"},
"P3.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5"},
"P4_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4"},
"P4.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5"},
"P5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5"},
"P5.5_complete": set(PHASE_ORDER),
}

Expand Down
Loading