Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 243 additions & 0 deletions agents/data_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
"""P1.7 — Data Simulation Agent.

Augments undersized datasets (after P1.5 cleaning) with synthetic rows so
downstream modeling / solver stages have enough data to work with.

Design notes
------------
- **Gaussian-perturbation bootstrap.** Re-samples cleaned rows and adds
column-scaled Gaussian noise to numeric columns; non-numeric columns are
bootstrap-copied verbatim. This keeps the joint distribution close to the
original (small KS-statistic) without pretending to discover new structure.
- **Never overwrites cleaned files.** Output goes to `augmented_{stem}.csv`
alongside the original; P3 solver can opt in.
- **`_sim_origin` column** (values: `"real"` / `"simulated"`) tags every row
so downstream code can filter if needed.
- **Runs as `on_error="skip"`**: if no eligible files, the agent records a
note in context and returns without raising.
- **No LLM dependency.** Pure statistical augmentation — reliable even when
the model router is unreachable.
"""

from __future__ import annotations

import os
from dataclasses import asdict, dataclass, field
from pathlib import Path

import numpy as np
import pandas as pd

from agents.orchestrator import load_context, save_context

try:
from scipy import stats as _scipy_stats
except ImportError:
_scipy_stats = None

BASE_DIR = Path(__file__).resolve().parent.parent
VOL_HOST = Path(os.getenv("VOL_HOST", str(BASE_DIR / "vol")))
DATA_DIR = VOL_HOST / "data"

# ── Tunables ────────────────────────────────────────────────────────────────
MIN_ROWS_FOR_MODELING = 30  # datasets with fewer rows than this trigger augmentation
TARGET_ROWS = 100  # expansion cap: total rows after augmentation (real + simulated)
PERTURBATION_SIGMA = 0.05  # relative noise std: per-column noise std = sigma * col_std
KS_WARNING_THRESHOLD = 0.30  # per-column two-sample KS stat above this gets flagged
SIM_ORIGIN_COL = "_sim_origin"  # tag column added to output: "real" / "simulated"


@dataclass
class SimulatedFile:
    """Per-file augmentation report, serialized (via ``asdict``) into context.

    Field order is part of the positional-constructor interface — do not
    reorder.
    """

    source: str  # path of the cleaned input CSV that was augmented
    output: str  # path of the written augmented_*.csv
    original_rows: int  # row count of the cleaned input
    simulated_rows: int  # number of synthetic rows appended
    method: str  # augmentation method name (e.g. "gaussian_bootstrap")
    preserved_cols: list[str] = field(default_factory=list)  # non-numeric cols copied verbatim
    numeric_cols: list[str] = field(default_factory=list)  # cols that received Gaussian noise
    ks_stats: dict[str, float] = field(default_factory=dict)  # column -> two-sample KS stat
    warnings: list[str] = field(default_factory=list)  # data-quality notes from augmentation


@dataclass
class SimulationResult:
    """Aggregate P1.7 outcome written to the pipeline context.

    Field order is part of the positional-constructor interface — do not
    reorder.
    """

    trigger_threshold: int = MIN_ROWS_FOR_MODELING  # row count below which augmentation fires
    target_rows: int = TARGET_ROWS  # total-row cap used for this run
    files: list[dict] = field(default_factory=list)  # asdict(SimulatedFile) per augmented file
    skipped: list[dict] = field(default_factory=list)  # {"source", "reason"} per skipped file
    simulated_files: list[str] = field(default_factory=list)  # output paths, for downstream opt-in
    total_rows_added: int = 0  # sum of simulated_rows across all files

    def to_dict(self) -> dict:
        """Return a plain-dict view suitable for JSON-serializable context."""
        return asdict(self)


# ── Helpers ─────────────────────────────────────────────────────────────────


def _ks_2samp(a: np.ndarray, b: np.ndarray) -> float:
"""Two-sample KS statistic. Uses scipy if available, else numpy-based fallback."""
if _scipy_stats is not None:
stat, _ = _scipy_stats.ks_2samp(a, b)
return float(stat)
# Fallback: compute empirical CDF difference manually.
combined = np.sort(np.concatenate([a, b]))
cdf_a = np.searchsorted(np.sort(a), combined, side="right") / a.size
cdf_b = np.searchsorted(np.sort(b), combined, side="right") / b.size
return float(np.max(np.abs(cdf_a - cdf_b)))


def _numeric_columns(df: pd.DataFrame) -> list[str]:
    """Names of numeric-dtype columns, excluding the simulation-origin tag."""
    names: list[str] = []
    for name in df.columns:
        if name == SIM_ORIGIN_COL:
            continue
        if pd.api.types.is_numeric_dtype(df[name]):
            names.append(name)
    return names


def _iter_cleaned_entries(ctx: dict):
    """Yield ``(filename, cleaned_path)`` for every successful P1.5 entry.

    Only entries with ``status == "success"``, a non-empty ``cleaned_file``
    and an existing file on disk are yielded; relative paths are resolved
    against the project base directory.
    """
    results = ctx.get("data_cleaning", {}).get("results", {}) or {}
    for fname, entry in results.items():
        if not isinstance(entry, dict) or entry.get("status") != "success":
            continue
        cleaned = entry.get("cleaned_file")
        if not cleaned:
            continue
        candidate = Path(cleaned)
        if not candidate.is_absolute():
            candidate = (BASE_DIR / cleaned).resolve()
        if candidate.exists():
            yield fname, candidate


def _gaussian_bootstrap(
    df: pd.DataFrame,
    target_rows: int,
    sigma: float,
    rng: np.random.Generator,
) -> tuple[pd.DataFrame, dict[str, float], list[str]]:
    """Generate synthetic rows via bootstrap resampling + Gaussian noise.

    Parameters
    ----------
    df : cleaned source frame (real rows only).
    target_rows : desired total row count (real + simulated); acts as a cap.
    sigma : relative noise scale; per-column noise std is ``sigma * col_std``.
    rng : seeded generator so augmentation is reproducible.

    Returns
    -------
    ``(augmented_df, ks_stats, warnings)`` — the augmented frame carries the
    ``_sim_origin`` tag column ("real" / "simulated"); ``ks_stats`` maps
    column name to the real-vs-synthetic two-sample KS statistic;
    ``warnings`` lists data-quality notes.
    """
    n_real = len(df)
    n_needed = max(0, target_rows - n_real)
    numeric_cols = _numeric_columns(df)
    warnings: list[str] = []

    if n_needed == 0:
        # Already large enough: tag everything as real, synthesize nothing.
        out = df.copy()
        out[SIM_ORIGIN_COL] = "real"
        return out, {}, warnings

    # Bootstrap-sample row indices; copy every column verbatim first.
    sampled_idx = rng.integers(0, n_real, size=n_needed)
    synth = df.iloc[sampled_idx].reset_index(drop=True).copy()

    # Perturb numeric columns: add N(0, sigma * col_std) scaled noise.
    ks_stats: dict[str, float] = {}
    for col in numeric_cols:
        col_values = df[col].to_numpy(dtype=float)
        finite = col_values[np.isfinite(col_values)]
        if finite.size < 2:
            warnings.append(f"{col}: <2 finite values, no noise added")
            continue
        std = float(np.std(finite, ddof=1))
        if std == 0.0:
            # Constant column: keep verbatim.
            continue
        noise = rng.normal(0.0, sigma * std, size=n_needed)
        synth[col] = synth[col].astype(float) + noise

        try:
            # Bootstrapped rows inherit NaNs from the source (NaN + noise is
            # still NaN), so compare only finite synthetic values; otherwise
            # the KS statistic degrades to NaN, silently bypassing the
            # threshold check and polluting the serialized context.
            synth_values = synth[col].to_numpy(dtype=float)
            synth_finite = synth_values[np.isfinite(synth_values)]
            if synth_finite.size == 0:
                continue
            stat = _ks_2samp(finite, synth_finite)
            ks_stats[col] = round(stat, 4)
            if stat > KS_WARNING_THRESHOLD:
                warnings.append(f"{col}: KS stat {stat:.3f} > {KS_WARNING_THRESHOLD}")
        except Exception:
            # The KS check is diagnostic only; never abort augmentation.
            pass

    real_part = df.copy()
    real_part[SIM_ORIGIN_COL] = "real"
    synth[SIM_ORIGIN_COL] = "simulated"
    out = pd.concat([real_part, synth], ignore_index=True)
    return out, ks_stats, warnings


def _augment_one(fname: str, cleaned_path: Path, rng: np.random.Generator) -> tuple[SimulatedFile | None, dict | None]:
    """Process a single cleaned file.

    Returns ``(simulated_file, None)`` on success, or ``(None, skip_entry)``
    where ``skip_entry`` is a ``{"source", "reason"}`` dict explaining why the
    file was not augmented (unreadable, empty, already large enough, or no
    numeric columns to perturb).
    """
    try:
        df = pd.read_csv(cleaned_path)
    except Exception as exc:
        return None, {"source": fname, "reason": f"read_csv failed: {exc}"}

    n_real = len(df)
    if n_real == 0:
        return None, {"source": fname, "reason": "empty dataframe"}

    if n_real >= MIN_ROWS_FOR_MODELING:
        return None, {"source": fname, "reason": f"sufficient rows ({n_real} >= {MIN_ROWS_FOR_MODELING})"}

    numeric_cols = _numeric_columns(df)
    if not numeric_cols:
        return None, {"source": fname, "reason": "no numeric columns to perturb"}

    out_df, ks_stats, warns = _gaussian_bootstrap(df, TARGET_ROWS, PERTURBATION_SIGMA, rng)

    # Strip "cleaned_" only when it is actually a prefix. The previous
    # stem.replace("cleaned_", "", 1) removed the first occurrence anywhere,
    # mangling names like "data_cleaned_v2" into "data_v2".
    stem = cleaned_path.stem
    if stem.startswith("cleaned_"):
        stem = stem[len("cleaned_"):]
    out_path = cleaned_path.with_name(f"augmented_{stem}.csv")
    out_df.to_csv(out_path, index=False, encoding="utf-8")

    preserved = [c for c in df.columns if c not in numeric_cols]
    simulated_added = len(out_df) - n_real
    return (
        SimulatedFile(
            source=str(cleaned_path),
            output=str(out_path),
            original_rows=n_real,
            simulated_rows=simulated_added,
            method="gaussian_bootstrap",
            preserved_cols=preserved,
            numeric_cols=numeric_cols,
            ks_stats=ks_stats,
            warnings=warns,
        ),
        None,
    )


class DataSimulationAgent:
    """P1.7 — augment undersized cleaned CSVs with Gaussian-perturbation bootstrap."""

    def __init__(self, seed: int = 42) -> None:
        # One seeded generator shared across files keeps runs reproducible.
        self._rng = np.random.default_rng(seed)

    def run(self) -> dict:
        """Scan P1.5 outputs, augment undersized files, persist to context."""
        ctx = load_context()
        result = SimulationResult()

        entries = list(_iter_cleaned_entries(ctx))
        if not entries:
            print(" [P1.7] 未发现 P1.5 清洗产物,跳过")
            return self._write(ctx, result, note="no cleaned files")

        for fname, path in entries:
            sim, skip = _augment_one(fname, path, self._rng)
            if sim is None:
                if skip is not None:
                    result.skipped.append(skip)
                continue
            result.files.append(asdict(sim))
            result.simulated_files.append(sim.output)
            result.total_rows_added += sim.simulated_rows

        print(
            f" [P1.7] 扫描 {len(entries)} 个清洗文件 → "
            f"增强 {len(result.files)} 个 (+{result.total_rows_added} 行),"
            f"跳过 {len(result.skipped)}"
        )
        return self._write(ctx, result)

    @staticmethod
    def _write(ctx: dict, result: SimulationResult, note: str = "") -> dict:
        """Store the payload under ``data_simulation``, advance phase, save."""
        summary = result.to_dict()
        if note:
            summary["note"] = note
        ctx["data_simulation"] = summary
        ctx["phase"] = "P1.7_complete"
        save_context(ctx)
        return ctx
26 changes: 26 additions & 0 deletions agents/experience_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,20 @@
}
""",

"P1.7": """你是数据增强专家。根据数据仿真结果,提炼供"下次处理小样本数据"时参考的经验。

输出严格 JSON(不含 markdown 代码块),结构:
{
"augmentation_triggered": true/false,
"small_sample_signals": ["触发仿真的信号1(例:原始<30行)", "信号2"],
"numeric_vs_categorical_ratio": "数值/非数值列比例说明",
"ks_quality": "KS统计量整体质量(<0.1优 / 0.1-0.3可接受 / >0.3需警惕)",
"warnings_encountered": ["出现的告警1", "告警2"],
"pitfalls": ["踩坑记录1(例:扰动 sigma 过大破坏相关结构)"],
"reuse_tips": ["复用建议1(例:对时序数据宜改用 block bootstrap)"]
}
""",

"P2": """你是数学建模专家。根据建模过程,提炼供"下次遇到相似建模任务"时参考的经验。

输出严格 JSON(不含 markdown 代码块),结构:
Expand Down Expand Up @@ -172,6 +186,17 @@ def _extract_phase_context(ctx: dict, phase: str) -> str:
summaries = dc.get("stdout_summaries", {})
snippets["stdout_summary_sample"] = list(summaries.values())[:2]

elif phase == "P1.7":
sim = ctx.get("data_simulation", {})
snippets["trigger_threshold"] = sim.get("trigger_threshold")
snippets["target_rows"] = sim.get("target_rows")
snippets["total_rows_added"] = sim.get("total_rows_added", 0)
snippets["files"] = [
{k: v for k, v in f.items() if k in ("source", "original_rows", "simulated_rows", "method", "ks_stats", "warnings")}
for f in sim.get("files", [])
][:4]
snippets["skipped"] = sim.get("skipped", [])[:4]

elif phase == "P2":
m = ctx.get("modeling", {})
snippets["model_type"] = m.get("model_type", "")
Expand Down Expand Up @@ -313,6 +338,7 @@ def record_experience(phase: str) -> dict | None:
_PHASE_NAME = {
"P1": "题目解析",
"P1.5": "数据清洗",
"P1.7": "数据仿真",
"P2": "数学建模",
"P3": "代码求解",
"P4": "论文撰写",
Expand Down
12 changes: 11 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from agents.modeling_agent import ModelingAgent
from agents.model_compare import ModelCompareAgent
from agents.paper_figures import PaperFiguresAgent
from agents.data_simulation import DataSimulationAgent
from agents.matlab_viz import MatlabVizAgent
from agents.viz3d import Viz3DAgent
from agents.code_agent import CodeAgent
Expand Down Expand Up @@ -85,6 +86,14 @@ def p1_5(ctx: dict) -> PhaseOutcome:
note = f"数据清洗: {success}/{len(results)} 文件成功" + (f",EDA 图片 {len(figs)} 张" if figs else "")
return PhaseOutcome(ctx=new_ctx, note=note)

def p1_7(ctx: dict) -> PhaseOutcome:
    """P1.7 phase runner: small-sample augmentation via DataSimulationAgent."""
    new_ctx = DataSimulationAgent().run()
    summary = new_ctx.get("data_simulation", {})
    rows_added = summary.get("total_rows_added", 0)
    if rows_added:
        file_count = len(summary.get("files", []))
        note = f"数据仿真: {file_count} 文件, +{rows_added} 行"
    else:
        note = "无需仿真(样本充足或无输入)"
    return PhaseOutcome(ctx=new_ctx, note=note)

def p2(ctx: dict) -> PhaseOutcome:
new_ctx = ModelingAgent().run()
model = new_ctx["modeling"].get("primary_model", {})
Expand Down Expand Up @@ -179,6 +188,7 @@ def p5_5(ctx: dict) -> PhaseOutcome:
PhaseSpec(name="P0b", run=p0b, on_error="skip", description="PDF → Markdown"),
PhaseSpec(name="P1", run=p1, record_experience=True, description="题目解析 + 三手分发"),
PhaseSpec(name="P1.5", run=p1_5, record_experience=True, description="数据清洗 + EDA"),
PhaseSpec(name="P1.7", run=p1_7, on_error="skip", record_experience=True, description="小样本数据仿真增强"),
PhaseSpec(name="P2", run=p2, record_experience=True, description="数学建模"),
PhaseSpec(name="P2.8", run=p2_8, on_error="skip", description="多模型对比(LLM + 指标)"),
PhaseSpec(name="P2.5", run=p2_5, on_error="skip", description="MATLAB 风格可视化"),
Expand Down Expand Up @@ -267,7 +277,7 @@ def run_pipeline(start_phase: str = "P0b", selected_problem: str | None = None)
parser.add_argument(
"--start",
default="P0b",
choices=["P0b", "P1", "P1.5", "P2", "P2.8", "P2.5", "P2.7", "P3", "P3.5", "P3.7", "P4", "P4.5", "P5", "P5.5"],
choices=["P0b", "P1", "P1.5", "P1.7", "P2", "P2.8", "P2.5", "P2.7", "P3", "P3.5", "P3.7", "P4", "P4.5", "P5", "P5.5"],
help="起始阶段,默认 P0b",
)
parser.add_argument(
Expand Down
20 changes: 11 additions & 9 deletions ui/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"P0b": {"name": "PDF 转译", "agent": "pdf_agent.py", "icon": "doc"},
"P1": {"name": "题目解析", "agent": "question_extractor.py", "icon": "search"},
"P1.5": {"name": "数据清洗", "agent": "data_cleaning_agent.py","icon": "data"},
"P1.7": {"name": "数据仿真", "agent": "data_simulation.py", "icon": "data"},
"P2": {"name": "数学建模", "agent": "modeling_agent.py", "icon": "model"},
"P2.5": {"name": "数学可视化", "agent": "matlab_viz.py", "icon": "model"},
"P3": {"name": "代码求解", "agent": "code_agent.py", "icon": "code"},
Expand All @@ -77,22 +78,23 @@
"P5.5": {"name": "数据审计", "agent": "data_validator.py", "icon": "audit"},
}

PHASE_ORDER = ["P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5", "P5.5"]
PHASE_ORDER = ["P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5", "P5.5"]

PHASE_COMPLETE_MAP = {
"init": set(),
"P0b_complete": {"P0b"},
"P1_extraction_complete": {"P0b", "P1"},
"P1.5_complete": {"P0b", "P1", "P1.5"},
"P1.5_skipped": {"P0b", "P1", "P1.5"},
"P2_complete": {"P0b", "P1", "P1.5", "P2"},
"P2.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5"},
"P3_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3"},
"P3_logic_err": {"P0b", "P1", "P1.5", "P2", "P2.5"},
"P3.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5"},
"P4_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4"},
"P4.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5"},
"P5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5"},
"P1.7_complete": {"P0b", "P1", "P1.5", "P1.7"},
"P2_complete": {"P0b", "P1", "P1.5", "P1.7", "P2"},
"P2.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5"},
"P3_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3"},
"P3_logic_err": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5"},
"P3.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5"},
"P4_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4"},
"P4.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5"},
"P5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5"},
"P5.5_complete": set(PHASE_ORDER),
}

Expand Down
Loading