AMD-AGI · iraj465 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 12, 2026
diff --git a/.gitignore b/.gitignore
@@ -14,7 +14,15 @@ agents/geak_optimagentv2/.geak_setup_complete
 agents/geak_ourllm_kernel2kernel/GEAK-agent
 run.sh
 config_*.yaml
+!config_geak_triton_mem_*.yaml
 tmp*
 kill.sh
 saved_results
-.mcp.json
+.mcp.json
+
+*workspace*
+ws_mem*
+do_task.sh
+traj.json
+**/baseline_metrics.json
+**/profile.json
diff --git a/README.md b/README.md
@@ -318,6 +318,85 @@ Review the generated `validation_report.yaml` in the workspace directory. The ta
 See [agents/task_validator/README.md](agents/task_validator/README.md) for the full list of validation checks and requirements.
 
 
+## GEAK Triton Kernel Optimization Runs
+
+Multi-GPU batch optimization of Triton kernels using the GEAK agent with heterogeneous memory configuration and model ensemble.
+
+All runs use: `GEAK_CONFIG_NAME=heterogeneous_memory_on`
+
+### Batch 1 Configs & Commands
+
+**Slot 1 — GPUs 0-3** (`config_geak_triton_mem_slot1_rerun.yaml`):
+- `triton2triton/geak_eval/L1/refk_fp8_blockwise_mm`
+- `triton2triton/geak_eval/L1/moe_routing_sigmoid_top1`
+- `triton2triton/geak_eval/L1/llama_ff_triton`
+- `triton2triton/geak_eval/L1/refk_identity`
+
+```bash
+GEAK_CONFIG_NAME=heterogeneous_memory_on GEAK_GPU_IDS="0,1,2,3" \
+  python3 main.py --config_name config_geak_triton_mem_slot1_rerun.yaml \
+  > /tmp/slot1_run.log 2>&1 &
+```
+
+**Slot 2 — GPUs 4-7** (`config_geak_triton_mem_slot2_rerun.yaml`):
+- `triton2triton/geak_eval/L2/topk`
+- `triton2triton/geak_eval/L2/lean_atten_paged`
+- `triton2triton/geak_eval/L2/fast_rms_layernorm`
+- `triton2triton/geak_eval/L1/mla_decode`
+
+```bash
+GEAK_CONFIG_NAME=heterogeneous_memory_on GEAK_GPU_IDS="4,5,6,7" \
+  python3 main.py --config_name config_geak_triton_mem_slot2_rerun.yaml \
+  > /tmp/slot2_run.log 2>&1 &
+```
+
+### Batch 2 Configs & Commands
+
+**Slot 1 — GPUs 0-3** (`config_geak_triton_mem_slot1_batch2.yaml`):
+- `triton2triton/geak_eval/L1/fused_append_shared_experts`
+- `triton2triton/geak_eval/L2/ff_backward`
+- `triton2triton/geak_eval/L3/gemm_a16w16_atomic`
+- `triton2triton/geak_eval/L3/fused_qkv_rope`
+- `triton2triton/geak_eval/L3/fused_mxfp4_quant_moe_sort`
+
+```bash
+GEAK_CONFIG_NAME=heterogeneous_memory_on GEAK_GPU_IDS="0,1,2,3" \
+  python3 main.py --config_name config_geak_triton_mem_slot1_batch2.yaml \
+  > /tmp/slot1_b2_run.log 2>&1 &
+```
+
+**Slot 2 — GPUs 4-7** (`config_geak_triton_mem_slot2_batch2.yaml`):
+- `triton2triton/geak_eval/L3/gemm`
+- `triton2triton/geak_eval/L3/gemm_a16wfp4`
+- `triton2triton/geak_eval/L3/fused_moe_mxfp4`
+- `triton2triton/geak_eval/L3/fused_qk_rope_cache_mla`
+- `triton2triton/geak_eval/L3/fused_rms_fp8`
+
+```bash
+GEAK_CONFIG_NAME=heterogeneous_memory_on GEAK_GPU_IDS="4,5,6,7" \
+  python3 main.py --config_name config_geak_triton_mem_slot2_batch2.yaml \
+  > /tmp/slot2_b2_run.log 2>&1 &
+```
+
+### Monitoring
+
+```bash
+# Check processes
+ps aux | grep "main.py" | grep -v grep
+
+# Tail logs (batch 1)
+tail -20 /tmp/slot1_run.log
+tail -20 /tmp/slot2_run.log
+
+# Tail logs (batch 2)
+tail -20 /tmp/slot1_b2_run.log
+tail -20 /tmp/slot2_b2_run.log
+
+# Check completed results
+find ws_mem*/ -name "geak_summary.json" -exec echo "=== {} ===" \; -exec cat {} \;
+```
+
+
 ## Next Steps
 
 - Enhance A/B Testing with Better Interactivity and User Experience

diff --git a/agents/SWE_agent/launch_agent.py b/agents/SWE_agent/launch_agent.py
@@ -97,10 +97,21 @@ def launch_agent(eval_config: dict[str, Any], task_config_dir: str, workspace: s
         # copy the script python_bindings/tritonbench.py into the workspace
         shutil.copy(tritonbench_script_path, os.path.join(workspace, "python_bindings", "tritonbench.py"))
     if any("rocprim" in task for task in eval_config["tasks"]):
-        subprocess.run(
-            ["git", "clone", "https://github.com/ROCm/rocPRIM.git", os.path.join(workspace, "rocPRIM")],
-            check=True
-        )
+        for task in eval_config["tasks"]:
+            if "rocprim" not in task:
+                continue
+            repo_dir = Path(workspace) / "tasks" / task / "rocPRIM"
+            if (repo_dir / ".git").exists():
+                logger.info(f"Repository already exists at {repo_dir}, skipping clone")
+                continue
+            if repo_dir.exists():
+                logger.info(f"Repository directory already exists at {repo_dir}, skipping clone")
+                continue
+            repo_dir.parent.mkdir(parents=True, exist_ok=True)
+            subprocess.run(
+                ["git", "clone", "https://github.com/ROCm/rocPRIM.git", str(repo_dir)],
+                check=True,
+            )
         test_correctness_benchmark_path = Path(task_config_dir).parent / "python_bindings" / "test_correctness_benchmark.py"
         # make a dir for the target path
         os.makedirs(os.path.join(workspace, "python_bindings"), exist_ok=True)

diff --git a/agents/geak_v3/README.md b/agents/geak_v3/README.md
@@ -0,0 +1,88 @@
+## `GEAK-V3`
+
+This agent template integrates **GEAK v3** into AgentKernelArena so you can run AgentKernelArena tasks using GEAK-v3 as the optimizing agent.
+
+### 1) Install GEAK
+
+GEAK provides the `geak` CLI tools. Install GEAK in your Python environment:
+
+```bash
+cd /path/to/GEAK
+pip install -e .
+```
+
+### 2) Configure AMD LLM environment variables
+
+```bash
+export AMD_LLM_API_KEY="your-key-here"
+```
+
+### 3) Configure the GEAK runner in geak_v3
+
+Edit `agents/geak_v3/agent_config.yaml`.
+
+Key fields:
+- **`run.cmd`**: the executable to run (e.g. `geak`)
+- **`run.configs`**: CLI options passed to that executable
+
+Example:
+
+```yaml
+run:
+  cmd: geak
+  configs: "-c geak.yaml --yolo --num-parallel=2 --gpu-ids=0,1"
+```
+
+Notes:
+- `-c geak.yaml` points to `agents/geak_v3/geak.yaml` (the launcher automatically resolves it to an absolute path).
+- `--num-parallel` / `--gpu-ids` control **parallel sub-agents inside a single task** (multi-GPU). This does *not* change how AgentKernelArena schedules tasks (see the “Tasks run serially” note below).
+- If you want to use a different `agent_config.yaml` without editing the repo, set:
+
+```bash
+export GEAK_AGENT_CONFIG="/abs/path/to/agent_config.yaml"
+```
+
+### 4) Configure tasks in AgentKernelArena
+
+Edit `AgentKernelArena/config.yaml`:
+
+1) Select this agent template:
+
+```yaml
+agent:
+  template: geak_v3
+```
+
+2) Select tasks to run (task names are relative to `tasks/`):
+
+Example task list for HIP kernels:
+```yaml
+tasks:
+  - hip2hip/others/
+  - repository/rocprim/block_radix_rank
+  - repository/rocprim/device_binary_search
+  - repository/rocprim/device_search_n
+  - repository/rocprim/device_merge_sort
+```
+
+### 5) Run
+
+From the `AgentKernelArena/` directory:
+
+```bash
+python3 main.py
+```
+
+### 6) Where to find results
+
+Quick checklist:
+
+- **AgentKernelArena run log**: `logs/*.log` (path controlled by `log_directory` in `AgentKernelArena/config.yaml`)
+- **Workspace root**: `workspace_<GPU>_geak_v3/` (you can rename it by changing `workspace_directory_prefix` in `AgentKernelArena/config.yaml`)
+- **Per-task results**: `workspace_.../<task>_<timestamp>/task_result.yaml` (also `baseline_perf.yaml`, `optimized_perf.yaml`, `build/performance_report.json`)
+- **GEAK logs**: `workspace_.../<task>_<timestamp>_logs/` (see `best_results.json`, `parallel_*/`)
+- **Aggregate summary**: `workspace_.../task_results_summary.csv` (and sometimes `task_results_report.txt`)
+
+### Important: tasks run serially
+
+In AgentKernelArena, the `tasks:` list is executed **sequentially (one task at a time)**. If you want higher overall throughput, add more GPUs to **GEAK parallelism inside each task** via `--num-parallel` and `--gpu-ids`.
diff --git a/agents/geak_v3/__init__.py b/agents/geak_v3/__init__.py
@@ -0,0 +1,4 @@
+# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved.
+from agents.geak_v3.launch_agent import launch_agent
+
+__all__ = ["launch_agent"]
diff --git a/agents/geak_v3/agent_config.yaml b/agents/geak_v3/agent_config.yaml
@@ -0,0 +1,8 @@
+version: 0
+
+# Agent timeout settings
+timeout_seconds: 36000
+python_path: python3
+
+run:
+  configs: '-c geak.yaml --yolo --num-parallel=2 --gpu-ids=0,1'
diff --git a/agents/geak_v3/geak.yaml b/agents/geak_v3/geak.yaml
@@ -0,0 +1,31 @@
+agent:
+  step_limit: 0.
+  cost_limit: 0.
+  mode: confirm
+env:
+  env:
+    PAGER: cat
+    MANPAGER: cat
+    LESS: -R
+    PIP_PROGRESS_BAR: 'off'
+    TQDM_DISABLE: '1'
+  timeout: 3600
+model:
+  model_class: amd_llm
+  # claude-opus-4.5, claude-sonnet-4.5, gpt-5.1, gpt-5, gpt-5-codex
+  model_name: claude-opus-4.5
+  api_key: ""
+  # model_kwargs:
+  #   temperature: 0.0
+  #   max_tokens: 16000
+  #   # reasoning is only valid for gpt models; effort can be set to none, low, medium, or high
+  #   reasoning:
+  #     effort: high
+  #   # text is only valid for gpt models; verbosity can be set to low or high and determines how many output tokens are generated
+  #   text:
+  #     verbosity: low
+
+tools:
+  profiling: false
+  profiling_type: profiling
+  strategy_manager: true