745 changes: 745 additions & 0 deletions analyze_lambda_occupancy.py

Large diffs are not rendered by default.

266 changes: 258 additions & 8 deletions data/workload_statistics/analyze_workload_logs.py

Large diffs are not rendered by default.

44 changes: 28 additions & 16 deletions data/workload_statistics/workload_logs.txt
@@ -4,11 +4,14 @@ hours_observed=788
arrivals_per_hour: mean=4.2119 std=77.5734
duration_hours: mean=7.4852 std=29.6084
nodes: mean=1.0054 std=0.1202
cores: mean=14.9328 std=30.1301
corr(duration, nodes)=0.007262 corr(duration, cores)=0.628780
cores: mean=14.8503 std=30.0671
corr(duration, nodes)=0.007262 corr(duration, cores)=0.629600
burst_baseline_jobs_per_hour: small=0.0000 heavy=0.0000
burst_prob_estimates: small_event=0.0025 small_volume=0.0243 heavy_event=0.0000 heavy_volume=0.0000
suggested_flags: --wg-burst-small-prob 0.0025 --wg-burst-heavy-prob 0.0000
burst_hours_detected: small=11 heavy=0
suggested_small_ranges: jobs=3:767 duration=1:2 nodes=1:1 cores=2:6
suggested_heavy_ranges: jobs=nan:nan duration=nan:nan nodes=nan:nan cores=nan:nan
suggested_flags: --wg-burst-small-prob 0.0025 --wg-burst-heavy-prob 0.0000 --wg-burst-small-jobs-min 3 --wg-burst-small-jobs-max 767 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 2 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 2 --wg-burst-small-cores-max 6 --wg-burst-heavy-jobs-min nan --wg-burst-heavy-jobs-max nan --wg-burst-heavy-duration-min nan --wg-burst-heavy-duration-max nan --wg-burst-heavy-nodes-min nan --wg-burst-heavy-nodes-max nan --wg-burst-heavy-cores-min nan --wg-burst-heavy-cores-max nan

=== allusers-grid-30.log ===
jobs=40639 skipped_rows=1
@@ -20,43 +20,52 @@ cores: mean=7.9984 std=0.1122
corr(duration, nodes)=nan corr(duration, cores)=0.026772
burst_baseline_jobs_per_hour: small=0.0000 heavy=0.0000
burst_prob_estimates: small_event=0.0423 small_volume=0.0752 heavy_event=0.0000 heavy_volume=0.0000
suggested_flags: --wg-burst-small-prob 0.0423 --wg-burst-heavy-prob 0.0000
burst_hours_detected: small=79 heavy=0
suggested_small_ranges: jobs=15:303 duration=1:7 nodes=1:1 cores=8:8
suggested_heavy_ranges: jobs=nan:nan duration=nan:nan nodes=nan:nan cores=nan:nan
suggested_flags: --wg-burst-small-prob 0.0423 --wg-burst-heavy-prob 0.0000 --wg-burst-small-jobs-min 15 --wg-burst-small-jobs-max 303 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 7 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 8 --wg-burst-small-cores-max 8 --wg-burst-heavy-jobs-min nan --wg-burst-heavy-jobs-max nan --wg-burst-heavy-duration-min nan --wg-burst-heavy-duration-max nan --wg-burst-heavy-nodes-min nan --wg-burst-heavy-nodes-max nan --wg-burst-heavy-cores-min nan --wg-burst-heavy-cores-max nan

=== allusers-high_mem-30.log ===
jobs=237373 skipped_rows=1
hours_observed=1385
arrivals_per_hour: mean=171.3884 std=611.6586
duration_hours: mean=0.2537 std=1.8726
nodes: mean=1.0010 std=0.1191
cores: mean=13.6286 std=47.3460
corr(duration, nodes)=0.405630 corr(duration, cores)=0.113031
cores: mean=13.5372 std=45.6885
corr(duration, nodes)=0.405630 corr(duration, cores)=0.041992
burst_baseline_jobs_per_hour: small=129.0000 heavy=0.0000
burst_prob_estimates: small_event=0.1329 small_volume=0.9510 heavy_event=0.0043 heavy_volume=0.0011
suggested_flags: --wg-burst-small-prob 0.1329 --wg-burst-heavy-prob 0.0043
burst_hours_detected: small=47 heavy=2
suggested_small_ranges: jobs=1407:4689 duration=1:1 nodes=1:1 cores=4:4
suggested_heavy_ranges: jobs=2:4 duration=168:169 nodes=10:16 cores=64:64
suggested_flags: --wg-burst-small-prob 0.1329 --wg-burst-heavy-prob 0.0043 --wg-burst-small-jobs-min 1407 --wg-burst-small-jobs-max 4689 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 1 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 4 --wg-burst-small-cores-max 4 --wg-burst-heavy-jobs-min 2 --wg-burst-heavy-jobs-max 4 --wg-burst-heavy-duration-min 168 --wg-burst-heavy-duration-max 169 --wg-burst-heavy-nodes-min 10 --wg-burst-heavy-nodes-max 16 --wg-burst-heavy-cores-min 64 --wg-burst-heavy-cores-max 64

=== allusers-long-30.log ===
jobs=706918 skipped_rows=1
hours_observed=1381
arrivals_per_hour: mean=511.8885 std=1308.5007
duration_hours: mean=3.2965 std=13.0609
nodes: mean=1.0137 std=0.6736
cores: mean=7.5031 std=54.1584
corr(duration, nodes)=0.030925 corr(duration, cores)=0.020810
cores: mean=6.4411 std=10.4744
corr(duration, nodes)=0.030925 corr(duration, cores)=-0.065966
burst_baseline_jobs_per_hour: small=319.0000 heavy=0.0000
burst_prob_estimates: small_event=0.2274 small_volume=1.0000 heavy_event=0.0210 heavy_volume=0.0072
suggested_flags: --wg-burst-small-prob 0.2274 --wg-burst-heavy-prob 0.0210
burst_hours_detected: small=84 heavy=15
suggested_small_ranges: jobs=1685:6569 duration=1:3 nodes=1:1 cores=2:8
suggested_heavy_ranges: jobs=2:6 duration=132:169 nodes=5:16 cores=64:96
suggested_flags: --wg-burst-small-prob 0.2274 --wg-burst-heavy-prob 0.0210 --wg-burst-small-jobs-min 1685 --wg-burst-small-jobs-max 6569 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 3 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 2 --wg-burst-small-cores-max 8 --wg-burst-heavy-jobs-min 2 --wg-burst-heavy-jobs-max 6 --wg-burst-heavy-duration-min 132 --wg-burst-heavy-duration-max 169 --wg-burst-heavy-nodes-min 5 --wg-burst-heavy-nodes-max 16 --wg-burst-heavy-cores-min 64 --wg-burst-heavy-cores-max 96

=== allusers-main-30.log ===
jobs=1448404 skipped_rows=1
hours_observed=752
arrivals_per_hour: mean=1926.0691 std=3618.8276
duration_hours: mean=0.3007 std=0.8709
nodes: mean=1.0007 std=0.1308
cores: mean=2.9509 std=3.6442
corr(duration, nodes)=0.008865 corr(duration, cores)=-0.027545
cores: mean=2.9427 std=3.2604
corr(duration, nodes)=0.008865 corr(duration, cores)=-0.032620
burst_baseline_jobs_per_hour: small=907.0000 heavy=0.0000
burst_prob_estimates: small_event=0.3697 small_volume=1.0000 heavy_event=0.0000 heavy_volume=0.0000
suggested_flags: --wg-burst-small-prob 0.3697 --wg-burst-heavy-prob 0.0000


NOTE: skipped_rows counts input rows with duration = 0; those rows are excluded from the statistics above.
burst_hours_detected: small=71 heavy=0
suggested_small_ranges: jobs=6084:13961 duration=1:1 nodes=1:1 cores=2:4
suggested_heavy_ranges: jobs=nan:nan duration=nan:nan nodes=nan:nan cores=nan:nan
suggested_flags: --wg-burst-small-prob 0.3697 --wg-burst-heavy-prob 0.0000 --wg-burst-small-jobs-min 6084 --wg-burst-small-jobs-max 13961 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 1 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 2 --wg-burst-small-cores-max 4 --wg-burst-heavy-jobs-min nan --wg-burst-heavy-jobs-max nan --wg-burst-heavy-duration-min nan --wg-burst-heavy-duration-max nan --wg-burst-heavy-nodes-min nan --wg-burst-heavy-nodes-max nan --wg-burst-heavy-cores-min nan --wg-burst-heavy-cores-max nan
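The suggested_flags lines above are a direct serialization of the estimated burst probabilities plus the suggested_small_ranges / suggested_heavy_ranges fields. A minimal sketch of that mapping, assuming a hypothetical ranges_to_flags helper and dict layout (only the flag names come from the log output; the analyzer's actual code may differ):

```python
# Illustrative only: serialize suggested burst ranges into --wg-burst-* flags.
def ranges_to_flags(kind: str, prob: float, ranges: dict[str, tuple[int, int]]) -> str:
    parts = [f"--wg-burst-{kind}-prob {prob:.4f}"]
    for field, (lo, hi) in ranges.items():
        parts.append(f"--wg-burst-{kind}-{field}-min {lo}")
        parts.append(f"--wg-burst-{kind}-{field}-max {hi}")
    return " ".join(parts)

# Reproduces the small-burst flags for allusers-high_mem-30.log:
print(ranges_to_flags("small", 0.1329,
                      {"jobs": (1407, 4689), "duration": (1, 1),
                       "nodes": (1, 1), "cores": (4, 4)}))
```

When no heavy-burst hours are detected, the suggested heavy ranges come out as nan:nan, which is why several suggested_flags lines above carry literal nan values.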
4 changes: 3 additions & 1 deletion src/baseline.py
@@ -13,6 +13,7 @@
)
from src.metrics_tracker import MetricsTracker
from src.reward_calculation import power_cost
from src.config import CORES_PER_NODE


def baseline_step(
@@ -113,4 +114,5 @@ def baseline_step(
baseline_cost_off = power_cost(num_used_nodes, 0, current_price)
env_print(f" > baseline_cost_off: €{baseline_cost_off:.4f} | used nodes: {num_used_nodes}, idle nodes: 0")

return baseline_cost, baseline_cost_off, baseline_next_empty_slot, next_job_id
num_used_cores = num_on_nodes * CORES_PER_NODE - np.sum(baseline_cores_available)
return baseline_cost, baseline_cost_off, baseline_next_empty_slot, next_job_id, num_used_nodes, num_used_cores
6 changes: 5 additions & 1 deletion src/environment.py
@@ -433,6 +433,7 @@ def step(self, action: np.ndarray) -> tuple[dict[str, np.ndarray], float, bool,
self.metrics.episode_running_jobs_counts.append(num_running_jobs)
self.metrics.episode_on_nodes.append(num_on_nodes)
self.metrics.episode_used_nodes.append(num_used_nodes)
self.metrics.episode_used_cores.append(num_used_cores)
self.metrics.episode_job_queue_sizes.append(num_unprocessed_jobs)
self.metrics.episode_price_stats.append(current_price)

@@ -453,7 +454,7 @@
self.env_print(f"[5] Calculating reward...")

# Baseline step
baseline_cost, baseline_cost_off, self.baseline_next_empty_slot, self.next_job_id = baseline_step(
baseline_cost, baseline_cost_off, self.baseline_next_empty_slot, self.next_job_id, baseline_num_used_nodes, baseline_num_used_cores = baseline_step(
self.baseline_state, self.baseline_cores_available, self.baseline_running_jobs,
current_price, new_jobs_count, new_jobs_durations, new_jobs_nodes, new_jobs_cores,
self.baseline_next_empty_slot, self.next_job_id, self.metrics, self.env_print,
@@ -465,6 +466,9 @@
self.metrics.episode_baseline_cost += baseline_cost
self.metrics.episode_baseline_cost_off += baseline_cost_off

self.metrics.episode_baseline_used_nodes.append(baseline_num_used_nodes)
self.metrics.episode_baseline_used_cores.append(baseline_num_used_cores)

step_reward, step_cost, eff_reward_norm, price_reward, idle_penalty_norm, job_age_penalty_norm = self.reward_calculator.calculate(
num_used_nodes, num_idle_nodes, current_price, average_future_price,
num_off_nodes, num_launched_jobs, num_node_changes, job_queue_2d,
3 changes: 3 additions & 0 deletions src/metrics_tracker.py
@@ -81,6 +81,9 @@ def reset_episode_metrics(self) -> None:
# Time series data for plotting (episode)
self.episode_on_nodes: list[int] = []
self.episode_used_nodes: list[int] = []
self.episode_used_cores: list[int] = []
self.episode_baseline_used_nodes: list[int] = []
self.episode_baseline_used_cores: list[int] = []
self.episode_job_queue_sizes: list[int] = []
self.episode_price_stats: list[float] = []

6 changes: 5 additions & 1 deletion test/test_inspect_workloadgen.py
@@ -3,7 +3,7 @@
python -m test.test_inspect_workloadgen --workload-gen poisson --wg-poisson-lambdas4 200,10,6,24 --wg-max-jobs-hour 1500 --hours 336 --plot --wg-burst-small-prob 0.2 --wg-burst-heavy-prob 0.02
"""

# inspect_workloadgen.py
# test_inspect_workloadgen.py
import argparse
import hashlib

@@ -133,6 +135,8 @@ def main():
cpn = [t[3] for t in all_jobs_triplets] if all_jobs_triplets else []

axs[2, 0].hist(durations, bins=50)
if args.wg_burst_heavy_prob > 0.0:
axs[2, 0].set_yscale("log")
axs[2, 0].set_title("Durations (hours)")
axs[2, 0].set_xlabel("duration [h]")
axs[2, 0].set_ylabel("count")
@@ -143,6 +145,8 @@
axs[2, 1].set_ylabel("count")

axs[3, 0].hist(cpn, bins=32)
if args.wg_burst_heavy_prob > 0.0:
axs[3, 0].set_yscale("log")
axs[3, 0].set_title("Cores per node (Jobs shape/Volume)")
axs[3, 0].set_xlabel("cores/node")
axs[3, 0].set_ylabel("count")
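The two set_yscale("log") additions address a plotting issue rather than a data one: when --wg-burst-heavy-prob is small (e.g. 0.02), heavy-burst jobs are a tiny fraction of the sample and their histogram bars vanish on a linear y-axis. A self-contained sketch of the effect, using synthetic data (the distributions below are illustrative assumptions, not the generator's actual output):

```python
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
durations = np.concatenate([
    rng.integers(1, 4, size=98_000),     # common short jobs
    rng.integers(132, 170, size=2_000),  # rare heavy jobs (~2% of the sample)
])

# Same histogram twice: the heavy tail only shows up on the log axis.
fig, axs = plt.subplots(1, 2, figsize=(10, 3))
for ax, scale in zip(axs, ("linear", "log")):
    ax.hist(durations, bins=50)
    ax.set_yscale(scale)
    ax.set_title(f"y-scale: {scale}")
plt.show()
```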
5 changes: 4 additions & 1 deletion train.py
@@ -13,6 +13,7 @@
import pandas as pd
from src.workloadgen import WorkloadGenerator
from src.workloadgen_cli import add_workloadgen_args, build_workloadgen_config
from src.config import MAX_NODES, CORES_PER_NODE, EPISODE_HOURS
import time


@@ -223,7 +224,9 @@ def main():
f"Jobs={env.metrics.jobs_completed}/{env.metrics.jobs_submitted} ({completion_rate:.0f}%), "
f"AvgWait={avg_wait:.1f}h, "
f"EpisodeMaxQueue={env.metrics.episode_max_queue_size_reached}, Dropped={env.metrics.episode_jobs_dropped}, "
f"MaxQueue={env.metrics.max_queue_size_reached}")
f"MaxQueue={env.metrics.max_queue_size_reached}, "
f"Agent Occupancy (Cores)={env.metrics.episode_used_cores[-1]*100/(CORES_PER_NODE*MAX_NODES) if env.metrics.episode_used_cores else 0 :.2f}%, Baseline Occupancy (Cores)={env.metrics.episode_baseline_used_cores[-1]*100/(CORES_PER_NODE*MAX_NODES) if env.metrics.episode_baseline_used_cores else 0 :.2f}%, "
f"Agent Occupancy (Nodes)={env.metrics.episode_used_nodes[-1]*100/MAX_NODES if env.metrics.episode_used_nodes else 0 :.2f}%, Baseline Occupancy (Nodes)={env.metrics.episode_baseline_used_nodes[-1]*100/MAX_NODES if env.metrics.episode_baseline_used_nodes else 0 :.2f}% " )

print(f"\nEvaluation complete! Generated {num_episodes} episodes of cost data.")

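The occupancy fields added to the progress line are percentages of total capacity at the most recent recorded step (hence the [-1] indexing), falling back to 0 before any step has been logged. The inline conditionals are equivalent to a small helper like the following sketch (occupancy_pct is hypothetical, not part of the PR):

```python
def occupancy_pct(series: list[int], capacity: int) -> float:
    """Percent of capacity in use at the most recent step; 0.0 if no data yet."""
    return series[-1] * 100 / capacity if series else 0.0

# Usage matching the f-string above (names taken from the diff):
# agent_cores = occupancy_pct(env.metrics.episode_used_cores, CORES_PER_NODE * MAX_NODES)
# agent_nodes = occupancy_pct(env.metrics.episode_used_nodes, MAX_NODES)
```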