745 changes: 745 additions & 0 deletions analyze_lambda_occupancy.py

Large diffs are not rendered by default.

266 changes: 258 additions & 8 deletions data/workload_statistics/analyze_workload_logs.py

Large diffs are not rendered by default.

44 changes: 28 additions & 16 deletions data/workload_statistics/workload_logs.txt
@@ -4,11 +4,14 @@ hours_observed=788
arrivals_per_hour: mean=4.2119 std=77.5734
duration_hours: mean=7.4852 std=29.6084
nodes: mean=1.0054 std=0.1202
cores: mean=14.9328 std=30.1301
corr(duration, nodes)=0.007262 corr(duration, cores)=0.628780
cores: mean=14.8503 std=30.0671
corr(duration, nodes)=0.007262 corr(duration, cores)=0.629600
burst_baseline_jobs_per_hour: small=0.0000 heavy=0.0000
burst_prob_estimates: small_event=0.0025 small_volume=0.0243 heavy_event=0.0000 heavy_volume=0.0000
suggested_flags: --wg-burst-small-prob 0.0025 --wg-burst-heavy-prob 0.0000
burst_hours_detected: small=11 heavy=0
suggested_small_ranges: jobs=3:767 duration=1:2 nodes=1:1 cores=2:6
suggested_heavy_ranges: jobs=nan:nan duration=nan:nan nodes=nan:nan cores=nan:nan
suggested_flags: --wg-burst-small-prob 0.0025 --wg-burst-heavy-prob 0.0000 --wg-burst-small-jobs-min 3 --wg-burst-small-jobs-max 767 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 2 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 2 --wg-burst-small-cores-max 6 --wg-burst-heavy-jobs-min nan --wg-burst-heavy-jobs-max nan --wg-burst-heavy-duration-min nan --wg-burst-heavy-duration-max nan --wg-burst-heavy-nodes-min nan --wg-burst-heavy-nodes-max nan --wg-burst-heavy-cores-min nan --wg-burst-heavy-cores-max nan

=== allusers-grid-30.log ===
jobs=40639 skipped_rows=1
@@ -20,43 +20,52 @@ cores: mean=7.9984 std=0.1122
corr(duration, nodes)=nan corr(duration, cores)=0.026772
burst_baseline_jobs_per_hour: small=0.0000 heavy=0.0000
burst_prob_estimates: small_event=0.0423 small_volume=0.0752 heavy_event=0.0000 heavy_volume=0.0000
suggested_flags: --wg-burst-small-prob 0.0423 --wg-burst-heavy-prob 0.0000
burst_hours_detected: small=79 heavy=0
suggested_small_ranges: jobs=15:303 duration=1:7 nodes=1:1 cores=8:8
suggested_heavy_ranges: jobs=nan:nan duration=nan:nan nodes=nan:nan cores=nan:nan
suggested_flags: --wg-burst-small-prob 0.0423 --wg-burst-heavy-prob 0.0000 --wg-burst-small-jobs-min 15 --wg-burst-small-jobs-max 303 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 7 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 8 --wg-burst-small-cores-max 8 --wg-burst-heavy-jobs-min nan --wg-burst-heavy-jobs-max nan --wg-burst-heavy-duration-min nan --wg-burst-heavy-duration-max nan --wg-burst-heavy-nodes-min nan --wg-burst-heavy-nodes-max nan --wg-burst-heavy-cores-min nan --wg-burst-heavy-cores-max nan

=== allusers-high_mem-30.log ===
jobs=237373 skipped_rows=1
hours_observed=1385
arrivals_per_hour: mean=171.3884 std=611.6586
duration_hours: mean=0.2537 std=1.8726
nodes: mean=1.0010 std=0.1191
cores: mean=13.6286 std=47.3460
corr(duration, nodes)=0.405630 corr(duration, cores)=0.113031
cores: mean=13.5372 std=45.6885
corr(duration, nodes)=0.405630 corr(duration, cores)=0.041992
burst_baseline_jobs_per_hour: small=129.0000 heavy=0.0000
burst_prob_estimates: small_event=0.1329 small_volume=0.9510 heavy_event=0.0043 heavy_volume=0.0011
suggested_flags: --wg-burst-small-prob 0.1329 --wg-burst-heavy-prob 0.0043
burst_hours_detected: small=47 heavy=2
suggested_small_ranges: jobs=1407:4689 duration=1:1 nodes=1:1 cores=4:4
suggested_heavy_ranges: jobs=2:4 duration=168:169 nodes=10:16 cores=64:64
suggested_flags: --wg-burst-small-prob 0.1329 --wg-burst-heavy-prob 0.0043 --wg-burst-small-jobs-min 1407 --wg-burst-small-jobs-max 4689 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 1 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 4 --wg-burst-small-cores-max 4 --wg-burst-heavy-jobs-min 2 --wg-burst-heavy-jobs-max 4 --wg-burst-heavy-duration-min 168 --wg-burst-heavy-duration-max 169 --wg-burst-heavy-nodes-min 10 --wg-burst-heavy-nodes-max 16 --wg-burst-heavy-cores-min 64 --wg-burst-heavy-cores-max 64

=== allusers-long-30.log ===
jobs=706918 skipped_rows=1
hours_observed=1381
arrivals_per_hour: mean=511.8885 std=1308.5007
duration_hours: mean=3.2965 std=13.0609
nodes: mean=1.0137 std=0.6736
cores: mean=7.5031 std=54.1584
corr(duration, nodes)=0.030925 corr(duration, cores)=0.020810
cores: mean=6.4411 std=10.4744
corr(duration, nodes)=0.030925 corr(duration, cores)=-0.065966
burst_baseline_jobs_per_hour: small=319.0000 heavy=0.0000
burst_prob_estimates: small_event=0.2274 small_volume=1.0000 heavy_event=0.0210 heavy_volume=0.0072
suggested_flags: --wg-burst-small-prob 0.2274 --wg-burst-heavy-prob 0.0210
burst_hours_detected: small=84 heavy=15
suggested_small_ranges: jobs=1685:6569 duration=1:3 nodes=1:1 cores=2:8
suggested_heavy_ranges: jobs=2:6 duration=132:169 nodes=5:16 cores=64:96
suggested_flags: --wg-burst-small-prob 0.2274 --wg-burst-heavy-prob 0.0210 --wg-burst-small-jobs-min 1685 --wg-burst-small-jobs-max 6569 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 3 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 2 --wg-burst-small-cores-max 8 --wg-burst-heavy-jobs-min 2 --wg-burst-heavy-jobs-max 6 --wg-burst-heavy-duration-min 132 --wg-burst-heavy-duration-max 169 --wg-burst-heavy-nodes-min 5 --wg-burst-heavy-nodes-max 16 --wg-burst-heavy-cores-min 64 --wg-burst-heavy-cores-max 96

=== allusers-main-30.log ===
jobs=1448404 skipped_rows=1
hours_observed=752
arrivals_per_hour: mean=1926.0691 std=3618.8276
duration_hours: mean=0.3007 std=0.8709
nodes: mean=1.0007 std=0.1308
cores: mean=2.9509 std=3.6442
corr(duration, nodes)=0.008865 corr(duration, cores)=-0.027545
cores: mean=2.9427 std=3.2604
corr(duration, nodes)=0.008865 corr(duration, cores)=-0.032620
burst_baseline_jobs_per_hour: small=907.0000 heavy=0.0000
burst_prob_estimates: small_event=0.3697 small_volume=1.0000 heavy_event=0.0000 heavy_volume=0.0000
suggested_flags: --wg-burst-small-prob 0.3697 --wg-burst-heavy-prob 0.0000


NOTE: skipped_rows counts input rows with duration = 0; those rows are excluded from the statistics above.
burst_hours_detected: small=71 heavy=0
suggested_small_ranges: jobs=6084:13961 duration=1:1 nodes=1:1 cores=2:4
suggested_heavy_ranges: jobs=nan:nan duration=nan:nan nodes=nan:nan cores=nan:nan
suggested_flags: --wg-burst-small-prob 0.3697 --wg-burst-heavy-prob 0.0000 --wg-burst-small-jobs-min 6084 --wg-burst-small-jobs-max 13961 --wg-burst-small-duration-min 1 --wg-burst-small-duration-max 1 --wg-burst-small-nodes-min 1 --wg-burst-small-nodes-max 1 --wg-burst-small-cores-min 2 --wg-burst-small-cores-max 4 --wg-burst-heavy-jobs-min nan --wg-burst-heavy-jobs-max nan --wg-burst-heavy-duration-min nan --wg-burst-heavy-duration-max nan --wg-burst-heavy-nodes-min nan --wg-burst-heavy-nodes-max nan --wg-burst-heavy-cores-min nan --wg-burst-heavy-cores-max nan
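The suggested_flags lines above are a direct serialization of the estimated burst probabilities plus the suggested_small_ranges / suggested_heavy_ranges fields. A minimal sketch of that mapping, assuming a hypothetical ranges_to_flags helper and dict layout (only the flag names come from the log output; the analyzer's actual code may differ):

```python
# Illustrative only: serialize suggested burst ranges into --wg-burst-* flags.
def ranges_to_flags(kind: str, prob: float, ranges: dict[str, tuple[int, int]]) -> str:
    parts = [f"--wg-burst-{kind}-prob {prob:.4f}"]
    for field, (lo, hi) in ranges.items():
        parts.append(f"--wg-burst-{kind}-{field}-min {lo}")
        parts.append(f"--wg-burst-{kind}-{field}-max {hi}")
    return " ".join(parts)

# Reproduces the small-burst flags for allusers-high_mem-30.log:
print(ranges_to_flags("small", 0.1329,
                      {"jobs": (1407, 4689), "duration": (1, 1),
                       "nodes": (1, 1), "cores": (4, 4)}))
```

When no heavy-burst hours are detected, the suggested heavy ranges come out as nan:nan, which is why several suggested_flags lines above carry literal nan values.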
4 changes: 3 additions & 1 deletion src/baseline.py
@@ -13,6 +13,7 @@
)
from src.metrics_tracker import MetricsTracker
from src.reward_calculation import power_cost
from src.config import CORES_PER_NODE


def baseline_step(
@@ -113,4 +114,5 @@ def baseline_step(
baseline_cost_off = power_cost(num_used_nodes, 0, current_price)
env_print(f" > baseline_cost_off: €{baseline_cost_off:.4f} | used nodes: {num_used_nodes}, idle nodes: 0")

return baseline_cost, baseline_cost_off, baseline_next_empty_slot, next_job_id
num_used_cores = num_on_nodes * CORES_PER_NODE - np.sum(baseline_cores_available)
return baseline_cost, baseline_cost_off, baseline_next_empty_slot, next_job_id, num_used_nodes, num_used_cores
6 changes: 5 additions & 1 deletion src/environment.py
@@ -433,6 +433,7 @@ def step(self, action: np.ndarray) -> tuple[dict[str, np.ndarray], float, bool,
self.metrics.episode_running_jobs_counts.append(num_running_jobs)
self.metrics.episode_on_nodes.append(num_on_nodes)
self.metrics.episode_used_nodes.append(num_used_nodes)
self.metrics.episode_used_cores.append(num_used_cores)
self.metrics.episode_job_queue_sizes.append(num_unprocessed_jobs)
self.metrics.episode_price_stats.append(current_price)

@@ -453,7 +454,7 @@
self.env_print(f"[5] Calculating reward...")

# Baseline step
baseline_cost, baseline_cost_off, self.baseline_next_empty_slot, self.next_job_id = baseline_step(
baseline_cost, baseline_cost_off, self.baseline_next_empty_slot, self.next_job_id, baseline_num_used_nodes, baseline_num_used_cores = baseline_step(
self.baseline_state, self.baseline_cores_available, self.baseline_running_jobs,
current_price, new_jobs_count, new_jobs_durations, new_jobs_nodes, new_jobs_cores,
self.baseline_next_empty_slot, self.next_job_id, self.metrics, self.env_print,
@@ -465,6 +466,9 @@
self.metrics.episode_baseline_cost += baseline_cost
self.metrics.episode_baseline_cost_off += baseline_cost_off

self.metrics.episode_baseline_used_nodes.append(baseline_num_used_nodes)
self.metrics.episode_baseline_used_cores.append(baseline_num_used_cores)

step_reward, step_cost, eff_reward_norm, price_reward, idle_penalty_norm, job_age_penalty_norm = self.reward_calculator.calculate(
num_used_nodes, num_idle_nodes, current_price, average_future_price,
num_off_nodes, num_launched_jobs, num_node_changes, job_queue_2d,
3 changes: 3 additions & 0 deletions src/metrics_tracker.py
@@ -81,6 +81,9 @@ def reset_episode_metrics(self) -> None:
# Time series data for plotting (episode)
self.episode_on_nodes: list[int] = []
self.episode_used_nodes: list[int] = []
self.episode_used_cores: list[int] = []
self.episode_baseline_used_nodes: list[int] = []
self.episode_baseline_used_cores: list[int] = []
self.episode_job_queue_sizes: list[int] = []
self.episode_price_stats: list[float] = []

6 changes: 5 additions & 1 deletion test/test_inspect_workloadgen.py
@@ -3,7 +3,7 @@
python -m test.test_inspect_workloadgen --workload-gen poisson --wg-poisson-lambdas4 200,10,6,24 --wg-max-jobs-hour 1500 --hours 336 --plot --wg-burst-small-prob 0.2 --wg-burst-heavy-prob 0.02
"""

# inspect_workloadgen.py
# test_inspect_workloadgen.py
import argparse
import hashlib

@@ -133,6 +135,8 @@ def main():
cpn = [t[3] for t in all_jobs_triplets] if all_jobs_triplets else []

axs[2, 0].hist(durations, bins=50)
if args.wg_burst_heavy_prob > 0.0:
axs[2, 0].set_yscale("log")
axs[2, 0].set_title("Durations (hours)")
axs[2, 0].set_xlabel("duration [h]")
axs[2, 0].set_ylabel("count")
@@ -143,6 +145,8 @@
axs[2, 1].set_ylabel("count")

axs[3, 0].hist(cpn, bins=32)
if args.wg_burst_heavy_prob > 0.0:
axs[3, 0].set_yscale("log")
axs[3, 0].set_title("Cores per node (Jobs shape/Volume)")
axs[3, 0].set_xlabel("cores/node")
axs[3, 0].set_ylabel("count")
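The two set_yscale("log") additions address a plotting issue rather than a data one: when --wg-burst-heavy-prob is small (e.g. 0.02), heavy-burst jobs are a tiny fraction of the sample and their histogram bars vanish on a linear y-axis. A self-contained sketch of the effect, using synthetic data (the distributions below are illustrative assumptions, not the generator's actual output):

```python
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
durations = np.concatenate([
    rng.integers(1, 4, size=98_000),     # common short jobs
    rng.integers(132, 170, size=2_000),  # rare heavy jobs (~2% of the sample)
])

# Same histogram twice: the heavy tail only shows up on the log axis.
fig, axs = plt.subplots(1, 2, figsize=(10, 3))
for ax, scale in zip(axs, ("linear", "log")):
    ax.hist(durations, bins=50)
    ax.set_yscale(scale)
    ax.set_title(f"y-scale: {scale}")
plt.show()
```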
5 changes: 4 additions & 1 deletion train.py
@@ -13,6 +13,7 @@
import pandas as pd
from src.workloadgen import WorkloadGenerator
from src.workloadgen_cli import add_workloadgen_args, build_workloadgen_config
from src.config import MAX_NODES, CORES_PER_NODE, EPISODE_HOURS
import time


@@ -223,7 +224,9 @@ def main():
f"Jobs={env.metrics.jobs_completed}/{env.metrics.jobs_submitted} ({completion_rate:.0f}%), "
f"AvgWait={avg_wait:.1f}h, "
f"EpisodeMaxQueue={env.metrics.episode_max_queue_size_reached}, Dropped={env.metrics.episode_jobs_dropped}, "
f"MaxQueue={env.metrics.max_queue_size_reached}")
f"MaxQueue={env.metrics.max_queue_size_reached}, "
f"Agent Occupancy (Cores)={env.metrics.episode_used_cores[-1]*100/(CORES_PER_NODE*MAX_NODES) if env.metrics.episode_used_cores else 0 :.2f}%, Baseline Occupancy (Cores)={env.metrics.episode_baseline_used_cores[-1]*100/(CORES_PER_NODE*MAX_NODES) if env.metrics.episode_baseline_used_cores else 0 :.2f}%, "
f"Agent Occupancy (Nodes)={env.metrics.episode_used_nodes[-1]*100/MAX_NODES if env.metrics.episode_used_nodes else 0 :.2f}%, Baseline Occupancy (Nodes)={env.metrics.episode_baseline_used_nodes[-1]*100/MAX_NODES if env.metrics.episode_baseline_used_nodes else 0 :.2f}% " )

print(f"\nEvaluation complete! Generated {num_episodes} episodes of cost data.")

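The occupancy fields added to the progress line are percentages of total capacity at the most recent recorded step (hence the [-1] indexing), falling back to 0 before any step has been logged. The inline conditionals are equivalent to a small helper like the following sketch (occupancy_pct is hypothetical, not part of the PR):

```python
def occupancy_pct(series: list[int], capacity: int) -> float:
    """Percent of capacity in use at the most recent step; 0.0 if no data yet."""
    return series[-1] * 100 / capacity if series else 0.0

# Usage matching the f-string above (names taken from the diff):
# agent_cores = occupancy_pct(env.metrics.episode_used_cores, CORES_PER_NODE * MAX_NODES)
# agent_nodes = occupancy_pct(env.metrics.episode_used_nodes, MAX_NODES)
```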