Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ poetry run python3 ic_topology/main.py
"enforce_sev_constraint": false,
"enforce_health_constraint": false,
"enforce_blacklist_constraint": true,
"enforce_per_node_provider_assignation": false
"enforce_per_node_provider_assignation": false,
"spare_node_ratio": 0.0
}
```

Expand All @@ -62,6 +63,7 @@ poetry run python3 ic_topology/main.py
| `enforce_health_constraint`| `bool` | If `true`, only healthy nodes (not `DOWN` or `DEGRADED`) will be considered for allocation. |
| `enforce_blacklist_constraint`| `bool` | If `true`, blacklisted nodes will be excluded from all subnet assignments. |
| `enforce_per_node_provider_assignation`| `bool` | If `true` each node provider will have at least one of their nodes assigned to a subnet if they have more than 4 nodes. |
| `spare_node_ratio` | `float` | Value between `0.0` and `1.0`. If not `0.0`, each node provider keeps that fraction of its nodes in each data center as spares. For example, if a node provider has 10 nodes in a data center and `spare_node_ratio` is `0.1`, at most 9 of them will be assigned and one is kept as a spare. |

---

Expand Down
1,446 changes: 1,446 additions & 0 deletions data/network_data/current_nodes_20251029_125331.csv

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions tests/spare_capacity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import unittest

from tests.test_utils import (
NetworkData,
NodeEntry,
TopologyEntry,
execute_min_synthetic_nodes_scenario,
)


class SpareCapacityTestScenarios(unittest.TestCase):
    """Scenarios exercising the ``spare_node_ratio`` option of the optimizer."""

    def test_rebalance_if_possible(self):
        """A 10% spare ratio must swap exactly one DFINITY node for another provider's node."""
        topology = (
            TopologyEntry()
            .with_subnet_type("NNS")
            .with_subnet_id("subnet")
            .with_size(10)
            .with_country_limit(10)
            .with_dc_limit(10)
            .with_dc_provider_limit(10)
            .with_node_provider_limit(10)
        )

        # Ten DFINITY nodes that already fill the subnet.
        np_1 = [
            NodeEntry()
            .with_provider_name("DFINITY")
            .with_country("AA")
            .with_data_center("aa1")
            .with_owner("owner")
            .in_subnet("subnet")
            for _ in range(10)
        ]

        # Five unassigned nodes of another provider in the same data center.
        np_2 = [
            NodeEntry()
            .with_provider_name("Other np")
            .with_country("AA")
            .with_owner("owner")
            .with_data_center("aa1")
            for _ in range(5)
        ]

        # Initially don't enable the spare capacity feature
        network_data = (
            NetworkData()
            .with_extend_nodes(np_1)
            .with_extend_nodes(np_2)
            .with_topology_entry(topology)
            # Needed to evade the default special limit of exactly 3 dfinity nodes in NNS
            .with_special_limit("subnet", "node_provider", "DFINITY", 1000, "lt")
        )

        output, status = execute_min_synthetic_nodes_scenario(network_data.build())

        self.assertEqual(status, "Optimal")
        # There should be no swaps because the current topology is
        # optimal
        for changes in output:
            self.assertEqual(len(changes["added"]), 0)
            self.assertEqual(len(changes["removed"]), 0)

        # Now enable the feature and try again
        network_data = network_data.with_spare_node_ratio(0.1)

        output, status = execute_min_synthetic_nodes_scenario(network_data.build())
        self.assertEqual(status, "Optimal")

        only_swap = output[0]
        added = only_swap["added"]
        removed = only_swap["removed"]

        self.assertEqual(len(added), 1)
        self.assertEqual(len(removed), 1)

        self.assertEqual(added[0]["node_provider"], "Other np")
        self.assertEqual(removed[0]["node_provider"], "DFINITY")

    def test_failure_if_ratio_outside_bounds(self):
        """Ratios outside [0, 1] must be rejected with a ValueError."""
        topology = (
            TopologyEntry()
            .with_subnet_type("NNS")
            .with_subnet_id("subnet")
            .with_size(3)
        )

        nodes = [
            NodeEntry()
            .with_provider_name("DFINITY")
            .in_subnet("subnet")
            .with_country("aa")
            for _ in range(3)
        ]

        network_data = (
            NetworkData().with_extend_nodes(nodes).with_topology_entry(topology)
        )

        ratios_to_test = [-15, 15, -0.4, 1.2]

        for ratio in ratios_to_test:
            # subTest reports every offending ratio instead of stopping at the first.
            with self.subTest(ratio=ratio):
                network_data = network_data.with_spare_node_ratio(ratio)

                with self.assertRaisesRegex(
                    ValueError,
                    "Spare node ratio has to be a float between 0 and 1",
                ):
                    execute_min_synthetic_nodes_scenario(network_data.build())


if __name__ == "__main__":
unittest.main()
17 changes: 13 additions & 4 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from typing import List, Dict, Self, Any
import uuid
from topology_optimizer.utils import ALLOWED_FEATURES, parse_solver_result
from typing import Any, Dict, List, Self

import pandas as pd

from topology_optimizer.data_preparation import prepare_data
from topology_optimizer.linear_solver import (
solver_model_minimize_swaps,
ATTRIBUTE_NAMES,
solver_model_minimize_swaps,
)
import pandas as pd
from topology_optimizer.utils import ALLOWED_FEATURES, parse_solver_result


class TopologyEntry:
Expand Down Expand Up @@ -195,6 +197,7 @@ class NetworkData:
_enforce_health_constraint: bool
_enforce_blacklist_constraint: bool
_enforce_per_node_provider_assignation: bool
_spare_node_ratio: float

_cluster_scenario: Dict[str, List[str]]
_cluster_scenario_name: str
Expand All @@ -206,6 +209,7 @@ def __init__(self):
self._enforce_blacklist_constraint = False
self._enforce_per_node_provider_assignation = False
self._enforce_health_constraint = False
self._spare_node_ratio = 1.0

self._nodes = list()
self._pipeline = list()
Expand Down Expand Up @@ -235,6 +239,10 @@ def enforce_per_node_provider_assignation(self) -> Self:
self._enforce_per_node_provider_assignation = True
return self

def with_spare_node_ratio(self, ratio) -> Self:
self._spare_node_ratio = ratio
return self

def with_synthetic_countries(self, num_countries: int) -> Self:
self._number_synthetic_countries = num_countries
return self
Expand Down Expand Up @@ -350,6 +358,7 @@ def build(self) -> dict[str, Any]:
special_limits=self._special_limits
if len(self._special_limits) > 0
else None,
spare_node_ratio=self._spare_node_ratio,
)


Expand Down
5 changes: 3 additions & 2 deletions topology_optimizer/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"nodes_file": "./data/network_data/current_nodes_20250819_110856.csv",
"nodes_file": "./data/network_data/current_nodes_20251029_125331.csv",
"topology_file": "./data/topology/prod_topology.csv",
"node_pipeline_file": "./data/node_pipelines/11_new_subnets.csv",
"blacklist_file": "./data/blacklist.yaml",
Expand All @@ -9,5 +9,6 @@
"enforce_sev_constraint": false,
"enforce_health_constraint": true,
"enforce_blacklist_constraint": true,
"enforce_per_node_provider_assignation": true
"enforce_per_node_provider_assignation": true,
"spare_node_ratio": 0.0
}
7 changes: 5 additions & 2 deletions topology_optimizer/data_preparation.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import Any, Dict, List

import pandas as pd
from typing import Dict, Any, List

from topology_optimizer.utils import (
create_node_dataframe,
generate_synthetic_countries,
generate_synthetic_nodes,
get_existing_assignment,
post_process_node_providers,
mark_blacklisted_nodes,
post_process_node_providers,
)


Expand All @@ -24,6 +25,7 @@ def prepare_data(
cluster_scenario_name: str,
enforce_per_node_provider_assignation: bool,
sev_node_providers: List[str],
spare_node_ratio: bool,
special_limits: dict[int, dict[str, dict[str, (int, str)]]] = None,
) -> Dict[str, Any]:
# Remove everything that is not a replica
Expand Down Expand Up @@ -98,6 +100,7 @@ def prepare_data(
"current_assignment": current_assignment,
"enforce_per_node_provider_assignation": enforce_per_node_provider_assignation,
"special_limits": special_limits,
"spare_node_ratio": spare_node_ratio,
}


Expand Down
76 changes: 69 additions & 7 deletions topology_optimizer/linear_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,24 @@

"""

import logging
import math
import tempfile

import pandas as pd
from pulp import (
PULP_CBC_CMD,
LpBinary,
LpInteger,
LpMinimize,
LpProblem,
LpStatus,
LpVariable,
LpInteger,
LpBinary,
lpSum,
LpMinimize,
value,
LpStatus,
)
import pandas as pd

from topology_optimizer.utils import get_subnet_limit
from pulp import PULP_CBC_CMD
import logging

# Standard attribute types to optimize over
ATTRIBUTE_NAMES = [
Expand Down Expand Up @@ -142,6 +145,65 @@ def add_node_constraints(network_data, model):
if network_data.get("enforce_per_node_provider_assignation", False):
add_per_node_provider_constraint(model, network_data)

spare_node_ratio = network_data.get("spare_node_ratio")
if spare_node_ratio < 0 or spare_node_ratio > 1:
raise ValueError(
f"Spare node ratio has to be a float between 0 and 1. Got: {spare_node_ratio}"
)
if spare_node_ratio != 0:
enforce_spare_nodes(model, network_data, spare_node_ratio)


def enforce_spare_nodes(model, network_data, spare_node_ratio):
    """
    Cap the number of allocated nodes per node provider and data center.

    For every (data center, node provider) pair, the provider's nodes in that
    data center that are available and not blacklisted are counted. At most
    ``floor(count * (1 - spare_node_ratio))`` of them may be allocated across
    all subnets. Pairs whose cap would be 0 get no constraint at all and a
    message is printed instead.

    Args:
        model: Mapping with ``"prob"`` (the problem the constraints are added
            to) and ``"node_allocations"`` (indexed as ``[node][subnet]``).
        network_data: Mapping with ``"data_center_list"``,
            ``"data_center_indices"``, ``"node_df"`` and ``"subnet_indices"``.
        spare_node_ratio: Fraction of nodes to keep spare; expected to be
            within ``[0, 1]``.
    """
    prob = model["prob"]
    node_alloc = model["node_allocations"]
    data_centers_list = network_data["data_center_list"]
    node_df = network_data["node_df"]
    subnet_indices = network_data["subnet_indices"]

    nodes_to_use_ratio = 1 - spare_node_ratio

    for dc in network_data["data_center_indices"]:
        dc_name = data_centers_list[dc]
        in_dc = node_df["data_center"] == dc_name

        providers_in_dc = (
            node_df.loc[in_dc, "original_node_provider"].dropna().unique().tolist()
        )

        for provider in providers_in_dc:
            dc_nodes = node_df[
                in_dc
                & (node_df["original_node_provider"] == provider)
                & (node_df["is_available"])
                & (~node_df["is_blacklisted"])
            ].index

            # Round away binary floating-point noise before flooring: e.g.
            # 10 * (1 - 0.9) evaluates to 0.9999999999999998, which would
            # otherwise floor to 0 and leave the provider without any cap.
            max_allowed_allocations_per_dc_per_np = math.floor(
                round(len(dc_nodes) * nodes_to_use_ratio, 9)
            )

            if max_allowed_allocations_per_dc_per_np == 0:
                # Implicit concatenation keeps source indentation out of the message.
                print(
                    f"Skipping sparing node provider {provider} in dc {dc_name} "
                    f"because they would have 0 nodes allowed after sparing {spare_node_ratio} nodes. "
                    f"Currently they have {len(dc_nodes)} in that data center."
                )
                continue

            prob += (
                lpSum(
                    node_alloc[node][subnet]
                    for node in dc_nodes
                    for subnet in subnet_indices
                )
                <= max_allowed_allocations_per_dc_per_np,
                f"AtLeastOneSpareNodePerDC_{dc_name}_PerNP_{provider}",
            )


def add_per_node_provider_constraint(model, network_data):
"""
Expand Down
Loading
Loading