from obnb.data import GOBP, BioGRID
from obnb.dataset import Dataset as OBNBDataset
from obnb.label.filters import Compose, EntityExistenceFilter, LabelsetRangeFilterSize, LabelsetRangeFilterSplit, NegativeGeneratorHypergeom
from obnb.label.split import RatioPartition
from obnb.util.converter import GenePropertyConverter
# Label collection (GOBP) and network (BioGRID), both loaded with root "data".
function_labels = GOBP(root="data")
network = BioGRID(root="data")

# Converter for the "PubMedCount" gene property; it is handed to the
# partition below as its ``property_converter``.
pubmedcnt_converter = GenePropertyConverter(root="data", name="PubMedCount")

# Ratio partition with ratios 0.6 / 0.2 / 0.2, ordered by the converted
# property in descending order (``ascending=False``).
# NOTE(review): presumably these are train/validation/test ratios -- confirm.
sb_splitter = RatioPartition(
    0.6,
    0.2,
    0.2,
    ascending=False,
    property_converter=pubmedcnt_converter,
)

# p-value threshold passed to NegativeGeneratorHypergeom in make_dataset.
negatives_p_thresh = 0.05
def make_dataset(graph, labels, splitter):
    """Filter ``labels`` in place against ``graph`` and build a dataset.

    Args:
        graph: Network object; its ``node_ids`` restrict the label entities
            and ``to_dense_graph().to_feature()`` supplies the feature.
        labels: Labelset collection. It is modified in place by ``iapply``,
            so the caller's object is changed as well.
        splitter: Splitter used both by the per-split size filter and by the
            returned dataset.

    Returns:
        An ``OBNBDataset`` built from ``graph``, the derived feature, the
        filtered ``labels`` and ``splitter``, using the ``'Node2Vec'``
        transform and ``resolve=True``.
    """
    # NOTE(review): in the traceback reported with this snippet, the failing
    # lookup is ``lsc.entity[lsc.get_negative(label_id)]`` inside
    # LabelsetRangeFilterSplit, i.e. a negative ID that is not among the
    # collection's entities. The root cause is not visible here -- confirm
    # against the obnb filter implementations before reordering anything.
    network_genes = list(graph.node_ids)
    label_filter = Compose(
        # Keep only genes that are present in the network.
        EntityExistenceFilter(network_genes),
        # Drop labelsets with fewer than 15 network genes.
        LabelsetRangeFilterSize(min_val=15),
        # Select negatives with a hypergeometric test.
        NegativeGeneratorHypergeom(p_thresh=negatives_p_thresh),
        # Require at least 5 positive examples in every split.
        LabelsetRangeFilterSplit(min_val=5, splitter=splitter),
    )
    labels.iapply(label_filter)

    dense_feature = graph.to_dense_graph().to_feature()
    return OBNBDataset(
        graph=graph,
        feature=dense_feature,
        transform='Node2Vec',
        label=labels,
        splitter=splitter,
        resolve=True,
    )
# Build the dataset from the network, the function labels and the
# property-based ratio splitter defined above.
gobp_sb = make_dataset(
    graph=network,
    labels=function_labels,
    splitter=sb_splitter,
)
---------------------------------------------------------------------------
IDNotExistError Traceback (most recent call last)
Cell In[8], line 1
----> 1 gobp_sb = make_dataset(graph=network, labels=function_labels, splitter=sb_splitter)
Cell In[7], line 8, in make_dataset(graph, labels, splitter)
7 def make_dataset(graph, labels, splitter):
----> 8 labels.iapply(
9 Compose(
10 # Only use genes that are present in the network
11 EntityExistenceFilter(list(graph.node_ids)),
12 # Remove any labelsets with less than 15 network genes
13 LabelsetRangeFilterSize(min_val=15),
14 # Selective negatives using hyper-geom test
15 NegativeGeneratorHypergeom(p_thresh=negatives_p_thresh),
16 # Make sure each split has at least 5 positive examples
17 LabelsetRangeFilterSplit(min_val=5, splitter=splitter),
18 ),
19 )
20 return OBNBDataset(
21 graph=graph,
22 feature=graph.to_dense_graph().to_feature(),
(...)
25 splitter=splitter,
26 resolve=True)
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/collection.py:492, in LabelsetCollection.iapply(self, filter_func, progress_bar)
486 def iapply(self, filter_func, progress_bar: bool = False):
487 """Apply filter to labelsets inplace.
488
489 This is a shortcut for calling self.apply(filter_func, inplace=True).
490
491 """
--> 492 self.apply(filter_func, inplace=True, progress_bar=progress_bar)
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/collection.py:483, in LabelsetCollection.apply(self, filter_func, inplace, progress_bar)
481 checkers.checkType("inplace", bool, inplace)
482 obj = self if inplace else self.copy()
--> 483 filter_func(obj, progress_bar)
484 return obj
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/filters/base.py:113, in Compose.__call__(self, lsc, progress_bar)
111 def __call__(self, lsc, progress_bar):
112 for filter_ in self.filters:
--> 113 filter_.__call__(lsc, progress_bar)
114 self.logger.info(lsc.stats())
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/filters/base.py:81, in BaseFilter.__call__(self, lsc, progress_bar)
79 pbar = tqdm(entity_ids, desc=f"{self!r}", disable=not progress_bar)
80 for entity_id in pbar:
---> 81 if self.criterion(val_getter(entity_id)):
82 mod_fun(entity_id)
83 self.logger.debug(
84 f"Modification ({self.mod_name}) criterion met for "
85 f"{entity_id!r}",
86 )
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/filters/range_filter.py:166, in LabelsetRangeFilterSplit.get_val_getter.<locals>.val_getter(label_id)
164 def val_getter(label_id):
165 y_all, masks = lsc.split(self.splitter, **self.kwargs)
--> 166 neg_idx = lsc.entity[lsc.get_negative(label_id)]
167 self.logger.debug(f"{label_id = } {neg_idx = }")
168 # TODO: make label_ids to index mapping?
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:87, in IDlst.__getitem__(self, identifier)
85 return self._getitem_sinlge(identifier)
86 elif isinstance(identifier, Iterable):
---> 87 return self._getitem_multiple(identifier)
88 else:
89 raise TypeError(
90 f"ID key(s) must be string or iterables of string, "
91 f"not {type(identifier)!r}",
92 )
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:102, in IDlst._getitem_multiple(self, identifiers)
100 idx_lst = []
101 for identifier in identifiers:
--> 102 idx_lst.append(self._getitem_sinlge(identifier))
103 return np.array(idx_lst)
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:202, in IDmap._getitem_sinlge(self, identifier)
201 def _getitem_sinlge(self, identifier):
--> 202 self._check_ID_existence(identifier, True)
203 return self._map[identifier]
File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:111, in IDlst._check_ID_existence(self, identifier, existence)
109 raise IDExistsError(f"Existing ID {identifier!r}")
110 elif existence & (identifier not in self):
--> 111 raise IDNotExistError(f"Unknown ID {identifier!r}")
IDNotExistError: Unknown ID '5557'
code:
error:
I have tested multiple versions of the above function. Without
`NegativeGeneratorHypergeom` it works fine, but any time this filter is included it raises the error, even if it is the only filter applied.