Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/remote-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
max-parallel: 4
fail-fast: false
matrix:
python-version: ['3.10', '3.11', '3.12', '3.13']
python-version: ['3.11', '3.12', '3.13']

steps:
- name: Checkout
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
max-parallel: 4
fail-fast: false
matrix:
python-version: ['3.10', '3.11', '3.12', '3.13']
python-version: ['3.11', '3.12', '3.13']

steps:
- name: Checkout
Expand Down
175 changes: 66 additions & 109 deletions pyenzyme/fetcher/chebi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
"""

import re
from typing import Dict, List, Optional
from typing import List, Optional

import httpx
from pydantic import BaseModel, ConfigDict, RootModel
from pydantic import BaseModel, ConfigDict, Field

from pyenzyme.versions import v2

Expand All @@ -24,73 +24,61 @@ def __init__(self, message: str, cause: Optional[Exception] = None):
self.cause = cause


class ChEBIStructure(BaseModel):
"""Chemical structure information."""

model_config = ConfigDict(extra="ignore")

smiles: Optional[str] = None
standard_inchi: Optional[str] = None
standard_inchi_key: Optional[str] = None


class ChEBIEntryData(BaseModel):
"""Core data structure for a ChEBI entry."""
class ChebiSearchSource(BaseModel):
"""Source data structure from ChEBI search API result."""

model_config = ConfigDict(extra="ignore")

chebi_accession: str
name: Optional[str] = None
ascii_name: str
default_structure: Optional[ChEBIStructure] = None


class ChEBIEntryResult(BaseModel):
"""Individual ChEBI entry result."""

model_config = ConfigDict(extra="ignore")

standardized_chebi_id: str
data: ChEBIEntryData


class ChEBIApiResponse(RootModel[Dict[str, ChEBIEntryResult]]):
"""Top-level response structure from ChEBI API. Maps ChEBI IDs to their corresponding entry data."""

root: Dict[str, ChEBIEntryResult]
smiles: Optional[str] = None
inchi: Optional[str] = None
inchikey: Optional[str] = None
definition: Optional[str] = None
formula: Optional[str] = None
charge: Optional[int] = None
mass: Optional[float] = None
monoisotopicmass: Optional[float] = None
stars: Optional[int] = None
default_structure: Optional[int] = None
structures: Optional[List[int]] = None


class ChebiSearchResult(BaseModel):
"""Individual search result structure."""

model_config = ConfigDict(extra="ignore")
model_config = ConfigDict(extra="ignore", populate_by_name=True)

_source: Dict[str, str] # Contains chebi_accession field
source: ChebiSearchSource = Field(alias="_source")


class ChebiSearchResponse(BaseModel):
"""Search response structure from ChEBI search API."""

results: List[ChebiSearchResult]
total: int
number_pages: int


class ChEBIClient:
"""Client for accessing the ChEBI API to fetch chemical entity data."""

BASE_URL = "https://www.ebi.ac.uk/chebi/backend/api/public/compounds/"
SEARCH_URL = "https://www.ebi.ac.uk/chebi/backend/api/public/es_search/"

def __init__(self):
"""Initialize the ChEBI client."""
pass

def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
def get_entry_by_id(self, chebi_id: str) -> ChebiSearchSource:
"""
Fetch a ChEBI entry by its ID.
Fetch a ChEBI entry by its ID using the search API.

Args:
chebi_id: The ChEBI ID to fetch, can be with or without the 'CHEBI:' prefix

Returns:
ChEBIEntryResult object with the parsed response data
ChebiSearchSource object with the parsed response data

Raises:
ChEBIError: If the ChEBI ID is invalid or not found
Expand All @@ -101,21 +89,18 @@ def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:

try:
with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
url = self.BASE_URL.format(chebi_id)
response = client.get(url)
params = {"term": chebi_id, "page": "1", "size": "1"}
response = client.get(self.SEARCH_URL, params=params)
response.raise_for_status()

if response.status_code == 200:
try:
raw_response_data = response.json()
search_response = ChebiSearchResponse(**response.json())

if not raw_response_data or len(raw_response_data) == 0:
if not search_response.results or len(search_response.results) == 0:
raise ChEBIError(f"No data found for ChEBI ID {chebi_id}")

chebi_response = ChEBIApiResponse(raw_response_data)

entry = list(chebi_response.root.values())[0]
return entry
return search_response.results[0].source

except Exception as e:
if isinstance(e, ChEBIError):
Expand All @@ -127,15 +112,15 @@ def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
except httpx.HTTPStatusError as e:
raise ChEBIError(f"Failed to fetch ChEBI ID {chebi_id}: {str(e)}", e)

def get_entries_batch(self, chebi_ids: List[str]) -> List[ChEBIEntryResult]:
def get_entries_batch(self, chebi_ids: List[str]) -> List[ChebiSearchSource]:
"""
Fetch multiple ChEBI entries by their IDs.
Fetch multiple ChEBI entries by their IDs using the search API.

Args:
chebi_ids: List of ChEBI IDs to fetch

Returns:
List of ChEBIEntryResult objects with data from ChEBI
List of ChebiSearchSource objects with data from ChEBI

Raises:
ChEBIError: If any ChEBI ID is invalid or not found
Expand All @@ -144,74 +129,51 @@ def get_entries_batch(self, chebi_ids: List[str]) -> List[ChEBIEntryResult]:
if not chebi_ids:
return []

formatted_ids = []
results = []
for chebi_id in chebi_ids:
if not chebi_id.startswith("CHEBI:"):
formatted_ids.append(f"CHEBI:{chebi_id}")
else:
formatted_ids.append(chebi_id)

try:
with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
url = self.BASE_URL.format(chebi_id)
response = client.get(url)
response.raise_for_status()

if response.status_code == 200:
try:
raw_response_data = response.json()
chebi_response = ChEBIApiResponse(raw_response_data)
return list(chebi_response.root.values())

except Exception as e:
raise ChEBIError(
f"Failed to parse ChEBI batch response: {str(e)}", e
)
else:
raise ChEBIError(f"HTTP {response.status_code}: {response.text}")
try:
entry = self.get_entry_by_id(chebi_id)
results.append(entry)
except ChEBIError as e:
# Continue with other IDs even if one fails
raise ChEBIError(f"Failed to fetch ChEBI ID {chebi_id}: {str(e)}", e)

except httpx.HTTPStatusError as e:
raise ChEBIError(f"Failed to fetch ChEBI batch: {str(e)}", e)
return results

def search_entries(
self, query: str, size: Optional[int] = None
) -> List[ChEBIEntryResult]:
self, query: str, size: Optional[int] = None, page: int = 1
) -> List[ChebiSearchSource]:
"""
Search for ChEBI entries by query string.

Args:
query: The search query string to find ChEBI entries
size: The maximum number of search results to return
page: The page number to retrieve (default: 1)

Returns:
List of ChEBIEntryResult objects for matching entries
List of ChebiSearchSource objects for matching entries

Raises:
ChEBIError: If the search request fails or the API is unavailable
"""
params = {"term": query}
params = {"term": query, "page": str(page)}
if size:
params["size"] = str(size)

try:
with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
url = self.SEARCH_URL
response = client.get(url, params=params)
response = client.get(self.SEARCH_URL, params=params)
response.raise_for_status()

if response.status_code == 200:
try:
search_results = ChebiSearchResponse(**response.json())
search_response = ChebiSearchResponse(**response.json())

if not search_results.results:
if not search_response.results:
return []

chebi_ids = [
result._source["chebi_accession"]
for result in search_results.results
]

return self.get_entries_batch(chebi_ids)
return [result.source for result in search_response.results]

except Exception as e:
if isinstance(e, ChEBIError):
Expand All @@ -226,44 +188,39 @@ def search_entries(
raise ChEBIError(f"Failed to search ChEBI: {str(e)}", e)


def process_chebi_entry(entry: ChEBIEntryResult) -> v2.SmallMolecule:
def process_search_result(source: ChebiSearchSource) -> v2.SmallMolecule:
"""
Process a ChEBI entry result and convert it to a SmallMolecule object.
Process a ChEBI search result source and convert it to a SmallMolecule object.

Args:
entry: The ChEBI entry result from the API
source: The ChEBI search result source from the API

Returns:
A SmallMolecule object with mapped data
"""
smallmol_id = process_id(entry.data.ascii_name)

structure = entry.data.default_structure
canonical_smiles = structure.smiles if structure else None
inchi = structure.standard_inchi if structure else None
inchikey = structure.standard_inchi_key if structure else None
smallmol_id = process_id(source.ascii_name)

small_molecule = v2.SmallMolecule(
id=smallmol_id,
name=entry.data.ascii_name,
canonical_smiles=canonical_smiles,
inchi=inchi,
inchikey=inchikey,
name=source.ascii_name,
canonical_smiles=source.smiles,
inchi=source.inchi,
inchikey=source.inchikey,
constant=False,
vessel_id=None,
synonymous_names=[],
references=[
f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={entry.standardized_chebi_id}"
f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={source.chebi_accession}"
],
)

small_molecule.add_type_term(
term=f"OBO:{entry.standardized_chebi_id.replace(':', '_')}",
term=f"OBO:{source.chebi_accession.replace(':', '_')}",
prefix="OBO",
iri="http://purl.obolibrary.org/obo/",
)

small_molecule.ld_id = f"OBO:{entry.standardized_chebi_id.replace(':', '_')}"
small_molecule.ld_id = f"OBO:{source.chebi_accession.replace(':', '_')}"

return small_molecule

Expand All @@ -290,9 +247,9 @@ def fetch_chebi(
"""
try:
client = ChEBIClient()
chebi_entry = client.get_entry_by_id(chebi_id)
chebi_source = client.get_entry_by_id(chebi_id)

small_molecule = process_chebi_entry(chebi_entry)
small_molecule = process_search_result(chebi_source)

if smallmol_id is not None:
small_molecule.id = smallmol_id
Expand Down Expand Up @@ -325,9 +282,9 @@ def fetch_chebi_batch(chebi_ids: List[str]) -> List[v2.SmallMolecule]:
return []

client = ChEBIClient()
chebi_entries = client.get_entries_batch(chebi_ids)
chebi_sources = client.get_entries_batch(chebi_ids)

return [process_chebi_entry(entry) for entry in chebi_entries]
return [process_search_result(source) for source in chebi_sources]


def search_chebi(query: str, size: Optional[int] = None) -> List[v2.SmallMolecule]:
Expand Down Expand Up @@ -355,9 +312,9 @@ def search_chebi(query: str, size: Optional[int] = None) -> List[v2.SmallMolecul
atp_results = search_chebi('ATP', 5)
"""
client = ChEBIClient()
chebi_entries = client.search_entries(query, size)
chebi_sources = client.search_entries(query, size)

return [process_chebi_entry(entry) for entry in chebi_entries]
return [process_search_result(source) for source in chebi_sources]


def process_id(name: str) -> str:
Expand Down
3 changes: 0 additions & 3 deletions pyenzyme/fetcher/pubchem.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,6 @@ def from_cid(cid: int) -> PubChemQuery:
response = client.get(url)
response.raise_for_status()

if response.status_code != 200:
raise ValueError(f"Failed to fetch PubChem data for CID {cid}")

return PubChemQuery(**response.json())

@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "pyenzyme"
version = "2.2.1"
description = "A Python library for EnzymeML"
authors = [{ name = "Jan Range", email = "range.jan@web.de" }]
requires-python = ">=3.10,<4"
requires-python = ">=3.11,<4"
readme = "README.md"
license = { text = "BSD-2-Clause" }
dependencies = [
Expand All @@ -17,7 +17,7 @@ dependencies = [
"fastobo>=0.13.0,<0.14",
"pymetadata>=0.5.3,<0.6",
"httpx>=0.27",
"mdmodels>=0.2.1,<0.3",
"mdmodels>=0.2.4,<0.3",
"joblib>=1.5.0,<2",
"bokeh>=3.7.3,<4",
"matplotlib>=3.10,<4",
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/test_fetcher.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import httpx
import pytest

from pyenzyme.fetcher.chebi import fetch_chebi
Expand Down Expand Up @@ -155,7 +156,7 @@ def test_fetch_pubchem_to_small_molecule_with_prefix(self):

@pytest.mark.remote
def test_fetch_pubchem_to_small_molecule_invalid_id(self):
with pytest.raises(ValueError):
with pytest.raises(httpx.HTTPStatusError):
fetch_pubchem(cid="162176127617627")

@pytest.mark.remote
Expand Down
Loading