From 75d70eeaad30d5173021574d5bfcac36047514c2 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Feb 2026 15:15:18 -0500 Subject: [PATCH 1/9] Refactor project structure and update CI configurations - modernization for a new python --- .github/workflows/black.yml | 6 ++- .github/workflows/cli-coverage.yml | 18 +++---- .github/workflows/python-publish.yml | 24 ++++----- .github/workflows/run-pytest.yml | 19 +++----- .pre-commit-config.yaml | 20 -------- MANIFEST.in | 8 --- bbconf/__init__.py | 4 +- bbconf/_version.py | 1 - bbconf/modules/bedsets.py | 2 +- pyproject.toml | 73 ++++++++++++++++++++++++++++ requirements/requirements-all.txt | 19 -------- requirements/requirements-dev.txt | 1 - requirements/requirements-test.txt | 11 ----- setup.py | 63 ------------------------ 14 files changed, 105 insertions(+), 164 deletions(-) delete mode 100644 .pre-commit-config.yaml delete mode 100644 MANIFEST.in delete mode 100644 bbconf/_version.py create mode 100644 pyproject.toml delete mode 100644 requirements/requirements-all.txt delete mode 100644 requirements/requirements-dev.txt delete mode 100644 requirements/requirements-test.txt delete mode 100644 setup.py diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index d842ae33..093667e9 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -8,4 +8,8 @@ jobs: steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 - - uses: psf/black@stable + with: + python-version: "3.12" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . 
diff --git a/.github/workflows/cli-coverage.yml b/.github/workflows/cli-coverage.yml index 5d355745..d590bf2c 100644 --- a/.github/workflows/cli-coverage.yml +++ b/.github/workflows/cli-coverage.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: python-version: ["3.12"] - os: [ ubuntu-latest ] # can't use macOS when using service containers or container jobs + os: [ubuntu-latest] # can't use macOS when using service containers or container jobs runs-on: ${{ matrix.os }} services: postgres: @@ -24,23 +24,17 @@ jobs: options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: '3.12' - name: Install uv run: pip install uv - - name: Install dev dependencies - run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt --system; fi - - - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then uv pip install -r requirements/requirements-test.txt --system; fi - - - name: Install package - run: uv pip install . 
--system + - name: Install package with test dependencies + run: uv pip install ".[test]" --system - name: Run tests run: coverage run -m pytest @@ -55,4 +49,4 @@ jobs: SMOKESHOW_GITHUB_CONTEXT: coverage SMOKESHOW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SMOKESHOW_GITHUB_PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} - SMOKESHOW_AUTH_KEY: ${{ secrets.SMOKESHOW_AUTH_KEY }} \ No newline at end of file + SMOKESHOW_AUTH_KEY: ${{ secrets.SMOKESHOW_AUTH_KEY }} diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 2516e668..ac1d79b1 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -6,24 +6,24 @@ on: jobs: deploy: - runs-on: ubuntu-latest name: upload release to PyPI permissions: - id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + contents: read + id-token: write steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - run: | - python setup.py sdist bdist_wheel + + - name: Install build dependencies + run: python -m pip install --upgrade pip build + + - name: Build package + run: python -m build + - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 5b594a08..3abcaa95 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -27,23 +27,18 @@ jobs: - 5432:5432 options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 + - uses: 
actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install uv run: pip install uv - - name: Install dev dependencies - run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt --system; fi - - - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then uv pip install -r requirements/requirements-test.txt --system; fi - - - name: Install package - run: uv pip install . --system + - name: Install package with test dependencies + run: uv pip install ".[test]" --system - name: Run pytest tests - run: pytest tests -x -vv \ No newline at end of file + run: pytest tests -x -vv diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index ab5489e2..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,20 +0,0 @@ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 - hooks: - - id: trailing-whitespace - - id: check-yaml - - id: end-of-file-fixer - - id: requirements-txt-fixer - - id: trailing-whitespace - - - repo: https://github.com/PyCQA/isort - rev: 5.7.0 - hooks: - - id: isort - args: ["--profile", "black"] - - - repo: https://github.com/psf/black - rev: 20.8b1 - hooks: - - id: black diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index a6f1b9c0..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,8 +0,0 @@ -include README.md -include LICENSE.txt -include requirements/* -include bbconf/schemas/* -include bbconf/modules/* -include bbconf/config_parser/* -include bbconf/models/* -include bbconf/utils/* \ No newline at end of file diff --git a/bbconf/__init__.py b/bbconf/__init__.py index c225ce3c..3640d299 100644 --- a/bbconf/__init__.py +++ b/bbconf/__init__.py @@ -3,11 +3,9 @@ import coloredlogs from bbconf.bbagent import BedBaseAgent - -from ._version import __version__ from .const import PKG_NAME 
-__all__ = ["BedBaseAgent", "__version__"] +__all__ = ["BedBaseAgent"] _LOGGER = logging.getLogger(PKG_NAME) coloredlogs.install( diff --git a/bbconf/_version.py b/bbconf/_version.py deleted file mode 100644 index e1130451..00000000 --- a/bbconf/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.14.7" diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 6179ee98..10ee01c5 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -107,7 +107,7 @@ def get_plots(self, identifier: str) -> BedSetPlots: raise BedSetNotFoundError(f"Bed file with id: {identifier} not found.") bedset_files = BedSetPlots() for result in bedset_object.files: - if result.name in bedset_files.model_fields: + if result.name in BedSetPlots.model_fields: setattr( bedset_files, result.name, diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..3eef88c4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,73 @@ +[project] +name = "bbconf" +version = "0.14.7" +description = "Configuration and data management tool for BEDbase" +readme = "README.md" +license = "BSD-2-Clause" +requires-python = ">=3.10" +authors = [ + { name = "Oleksandr Khoroshevskyi", email = "bnt4me@virginia.edu" }, +] +keywords = ["bioinformatics", "genomics", "BED", "configuration"] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "yacman == 0.9.4", + "sqlalchemy >= 2.0.0", + "gtars >= 0.5.3", + "geniml[ml] >= 0.8.4", + "psycopg >= 3.1.15", + "coloredlogs", + "pydantic >= 2.9.0", + "botocore >= 1.34.0, < 1.36.0", + "boto3 >= 1.34.54, < 1.36.0", + "pephubclient >= 0.4.5", + "sqlalchemy_schemadisplay", + "zarr < 3.0.0", + "pyyaml >= 6.0.1", + "s3fs >= 2024.3.1", + "pandas < 3.0.0", + "pybiocfilecache == 0.6.1", + 
"umap-learn >= 0.5.8", + "qdrant_client >= 1.16.1", +] + +[project.urls] +Homepage = "https://github.com/databio/bbconf" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov", + "pytest-mock", + "python-dotenv", + "coverage", + "smokeshow", +] +dev = [ + "ruff", +] + +[tool.pytest.ini_options] +addopts = "-rfE" +testpaths = ["tests"] + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "I"] +ignore = ["F403", "F405", "E501"] + +[tool.ruff.lint.isort] +known-first-party = ["bbconf"] diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index 5ea0d7a8..00000000 --- a/requirements/requirements-all.txt +++ /dev/null @@ -1,19 +0,0 @@ -yacman == 0.9.4 -sqlalchemy >= 2.0.0 -gtars >= 0.5.3 -geniml[ml] >= 0.8.4 -psycopg >= 3.1.15 -coloredlogs -pydantic >= 2.9.0 -botocore >= 1.34.0, < 1.36.0 -boto3 >= 1.34.54, < 1.36.0 -pephubclient >= 0.4.5 -sqlalchemy_schemadisplay -zarr < 3.0.0 -pyyaml >= 6.0.1 # for s3fs because of the errors -s3fs >= 2024.3.1 -pandas < 3.0.0 -pybiocfilecache == 0.6.1 -umap-learn >= 0.5.8 -qdrant_client >= 1.16.1 -setuptools>=82.0.0 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt deleted file mode 100644 index 8b137891..00000000 --- a/requirements/requirements-dev.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt deleted file mode 100644 index 65d4db9f..00000000 --- a/requirements/requirements-test.txt +++ /dev/null @@ -1,11 +0,0 @@ -black -ruff -pytest -python-dotenv -pytest-mock -flake8 -coveralls -pytest-cov -pre-commit -coverage -smokeshow \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 7936fae9..00000000 --- a/setup.py +++ /dev/null @@ -1,63 +0,0 @@ -#! 
/usr/bin/env python - -import os -import sys - -from setuptools import setup - -PACKAGE = "bbconf" - -# Additional keyword arguments for setup(). -extra = {} - -# Ordinary dependencies -DEPENDENCIES = [] -with open("requirements/requirements-all.txt", "r") as reqs_file: - for line in reqs_file: - if not line.strip(): - continue - DEPENDENCIES.append(line) - -extra["install_requires"] = DEPENDENCIES - -with open("{}/_version.py".format(PACKAGE), "r") as versionfile: - version = versionfile.readline().split()[-1].strip("\"'\n") - -# Handle the pypi README formatting. -try: - import pypandoc - - long_description = pypandoc.convert_file("README.md", "rst") -except (IOError, ImportError, OSError): - long_description = open("README.md").read() - -setup( - name=PACKAGE, - packages=[PACKAGE], - version=version, - description="Configuration package for bedbase project", - long_description=long_description, - long_description_content_type="text/markdown", - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Topic :: Scientific/Engineering :: Bio-Informatics", - ], - keywords="", - url="https://databio.org", - author="Michal Stolarczyk, Oleksandr Khoroshevskyi", - author_email="khorosh@virginia.edu", - license="BSD2", - package_data={PACKAGE: [os.path.join(PACKAGE, "*")]}, - include_package_data=True, - test_suite="tests", - tests_require=(["pytest"]), - setup_requires=( - ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] - ), - **extra, -) From a11f5c4f57ce01bc2505a0754f6a23e07be978fe Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Feb 2026 15:41:07 -0500 Subject: [PATCH 2/9] Modernize - phase 4 - types update --- bbconf/__init__.py | 1 + bbconf/bbagent.py | 21 +++-- bbconf/config_parser/bedbaseconfig.py | 67 ++++++++------- 
bbconf/config_parser/models.py | 25 +++--- bbconf/config_parser/utils.py | 34 +++----- bbconf/db_utils.py | 24 +++--- bbconf/models/base_models.py | 63 +++++++------- bbconf/models/bed_models.py | 119 +++++++++++++------------- bbconf/models/bedset_models.py | 27 +++--- bbconf/models/drs_models.py | 21 +++-- bbconf/modules/bedfiles.py | 55 ++++++------ bbconf/modules/bedsets.py | 13 ++- bbconf/modules/objects.py | 10 +-- pyproject.toml | 2 + tests/conftest.py | 2 - tests/test_common.py | 1 - 16 files changed, 236 insertions(+), 249 deletions(-) diff --git a/bbconf/__init__.py b/bbconf/__init__.py index 3640d299..7452621d 100644 --- a/bbconf/__init__.py +++ b/bbconf/__init__.py @@ -3,6 +3,7 @@ import coloredlogs from bbconf.bbagent import BedBaseAgent + from .const import PKG_NAME __all__ = ["BedBaseAgent"] diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 19b8d3a5..166dbf3c 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -2,11 +2,10 @@ import statistics from functools import cached_property from pathlib import Path -from typing import Dict, List, Union import numpy as np -from sqlalchemy.orm import Session from sqlalchemy.engine import ScalarResult +from sqlalchemy.orm import Session from sqlalchemy.sql import and_, distinct, func, or_, select from bbconf.config_parser.bedbaseconfig import BedBaseConfig @@ -18,11 +17,11 @@ Files, GeoGsmStatus, License, + ReferenceGenome, UsageBedMeta, UsageBedSetMeta, UsageFiles, UsageSearch, - ReferenceGenome, ) from bbconf.models.base_models import ( AllFilesInfo, @@ -43,10 +42,10 @@ _LOGGER = logging.getLogger(PKG_NAME) -class BedBaseAgent(object): +class BedBaseAgent: def __init__( self, - config: Union[Path, str], + config: Path | str, init_ml: bool = True, ): """ @@ -339,7 +338,7 @@ def get_detailed_usage(self) -> UsageStats: bed_downloads=bed_downloads, ) - def get_list_genomes(self) -> List[str]: + def get_list_genomes(self) -> list[str]: """ Get list of genomes from the database @@ -354,7 +353,7 @@ def 
get_list_genomes(self) -> List[str]: genomes = session.execute(statement).all() return [result[0] for result in genomes if result[0]] - def get_list_assays(self): + def get_list_assays(self) -> list[str]: """ Get list of genomes from the database @@ -371,7 +370,7 @@ def get_list_assays(self): return [result[0] for result in results if result[0]] @cached_property - def list_of_licenses(self) -> List[str]: + def list_of_licenses(self) -> list[str]: """ Get list of licenses from the database @@ -497,7 +496,7 @@ def add_usage(self, stats: UsageModel) -> None: session.commit() - def _stats_comments(self, sa_session: Session) -> Dict[str, int]: + def _stats_comments(self, sa_session: Session) -> dict[str, int]: """ Get statistics about comments that are present in bed files. @@ -551,7 +550,7 @@ def _stats_comments(self, sa_session: Session) -> Dict[str, int]: "header_comments": header_comments, } - def _stats_geo_status(self, sa_session: Session) -> Dict[str, int]: + def _stats_geo_status(self, sa_session: Session) -> dict[str, int]: """ Get statistics about status of GEO bed file processing. @@ -801,7 +800,7 @@ def _get_geo_stats(self, sa_session: Session) -> GEOStatistics: ), ) - def get_reference_genomes(self) -> Dict[str, str]: + def get_reference_genomes(self) -> dict[str, str]: """ Get mapping of genome aliases to reference genome names. 
diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index a37c5bcc..06b823a5 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -3,7 +3,7 @@ import os import warnings from pathlib import Path -from typing import List, Literal, Union +from typing import Literal import boto3 import joblib @@ -12,6 +12,7 @@ import s3fs import yacman import zarr +from botocore.client import BaseClient from botocore.exceptions import BotoCoreError, EndpointConnectionError from fastembed import TextEmbedding from geniml.region2vec.main import Region2VecExModel @@ -47,12 +48,12 @@ _LOGGER = logging.getLogger(PKG_NAME) -class BedBaseConfig(object): +class BedBaseConfig: """ Class to handle BEDbase configuration file and create objects for different modules. """ - def __init__(self, config: Union[Path, str], init_ml: bool = True): + def __init__(self, config: Path | str, init_ml: bool = True): """ Initialize BedBaseConfig object @@ -75,19 +76,19 @@ def __init__(self, config: Union[Path, str], init_ml: bool = True): if init_ml: self.dense_encoder: TextEmbedding = self._init_dense_encoder() - self.sparse_encoder: Union[SparseEncoder, None] = self._init_sparce_model() - self.umap_encoder: Union[UMAP, None] = self._init_umap_model() - self.r2v_encoder: Union[Region2VecExModel, None] = self._init_r2v_encoder() + self.sparse_encoder: SparseEncoder | None = self._init_sparce_model() + self.umap_encoder: UMAP | None = self._init_umap_model() + self.r2v_encoder: Region2VecExModel | None = self._init_r2v_encoder() self._init_qdrant_hybrid( qdrant_cl=self.qdrant_client, dense_encoder=self.dense_encoder, ) - self.qdrant_file_backend: Union[QdrantBackend, None] = ( + self.qdrant_file_backend: QdrantBackend | None = ( self._init_qdrant_file_backend(qdrant_cl=self.qdrant_client) ) # used for bivec search - self._qdrant_text_backend: Union[QdrantBackend, None] = ( + self._qdrant_text_backend: QdrantBackend | None = ( 
self._init_qdrant_text_backend( qdrant_cl=self.qdrant_client, dense_encoder=self.dense_encoder, @@ -111,7 +112,7 @@ def __init__(self, config: Union[Path, str], init_ml: bool = True): self.r2v_encoder = None self.b2b_search_interface = None self.bivec_search_interface = None - self.umap_encoder: Union[UMAP, None] = None + self.umap_encoder: UMAP | None = None self.sparse_encoder = None self._phc = self._init_pephubclient() @@ -178,7 +179,7 @@ def boto3_client(self) -> boto3.client: return self._boto3_client @property - def zarr_root(self) -> Union[Z_GROUP, None]: + def zarr_root(self) -> Z_GROUP | None: """ Get zarr root object (Group) @@ -243,7 +244,7 @@ def _init_qdrant_client(self) -> QdrantClient: def _init_qdrant_file_backend( self, qdrant_cl: QdrantClient - ) -> Union[QdrantBackend, None]: + ) -> QdrantBackend | None: """ Create qdrant client object using credentials provided in config file @@ -255,7 +256,7 @@ def _init_qdrant_file_backend( if not isinstance(qdrant_cl, QdrantClient): _LOGGER.error( - f"Unable to create Qdrant bivec file collection, qdrant client is None." + "Unable to create Qdrant bivec file collection, qdrant client is None." ) return None @@ -270,7 +271,7 @@ def _init_qdrant_file_backend( def _init_qdrant_text_backend( self, qdrant_cl: QdrantClient, dense_encoder: TextEmbedding - ) -> Union[QdrantBackend, None]: + ) -> QdrantBackend | None: """ Create qdrant client text embedding object using credentials provided in config file @@ -283,12 +284,12 @@ def _init_qdrant_text_backend( if not isinstance(qdrant_cl, QdrantClient): _LOGGER.error( - f"Unable to create Qdrant bivec text collection, qdrant client is None." + "Unable to create Qdrant bivec text collection, qdrant client is None." ) return None if not isinstance(dense_encoder, TextEmbedding): _LOGGER.error( - f"Unable to create Qdrant bivec text collection, dense encoder is None." + "Unable to create Qdrant bivec text collection, dense encoder is None." 
) return None @@ -318,12 +319,12 @@ def _init_qdrant_hybrid( if not isinstance(qdrant_cl, QdrantClient): _LOGGER.error( - f"Unable to create Qdrant hybrid collection, qdrant client is None." + "Unable to create Qdrant hybrid collection, qdrant client is None." ) return None if not isinstance(dense_encoder, TextEmbedding): _LOGGER.error( - f"Unable to create Qdrant hybrid collection, dense encoder is None." + "Unable to create Qdrant hybrid collection, dense encoder is None." ) return None @@ -385,7 +386,7 @@ def _init_bivec_interface( qdrant_file_backend: QdrantBackend, qdrant_text_backend: QdrantBackend, text_encoder: TextEmbedding, - ) -> Union[BiVectorSearchInterface, None]: + ) -> BiVectorSearchInterface | None: """ Create BiVectorSearchInterface object using credentials provided in config file @@ -409,8 +410,8 @@ def _init_bivec_interface( def _init_b2b_search_interface( self, qdrant_file_backend: QdrantBackend, - region_encoder: Union[Region2VecExModel, str], - ) -> Union[BED2BEDSearchInterface, None]: + region_encoder: Region2VecExModel | str, + ) -> BED2BEDSearchInterface | None: """ Create Bed 2 BED search interface and return this object @@ -430,7 +431,7 @@ def _init_b2b_search_interface( ) return None - def _init_r2v_encoder(self) -> Union[Region2VecExModel, None]: + def _init_r2v_encoder(self) -> Region2VecExModel | None: """ Create Region2VecExModel object using credentials provided in config file """ @@ -446,7 +447,7 @@ def _init_r2v_encoder(self) -> Union[Region2VecExModel, None]: ) return None - def _init_dense_encoder(self) -> Union[None, TextEmbedding]: + def _init_dense_encoder(self) -> TextEmbedding | None: """ Initialize dense model from the specified path or huggingface model hub """ @@ -457,7 +458,7 @@ def _init_dense_encoder(self) -> Union[None, TextEmbedding]: dense_encoder = TextEmbedding(self.config.path.text2vec) return dense_encoder - def _init_sparce_model(self) -> Union[None, SparseEncoder]: + def _init_sparce_model(self) -> 
SparseEncoder | None: """ Initialize SparseEncoder model from the specified path or huggingface model hub """ @@ -472,7 +473,7 @@ def _init_sparce_model(self) -> Union[None, SparseEncoder]: return None return sparse_encoder - def _init_umap_model(self) -> Union[UMAP, None]: + def _init_umap_model(self) -> UMAP | None: """ Load UMAP model from the specified path, or url """ @@ -492,7 +493,7 @@ def _init_umap_model(self) -> Union[UMAP, None]: response.raise_for_status() buffer = io.BytesIO(response.content) umap_model = joblib.load(buffer) - print(f"UMAP model loaded from URL: {model_path}") + _LOGGER.info(f"UMAP model loaded from URL: {model_path}") except requests.RequestException as e: _LOGGER.error(f"Error downloading UMAP model from URL: {e}") return None @@ -500,7 +501,7 @@ def _init_umap_model(self) -> Union[UMAP, None]: try: with open(model_path, "rb") as file: umap_model = joblib.load(file) - print(f"UMAP model loaded from local path: {model_path}") + _LOGGER.info(f"UMAP model loaded from local path: {model_path}") except FileNotFoundError as e: _LOGGER.error(f"Error loading UMAP model from local path: {e}") return None @@ -514,7 +515,7 @@ def _init_umap_model(self) -> Union[UMAP, None]: def _init_boto3_client( self, - ) -> Union[boto3.client, None]: + ) -> BaseClient | None: """ Create Pephub client object using credentials provided in config file @@ -532,7 +533,7 @@ def _init_boto3_client( warnings.warn(f"Error in creating boto3 client object: {e}", UserWarning) return None - def upload_s3(self, file_path: str, s3_path: Union[Path, str]) -> None: + def upload_s3(self, file_path: str, s3_path: Path | str) -> None: """ Upload file to s3. 
@@ -555,10 +556,10 @@ def upload_s3(self, file_path: str, s3_path: Union[Path, str]) -> None: def upload_files_s3( self, identifier: str, - files: Union[BedFiles, BedPlots, BedSetPlots], + files: BedFiles | BedPlots | BedSetPlots, base_path: str, type: Literal["files", "plots", "bedsets"] = "files", - ) -> Union[BedFiles, BedPlots, BedSetPlots]: + ) -> BedFiles | BedPlots | BedSetPlots: """ Upload files to s3. @@ -635,7 +636,7 @@ def delete_s3(self, s3_path: str) -> None: "Could not delete file from s3. Connection error." ) - def delete_files_s3(self, files: List[FileModel]) -> None: + def delete_files_s3(self, files: list[FileModel]) -> None: """ Delete files from s3. @@ -649,7 +650,7 @@ def delete_files_s3(self, files: List[FileModel]) -> None: return None @staticmethod - def _init_pephubclient() -> Union[PEPHubClient, None]: + def _init_pephubclient() -> PEPHubClient | None: """ Create Pephub client object using credentials provided in config file @@ -681,7 +682,7 @@ def get_prefixed_uri(self, postfix: str, access_id: str) -> str: _LOGGER.error(f"Access method {access_id} is not defined.") raise BadAccessMethodError(f"Access method {access_id} is not defined.") - def construct_access_method_list(self, rel_path: str) -> List[AccessMethod]: + def construct_access_method_list(self, rel_path: str) -> list[AccessMethod]: """ Construct access method list for a given record diff --git a/bbconf/config_parser/models.py b/bbconf/config_parser/models.py index 3fe3fbe2..6b5e9f61 100644 --- a/bbconf/config_parser/models.py +++ b/bbconf/config_parser/models.py @@ -1,6 +1,5 @@ import logging from pathlib import Path -from typing import Optional, Union from pydantic import BaseModel, ConfigDict, computed_field, field_validator from yacman import load_yaml @@ -35,7 +34,7 @@ class ConfigDB(BaseModel): password: str database: str = DEFAULT_DB_NAME dialect: str = DEFAULT_DB_DIALECT - driver: Optional[str] = DEFAULT_DB_DRIVER + driver: str | None = DEFAULT_DB_DRIVER model_config = 
ConfigDict(extra="forbid") @@ -53,10 +52,10 @@ def url(self) -> str: class ConfigQdrant(BaseModel): host: str port: int = DEFAULT_QDRANT_PORT - api_key: Optional[str] = None + api_key: str | None = None file_collection: str = DEFAULT_QDRANT_FILE_COLLECTION_NAME - text_collection: Optional[str] = DEFAULT_QDRANT_BIVEC_COLLECTION_NAME - hybrid_collection: Optional[str] = DEFAULT_QDRANT_HYBRID_COLLECTION_NAME + text_collection: str | None = DEFAULT_QDRANT_BIVEC_COLLECTION_NAME + hybrid_collection: str | None = DEFAULT_QDRANT_HYBRID_COLLECTION_NAME class ConfigServer(BaseModel): @@ -69,7 +68,7 @@ class ConfigPath(BaseModel): # vec2vec: str = DEFAULT_VEC2VEC_MODEL text2vec: str = DEFAULT_TEXT2VEC_MODEL sparse_model: str = DEFAULT_SPARSE_MODEL - umap_model: Union[str, None] = None # Path or link to pre-trained UMAP model + umap_model: str | None = None # Path or link to pre-trained UMAP model class AccessMethodsStruct(BaseModel): @@ -85,10 +84,10 @@ class AccessMethods(BaseModel): class ConfigS3(BaseModel): - endpoint_url: Union[str, None] = None - aws_access_key_id: Union[str, None] = None - aws_secret_access_key: Union[str, None] = None - bucket: Union[str, None] = DEFAULT_S3_BUCKET + endpoint_url: str | None = None + aws_access_key_id: str | None = None + aws_secret_access_key: str | None = None + bucket: str | None = DEFAULT_S3_BUCKET @field_validator("aws_access_key_id", "aws_secret_access_key") def validate_aws_credentials(cls, value): @@ -120,9 +119,9 @@ def modify_access(self) -> bool: class ConfigPepHubClient(BaseModel): - namespace: Union[str, None] = DEFAULT_PEPHUB_NAMESPACE - name: Union[str, None] = DEFAULT_PEPHUB_NAME - tag: Union[str, None] = DEFAULT_PEPHUB_TAG + namespace: str | None = DEFAULT_PEPHUB_NAMESPACE + name: str | None = DEFAULT_PEPHUB_NAME + tag: str | None = DEFAULT_PEPHUB_TAG class ConfigFile(BaseModel): diff --git a/bbconf/config_parser/utils.py b/bbconf/config_parser/utils.py index 45d55bde..2e255dfc 100644 --- 
a/bbconf/config_parser/utils.py +++ b/bbconf/config_parser/utils.py @@ -1,11 +1,15 @@ +import logging + import yacman from pephubclient.helpers import MessageHandler as m from pydantic_core._pydantic_core import ValidationError from bbconf.config_parser.models import ConfigFile -from bbconf.exceptions import BedBaseConfError +from bbconf.const import PKG_NAME from bbconf.helpers import get_bedbase_cfg +_LOGGER = logging.getLogger(PKG_NAME) + def config_analyzer(config_path: str) -> bool: """ @@ -17,7 +21,7 @@ def config_analyzer(config_path: str) -> bool: """ config_path = get_bedbase_cfg(config_path) - print(f"Analyzing the configuration file {config_path}...") + _LOGGER.info(f"Analyzing the configuration file {config_path}...") _config = yacman.YAMLConfigManager(filepath=config_path).exp @@ -27,35 +31,23 @@ def config_analyzer(config_path: str) -> bool: config_dict[field_name] = annotation.annotation(**_config.get(field_name)) except TypeError: if annotation.is_required(): - print( - str( - BedBaseConfError( - f"`Config info: {field_name}` Field is not set in the configuration file or missing. " - ) - ) + _LOGGER.error( + f"`Config info: {field_name}` Field is not set in the configuration file or missing." ) else: - print( + _LOGGER.info( f"Config info: `{field_name}` Field is not set in the configuration file. Using default value." ) try: config_dict[field_name] = None except ValidationError as e: - print( - str( - BedBaseConfError( - f"Error in provided configuration file. Section: `{field_name}` missing values :: \n {e}" - ) - ) + _LOGGER.error( + f"Error in provided configuration file. Section: `{field_name}` missing values :: \n {e}" ) return False except ValidationError as e: - print( - str( - BedBaseConfError( - f"Error in provided configuration file. Section: `{field_name}` missing values :: \n {e}" - ) - ) + _LOGGER.error( + f"Error in provided configuration file. 
Section: `{field_name}` missing values :: \n {e}" ) return False diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 971f2e1e..9baba10a 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -1,6 +1,6 @@ import datetime import logging -from typing import List, Optional +from typing import Optional import pandas as pd from sqlalchemy import ( @@ -117,11 +117,11 @@ class Bed(Base): ) is_universe: Mapped[Optional[bool]] = mapped_column(default=False) - files: Mapped[List["Files"]] = relationship( + files: Mapped[list["Files"]] = relationship( "Files", back_populates="bedfile", cascade="all, delete-orphan" ) - bedsets: Mapped[List["BedFileBedSetRelation"]] = relationship( + bedsets: Mapped[list["BedFileBedSetRelation"]] = relationship( "BedFileBedSetRelation", back_populates="bedfile", cascade="all, delete-orphan" ) @@ -144,7 +144,7 @@ class Bed(Base): ) license_mapping: Mapped["License"] = relationship("License", back_populates="bed") - ref_classifier: Mapped[List["GenomeRefStats"]] = relationship( + ref_classifier: Mapped[list["GenomeRefStats"]] = relationship( "GenomeRefStats", back_populates="bed", cascade="all, delete-orphan" ) processed: Mapped[bool] = mapped_column( @@ -337,10 +337,10 @@ class BedSets(Base): JSON, comment="Median values of the bedset" ) - bedfiles: Mapped[List["BedFileBedSetRelation"]] = relationship( + bedfiles: Mapped[list["BedFileBedSetRelation"]] = relationship( "BedFileBedSetRelation", back_populates="bedset", cascade="all, delete-orphan" ) - files: Mapped[List["Files"]] = relationship("Files", back_populates="bedset") + files: Mapped[list["Files"]] = relationship("Files", back_populates="bedset") universe: Mapped["Universes"] = relationship("Universes", back_populates="bedset") author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset") @@ -413,7 +413,7 @@ class License(Base): nullable=False, comment="License description" ) - bed: Mapped[List["Bed"]] = relationship("Bed", back_populates="license_mapping") 
+ bed: Mapped[list["Bed"]] = relationship("Bed", back_populates="license_mapping") class ReferenceGenome(Base): @@ -424,7 +424,7 @@ class ReferenceGenome(Base): nullable=False, comment="Name of the reference genome" ) - bed_reference: Mapped[List["GenomeRefStats"]] = relationship( + bed_reference: Mapped[list["GenomeRefStats"]] = relationship( "GenomeRefStats", back_populates="genome_object", cascade="all, delete-orphan", @@ -501,7 +501,7 @@ class GeoGseStatus(Base): number_of_skips: Mapped[int] = mapped_column(default=0, comment="Number of skips") number_of_fails: Mapped[int] = mapped_column(default=0, comment="Number of fails") - gsm_status_mapper: Mapped[List["GeoGsmStatus"]] = relationship( + gsm_status_mapper: Mapped[list["GeoGsmStatus"]] = relationship( "GeoGsmStatus", back_populates="gse_status_mapper" ) error: Mapped[str] = mapped_column(nullable=True, comment="Error message") @@ -607,10 +607,10 @@ def __init__( host: str = "localhost", port: int = 5432, database: str = "bedbase", - user: str = None, - password: str = None, + user: str | None = None, + password: str | None = None, drivername: str = POSTGRES_DIALECT, - dsn: str = None, + dsn: str | None = None, echo: bool = False, ): """ diff --git a/bbconf/models/base_models.py b/bbconf/models/base_models.py index 8e104927..ef3efd59 100644 --- a/bbconf/models/base_models.py +++ b/bbconf/models/base_models.py @@ -1,5 +1,4 @@ import datetime -from typing import Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field @@ -8,14 +7,14 @@ class FileModel(BaseModel): name: str - title: Optional[str] = None + title: str | None = None path: str - file_digest: Optional[str] = None - path_thumbnail: Optional[Union[str, None]] = Field(None, alias="thumbnail_path") - description: Optional[str] = None - size: Optional[int] = None - object_id: Optional[str] = None - access_methods: List[AccessMethod] = None + file_digest: str | None = None + path_thumbnail: str | None = Field(None, 
alias="thumbnail_path") + description: str | None = None + size: int | None = None + object_id: str | None = None + access_methods: list[AccessMethod] | None = None model_config = ConfigDict(populate_by_name=True, extra="ignore") @@ -28,11 +27,11 @@ class StatsReturn(BaseModel): class UsageStats(BaseModel): # file_downloads: Dict[str, int] # Placeholder for tracking file download statistics in the future. - bed_metadata: Dict[str, int] - bedset_metadata: Dict[str, int] - bed_search_terms: Dict[str, int] - bedset_search_terms: Dict[str, int] - bed_downloads: Dict[str, int] + bed_metadata: dict[str, int] + bedset_metadata: dict[str, int] + bed_search_terms: dict[str, int] + bedset_search_terms: dict[str, int] + bed_downloads: dict[str, int] class UsageModel(BaseModel): @@ -40,15 +39,15 @@ class UsageModel(BaseModel): Usage model. Used to track usage of the bedbase. """ - bed_meta: Union[dict, None] = Dict[str, int] - bedset_meta: Union[dict, None] = Dict[str, int] + bed_meta: dict[str, int] | None = None + bedset_meta: dict[str, int] | None = None - bed_search: Union[dict, None] = Dict[str, int] - bedset_search: Union[dict, None] = Dict[str, int] - files: Union[dict, None] = Dict[str, int] + bed_search: dict[str, int] | None = None + bedset_search: dict[str, int] | None = None + files: dict[str, int] | None = None date_from: datetime.datetime - date_to: Union[datetime.datetime, None] = None + date_to: datetime.datetime | None = None class FileInfo(BaseModel): @@ -71,12 +70,12 @@ class AllFilesInfo(BaseModel): """ total: int - files: List[FileInfo] + files: list[FileInfo] class BinValues(BaseModel): - bins: List[Union[int, float, str]] - counts: List[int] + bins: list[int | float | str] + counts: list[int] mean: float median: float @@ -86,20 +85,20 @@ class GEOStatistics(BaseModel): GEO statistics for files. 
""" - number_of_files: Dict[str, int] - cumulative_number_of_files: Dict[str, int] + number_of_files: dict[str, int] + cumulative_number_of_files: dict[str, int] file_sizes: BinValues class FileStats(BaseModel): - bed_compliance: Dict[str, int] - data_format: Dict[str, int] - file_genome: Dict[str, int] - file_organism: Dict[str, int] - file_assay: Dict[str, int] - cell_line: Dict[str, int] - geo_status: Dict[str, int] - bed_comments: Dict[str, int] + bed_compliance: dict[str, int] + data_format: dict[str, int] + file_genome: dict[str, int] + file_organism: dict[str, int] + file_assay: dict[str, int] + cell_line: dict[str, int] + geo_status: dict[str, int] + bed_comments: dict[str, int] mean_region_width: BinValues file_size: BinValues number_of_regions: BinValues diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index a6059568..e6ba995d 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -1,5 +1,4 @@ import datetime -from typing import List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_validator @@ -23,8 +22,8 @@ class BedPlots(BaseModel): class BedFiles(BaseModel): - bed_file: Union[FileModel, None] = None - bigbed_file: Union[FileModel, None] = None + bed_file: FileModel | None = None + bigbed_file: FileModel | None = None model_config = ConfigDict( populate_by_name=True, @@ -33,47 +32,47 @@ class BedFiles(BaseModel): class BedClassification(BaseModel): - name: Optional[str] = None - genome_alias: str = None - genome_digest: Union[str, None] = None + name: str | None = None + genome_alias: str | None = None + genome_digest: str | None = None bed_compliance: str = Field( default="bed3", pattern=r"^bed(?:[3-9]|1[0-5])(?:\+|$)[0-9]?+$" ) - data_format: Union[str, None] = None + data_format: str | None = None compliant_columns: int = 3 non_compliant_columns: int = 0 - header: Union[str, None] = None # Header of the bed file (if any) + header: str | None = None # Header of the bed file (if 
any) model_config = ConfigDict(extra="ignore") class BedStatsModel(BaseModel): - number_of_regions: Optional[float] = None - gc_content: Optional[float] = None - median_tss_dist: Optional[float] = None - mean_region_width: Optional[float] = None + number_of_regions: float | None = None + gc_content: float | None = None + median_tss_dist: float | None = None + mean_region_width: float | None = None - exon_frequency: Optional[float] = None - exon_percentage: Optional[float] = None + exon_frequency: float | None = None + exon_percentage: float | None = None - intron_frequency: Optional[float] = None - intron_percentage: Optional[float] = None + intron_frequency: float | None = None + intron_percentage: float | None = None - intergenic_percentage: Optional[float] = None - intergenic_frequency: Optional[float] = None + intergenic_percentage: float | None = None + intergenic_frequency: float | None = None - promotercore_frequency: Optional[float] = None - promotercore_percentage: Optional[float] = None + promotercore_frequency: float | None = None + promotercore_percentage: float | None = None - fiveutr_frequency: Optional[float] = None - fiveutr_percentage: Optional[float] = None + fiveutr_frequency: float | None = None + fiveutr_percentage: float | None = None - threeutr_frequency: Optional[float] = None - threeutr_percentage: Optional[float] = None + threeutr_frequency: float | None = None + threeutr_percentage: float | None = None - promoterprox_frequency: Optional[float] = None - promoterprox_percentage: Optional[float] = None + promoterprox_frequency: float | None = None + promoterprox_percentage: float | None = None model_config = ConfigDict(extra="ignore", populate_by_name=True) @@ -114,7 +113,7 @@ class StandardMeta(BaseModel): species_id: str = "" genotype: str = Field("", description="Genotype of the sample") phenotype: str = Field("", description="Phenotype of the sample") - description: Union[str, None] = "" + description: str | None = "" cell_type: str = 
Field( "", @@ -139,10 +138,10 @@ class StandardMeta(BaseModel): "", description="Treatment of the sample (e.g. drug treatment)" ) - global_sample_id: Union[List[str], None] = Field( + global_sample_id: list[str] | None = Field( "", description="Global sample identifier. e.g. GSM000" ) # excluded in training - global_experiment_id: Union[List[str], None] = Field( + global_experiment_id: list[str] | None = Field( "", description="Global experiment identifier. e.g. GSE000" ) # excluded in training @@ -155,7 +154,7 @@ class StandardMeta(BaseModel): ) @field_validator("global_sample_id", "global_experiment_id", mode="before") - def ensure_list(cls, v: Union[str, List[str]]) -> List[str]: + def ensure_list(cls, v: str | list[str]) -> list[str]: if isinstance(v, str): return [v] elif isinstance(v, list): @@ -172,67 +171,67 @@ class BedPEPHubRestrict(BedPEPHub): class BedMetadataBasic(BedClassification): id: str - name: Optional[Union[str, None]] = "" - description: Optional[str] = None - submission_date: datetime.datetime = None - last_update_date: Optional[datetime.datetime] = None - is_universe: Optional[bool] = False - license_id: Optional[str] = DEFAULT_LICENSE - annotation: Optional[StandardMeta] = None - processed: Optional[bool] = True + name: str | None = "" + description: str | None = None + submission_date: datetime.datetime | None = None + last_update_date: datetime.datetime | None = None + is_universe: bool | None = False + license_id: str | None = DEFAULT_LICENSE + annotation: StandardMeta | None = None + processed: bool | None = True class UniverseMetadata(BaseModel): - construct_method: Union[str, None] = None - bedset_id: Union[str, None] = None + construct_method: str | None = None + bedset_id: str | None = None class BedSetMinimal(BaseModel): id: str - name: Union[str, None] = None - description: Union[str, None] = None + name: str | None = None + description: str | None = None class BedMetadataAll(BedMetadataBasic): - stats: Union[BedStatsModel, None] = 
None - plots: Union[BedPlots, None] = None - files: Union[BedFiles, None] = None - universe_metadata: Union[UniverseMetadata, None] = None - raw_metadata: Union[BedPEPHub, BedPEPHubRestrict, None] = None - bedsets: Union[List[BedSetMinimal], None] = None + stats: BedStatsModel | None = None + plots: BedPlots | None = None + files: BedFiles | None = None + universe_metadata: UniverseMetadata | None = None + raw_metadata: BedPEPHub | BedPEPHubRestrict | None = None + bedsets: list[BedSetMinimal] | None = None class BedListResult(BaseModel): count: int limit: int offset: int - results: List[BedMetadataBasic] + results: list[BedMetadataBasic] class QdrantSearchResult(BaseModel): id: str payload: dict = None score: float = None - metadata: Union[BedMetadataBasic, None] = None + metadata: BedMetadataBasic | None = None class BedListSearchResult(BaseModel): count: int limit: int offset: int - results: List[QdrantSearchResult] = None + results: list[QdrantSearchResult] | None = None class TokenizedBedResponse(BaseModel): universe_id: str bed_id: str - tokenized_bed: List[int] + tokenized_bed: list[int] class BedEmbeddingResult(BaseModel): identifier: str payload: dict - embedding: List[float] + embedding: list[float] class TokenizedPathResponse(BaseModel): @@ -244,11 +243,11 @@ class TokenizedPathResponse(BaseModel): class RefGenValidModel(BaseModel): provided_genome: str - compared_genome: Union[str, None] - genome_digest: Union[str, None] + compared_genome: str | None + genome_digest: str | None xs: float = 0.0 - oobr: Union[float, None] = None - sequence_fit: Union[float, None] = None + oobr: float | None = None + sequence_fit: float | None = None assigned_points: int tier_ranking: int @@ -257,8 +256,8 @@ class RefGenValidModel(BaseModel): class RefGenValidReturnModel(BaseModel): id: str - provided_genome: Union[str, None] = None - compared_genome: List[RefGenValidModel] + provided_genome: str | None = None + compared_genome: list[RefGenValidModel] class 
VectorMetadata(BaseModel): @@ -272,7 +271,7 @@ class VectorMetadata(BaseModel): treatment: str assay: str genome_alias: str - genome_digest: Union[str, None] = None + genome_digest: str | None = None species_name: str # summary: str # global_sample_id: str diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py index 9c19bac7..ca074a99 100644 --- a/bbconf/models/bedset_models.py +++ b/bbconf/models/bedset_models.py @@ -1,5 +1,4 @@ import datetime -from typing import List, Union from pydantic import BaseModel, ConfigDict, model_validator @@ -24,36 +23,36 @@ class BedSetMetadata(BaseModel): md5sum: str submission_date: datetime.datetime = None last_update_date: datetime.datetime = None - statistics: Union[BedSetStats, None] = None - plots: Union[BedSetPlots, None] = None + statistics: BedSetStats | None = None + plots: BedSetPlots | None = None description: str = None summary: str = None - bed_ids: List[str] = None - author: Union[str, None] = None - source: Union[str, None] = None + bed_ids: list[str] = None + author: str | None = None + source: str | None = None class BedSetListResult(BaseModel): count: int limit: int offset: int - results: List[BedSetMetadata] + results: list[BedSetMetadata] class BedSetBedFiles(BaseModel): count: int - results: List[BedMetadataBasic] + results: list[BedMetadataBasic] class BedSetPEP(BaseModel): sample_name: str original_name: str - genome_alias: Union[str, None] = "" - genome_digest: Union[str, None] = "" - bed_compliance: Union[str, None] = "" - data_format: Union[str, None] = "" - description: Union[str, None] = "" - url: Union[str, None] = "" + genome_alias: str | None = "" + genome_digest: str | None = "" + bed_compliance: str | None = "" + data_format: str | None = "" + description: str | None = "" + url: str | None = "" @model_validator(mode="before") def remove_underscore_keys(cls, values): diff --git a/bbconf/models/drs_models.py b/bbconf/models/drs_models.py index 01435a56..a69fcc27 100644 --- 
a/bbconf/models/drs_models.py +++ b/bbconf/models/drs_models.py @@ -1,5 +1,4 @@ import datetime -from typing import List, Optional, Union from pydantic import BaseModel @@ -7,23 +6,23 @@ # DRS Models class AccessURL(BaseModel): url: str - headers: Optional[dict] = None + headers: dict | None = None class AccessMethod(BaseModel): type: str - access_url: Optional[AccessURL] = None - access_id: Optional[str] = None - region: Optional[str] = None + access_url: AccessURL | None = None + access_id: str | None = None + region: str | None = None class DRSModel(BaseModel): id: str - name: Optional[str] = None + name: str | None = None self_uri: str - size: Union[int, None] = None - created_time: Optional[datetime.datetime] = None - updated_time: Optional[datetime.datetime] = None + size: int | None = None + created_time: datetime.datetime | None = None + updated_time: datetime.datetime | None = None checksums: str - access_methods: List[AccessMethod] - description: Optional[str] = None + access_methods: list[AccessMethod] + description: str | None = None diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 2e759c73..1c82da9b 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -1,7 +1,6 @@ import datetime import os from logging import getLogger -from typing import Dict, List, Union import numpy as np from geniml.bbclient import BBClient @@ -10,8 +9,8 @@ from pephubclient.exceptions import ResponseError from pydantic import BaseModel from qdrant_client import models -from qdrant_client.http.models import PointStruct from qdrant_client.http.exceptions import UnexpectedResponse +from qdrant_client.http.models import PointStruct from qdrant_client.models import PointIdsList from sqlalchemy import and_, cast, delete, func, or_, select from sqlalchemy.dialects import postgresql @@ -23,9 +22,9 @@ from bbconf.config_parser.bedbaseconfig import BedBaseConfig from bbconf.const import ( DEFAULT_LICENSE, + DEFAULT_QDRANT_GENOME_DIGESTS, 
PKG_NAME, ZARR_TOKENIZED_FOLDER, - DEFAULT_QDRANT_GENOME_DIGESTS, ) from bbconf.db_utils import ( Bed, @@ -368,7 +367,7 @@ def get_classification(self, identifier: str) -> BedClassification: return bed_classification - def get_objects(self, identifier: str) -> Dict[str, FileModel]: + def get_objects(self, identifier: str) -> dict[str, FileModel]: """ Get all object related to bedfile @@ -522,7 +521,7 @@ def add( plots: dict = None, files: dict = None, classification: dict = None, - ref_validation: Union[Dict[str, BaseModel], None] = None, + ref_validation: dict[str, BaseModel] | None = None, license_id: str = DEFAULT_LICENSE, upload_qdrant: bool = False, upload_pephub: bool = False, @@ -716,12 +715,12 @@ def add( def update( self, identifier: str, - stats: Union[dict, None] = None, - metadata: Union[dict, None] = None, - plots: Union[dict, None] = None, - files: Union[dict, None] = None, - classification: Union[dict, None] = None, - ref_validation: Union[Dict[str, BaseModel], None] = None, + stats: dict | None = None, + metadata: dict | None = None, + plots: dict | None = None, + files: dict | None = None, + classification: dict | None = None, + ref_validation: dict[str, BaseModel] | None = None, license_id: str = DEFAULT_LICENSE, upload_qdrant: bool = False, upload_pephub: bool = False, @@ -1025,7 +1024,7 @@ def _update_ref_validation( self, sa_session: Session, bed_id: str, - ref_validation: Dict[str, BaseModel], + ref_validation: dict[str, BaseModel], provided_genome: str = "", ) -> None: """ @@ -1060,7 +1059,7 @@ def _update_ref_validation( def _create_ref_validation_models( self, - ref_validation: Dict[str, BaseModel], + ref_validation: dict[str, BaseModel], bed_id: str, provided_genome: str = None, ) -> list[GenomeRefStats]: @@ -1163,7 +1162,7 @@ def delete_pephub_sample(self, identifier: str): def upload_file_qdrant( self, bed_id: str, - bed_file: Union[str, GRegionSet], + bed_file: str | GRegionSet, payload: dict = None, ) -> None: """ @@ -1191,7 +1190,7 @@ 
def upload_file_qdrant( ) return None - def _embed_file(self, bed_file: Union[str, GRegionSet]) -> np.ndarray: + def _embed_file(self, bed_file: str | GRegionSet) -> np.ndarray: """ Create embedding for bed file @@ -1223,7 +1222,7 @@ def _embed_file(self, bed_file: Union[str, GRegionSet]) -> np.ndarray: vec_dim = bed_embedding.shape[0] return bed_embedding.reshape(1, vec_dim) - def _get_umap_file(self, bed_file: Union[str, GRegionSet]) -> np.ndarray: + def _get_umap_file(self, bed_file: str | GRegionSet) -> np.ndarray: """ Create UMAP for bed file @@ -1452,7 +1451,7 @@ def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: .join(BedMetadata, Bed.id == BedMetadata.id) .where( and_( - Bed.file_indexed == False, + Bed.file_indexed.is_(False), # Bed.genome_alias == QDRANT_GENOME, # BedMetadata.global_experiment_id.contains(['encode']) # If we want only encode data Bed.genome_digest.in_(DEFAULT_QDRANT_GENOME_DIGESTS), @@ -1463,7 +1462,7 @@ def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: annotation_results = session.scalars(statement) - results: List[Bed] = [result for result in annotation_results] + results: list[Bed] = [result for result in annotation_results] if not results: _LOGGER.info("No files to reindex in qdrant.") return None @@ -1554,6 +1553,8 @@ def delete_qdrant_point(self, identifier: str) -> None: points=[identifier], ), ) + if result.status == "completed": + _LOGGER.info(f"File with id: {identifier} successfully deleted from qdrant.") return None def exists(self, identifier: str) -> bool: @@ -1817,7 +1818,7 @@ def get_tokenized_link( def get_missing_plots( self, plot_name: str, limit: int = 1000, offset: int = 0 - ) -> List[str]: + ) -> list[str]: """ Get list of bed files that are missing plot @@ -1855,7 +1856,7 @@ def get_missing_plots( return results - def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> List[str]: + def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> list[str]: 
""" Get list of bed files that are missing statistics @@ -1879,7 +1880,7 @@ def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> List[str]: return results - def get_missing_files(self, limit: int = 1000, offset: int = 0) -> List[str]: + def get_missing_files(self, limit: int = 1000, offset: int = 0) -> list[str]: """ Get list of bed files that are missing files (bigBed files) @@ -1911,7 +1912,7 @@ def get_missing_files(self, limit: int = 1000, offset: int = 0) -> List[str]: return results def get_unprocessed( - self, limit: int = 1000, offset: int = 0, genome: Union[str, list, None] = None + self, limit: int = 1000, offset: int = 0, genome: str | list | None = None ) -> BedListResult: """ Get bed files that are not processed. @@ -1976,9 +1977,9 @@ def get_unprocessed( def _update_sources( self, - identifier, - global_sample_id: List[str] = None, - global_experiment_id: List[str] = None, + identifier: str, + global_sample_id: list[str] | None = None, + global_experiment_id: list[str] | None = None, ) -> None: """ Add global sample and experiment ids to the bed file if they are missing @@ -2030,7 +2031,7 @@ def reindex_hybrid_search(self, batch: int = 1000, purge: bool = False) -> None: statement = ( select(Bed) .join(BedMetadata, Bed.id == BedMetadata.id) - .where(Bed.indexed == False) + .where(Bed.indexed.is_(False)) ) with Session(self._sa_engine) as session: @@ -2116,7 +2117,7 @@ def reindex_hybrid_search(self, batch: int = 1000, purge: bool = False) -> None: ) pbar.write("Uploaded batch to qdrant.") points = [] - print(operation_info.status) + _LOGGER.info(operation_info.status) assert ( operation_info.status == "completed" or operation_info.status == "acknowledged" diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 10ee01c5..543db7bb 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List from geniml.io.utils import compute_md5sum_bedset from 
sqlalchemy import Float, Numeric, func, or_, select @@ -121,7 +120,7 @@ def get_plots(self, identifier: str) -> BedSetPlots: ) return bedset_files - def get_objects(self, identifier: str) -> Dict[str, FileModel]: + def get_objects(self, identifier: str) -> dict[str, FileModel]: """ Get objects for bedset by identifier. @@ -281,11 +280,11 @@ def create( self, identifier: str, name: str, - bedid_list: List[str], - description: str = None, + bedid_list: list[str], + description: str | None = None, statistics: bool = False, - annotation: dict = None, - plots: dict = None, + annotation: dict | None = None, + plots: dict | None = None, upload_pephub: bool = False, upload_s3: bool = False, local_path: str = "", @@ -393,7 +392,7 @@ def create( _LOGGER.info(f"Bedset '{identifier}' was created successfully") return None - def _calculate_statistics(self, bed_ids: List[str]) -> BedSetStats: + def _calculate_statistics(self, bed_ids: list[str]) -> BedSetStats: """ Calculate statistics for bedset. diff --git a/bbconf/modules/objects.py b/bbconf/modules/objects.py index ac77db3d..ace529f3 100644 --- a/bbconf/modules/objects.py +++ b/bbconf/modules/objects.py @@ -1,6 +1,6 @@ import datetime import logging -from typing import List, Literal, Union +from typing import Literal from bbconf.config_parser.bedbaseconfig import BedBaseConfig from bbconf.const import PKG_NAME @@ -79,7 +79,7 @@ def _get_result( self, record_type: Literal["bed", "bedset"], record_id: str, - result_id: Union[str, List[str]], + result_id: str | list[str], ) -> FileModel: """ Generic getter that can return a result from either bed or bedset @@ -157,9 +157,9 @@ def construct_drs_metadata( base_uri: str, object_id: str, record_metadata: FileModel, - created_time: datetime.datetime = None, - modified_time: datetime.datetime = None, - ): + created_time: datetime.datetime | None = None, + modified_time: datetime.datetime | None = None, + ) -> DRSModel: """ Construct DRS metadata object diff --git a/pyproject.toml 
b/pyproject.toml index 3eef88c4..3c93c470 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,8 @@ line-length = 88 [tool.ruff.lint] select = ["E", "F", "I"] ignore = ["F403", "F405", "E501"] +exclude = ["manual_testing.py"] + [tool.ruff.lint.isort] known-first-party = ["bbconf"] diff --git a/tests/conftest.py b/tests/conftest.py index b06d944f..58466de1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,4 @@ import os -import subprocess -from atexit import register import pytest diff --git a/tests/test_common.py b/tests/test_common.py index 9e9b029a..76d6a7ec 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -3,7 +3,6 @@ import pytest from bbconf.const import DEFAULT_LICENSE -from bbconf.exceptions import BedBaseConfError from bbconf.models.base_models import UsageModel from .conftest import SERVICE_UNAVAILABLE From 8f289f753750666990a44077c0dbec781b3319a0 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Feb 2026 15:51:03 -0500 Subject: [PATCH 3/9] Formatting --- bbconf/bbagent.py | 2 -- bbconf/config_parser/bedbaseconfig.py | 2 -- bbconf/models/bed_models.py | 1 - bbconf/modules/bedfiles.py | 7 +++---- bbconf/modules/bedsets.py | 2 -- tests/test_bedfile.py | 1 - tests/test_universes.py | 1 - 7 files changed, 3 insertions(+), 13 deletions(-) diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 166dbf3c..ee573af2 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -112,7 +112,6 @@ def get_detailed_stats(self, concise: bool = False) -> FileStats: _LOGGER.info("Getting detailed statistics for all bed files") with Session(self.config.db_engine.engine) as session: - bed_compliance = { f[0]: f[1] for f in session.execute( @@ -384,7 +383,6 @@ def list_of_licenses(self) -> list[str]: def add_usage(self, stats: UsageModel) -> None: with Session(self.config.db_engine.engine) as session: - # FILES USAGE reported_items_files = session.scalars( select(UsageFiles).where(UsageFiles.date_to > func.now()) diff --git 
a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 06b823a5..8b76d683 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -74,7 +74,6 @@ def __init__(self, config: Path | str, init_ml: bool = True): init_ml = False if init_ml: - self.dense_encoder: TextEmbedding = self._init_dense_encoder() self.sparse_encoder: SparseEncoder | None = self._init_sparce_model() self.umap_encoder: UMAP | None = self._init_umap_model() @@ -487,7 +486,6 @@ def _init_umap_model(self) -> UMAP | None: model_path = self.config.path.umap_model umap_model = None if model_path.startswith(("http://", "https://")): - try: response = requests.get(model_path) response.raise_for_status() diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index e6ba995d..04e0ef46 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -165,7 +165,6 @@ def ensure_list(cls, v: str | list[str]) -> list[str]: class BedPEPHubRestrict(BedPEPHub): - model_config = ConfigDict(extra="ignore") diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 1c82da9b..fee42312 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -689,7 +689,6 @@ def add( session.add(new_metadata) if ref_validation: - new_gen_refs = self._create_ref_validation_models( ref_validation=ref_validation, bed_id=identifier, @@ -1554,7 +1553,9 @@ def delete_qdrant_point(self, identifier: str) -> None: ), ) if result.status == "completed": - _LOGGER.info(f"File with id: {identifier} successfully deleted from qdrant.") + _LOGGER.info( + f"File with id: {identifier} successfully deleted from qdrant." 
+ ) return None def exists(self, identifier: str) -> bool: @@ -2035,7 +2036,6 @@ def reindex_hybrid_search(self, batch: int = 1000, purge: bool = False) -> None: ) with Session(self._sa_engine) as session: - if purge: _LOGGER.info("Purging indexed files in the database ...") session.query(Bed).update({Bed.indexed: False}) @@ -2295,7 +2295,6 @@ def search_external_file(self, source: str, accession: str) -> BedListSearchResu ) with Session(self._sa_engine) as session: - bed_objects = session.scalars(statement) results = [ BedMetadataBasic( diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 543db7bb..c7758a17 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -184,7 +184,6 @@ def get_bedset_pep(self, identifier: str) -> dict: bedfile_meta_list = [] for bedfile in bedfiles: - try: annotation = bedfile.annotations.__dict__ except AttributeError: @@ -616,7 +615,6 @@ def get_unprocessed(self, limit: int = 100, offset: int = 0) -> BedSetListResult """ with Session(self._db_engine.engine) as session: - statement = ( select(BedSets) .where(BedSets.processed.is_(False)) diff --git a/tests/test_bedfile.py b/tests/test_bedfile.py index c0ed1d1d..6312e29a 100644 --- a/tests/test_bedfile.py +++ b/tests/test_bedfile.py @@ -213,7 +213,6 @@ def test_bed_update(self, bbagent_obj): # TODO: has to be expanded with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): - bed_file = bbagent_obj.bed.get(BED_TEST_ID, full=True) # assert bed_file.annotation.model_dump(exclude_defaults=True) == {} assert bed_file.annotation.cell_line == "" diff --git a/tests/test_universes.py b/tests/test_universes.py index d3cdfc9a..366b3770 100644 --- a/tests/test_universes.py +++ b/tests/test_universes.py @@ -8,7 +8,6 @@ @pytest.mark.skipif(SERVICE_UNAVAILABLE, reason="Database is not available") class TestUniverses: - def test_add(self, bbagent_obj): with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): 
bbagent_obj.bed.add_universe( From cff7a1619915a0b7684ff8d63447c402d927b095 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Feb 2026 22:40:29 -0500 Subject: [PATCH 4/9] Modernizing - docstrings --- bbconf/bbagent.py | 130 +++--- bbconf/config_parser/bedbaseconfig.py | 188 ++++---- bbconf/config_parser/models.py | 21 +- bbconf/config_parser/utils.py | 14 +- bbconf/db_utils.py | 74 +-- bbconf/helpers.py | 25 +- bbconf/models/bed_models.py | 4 +- bbconf/modules/bedfiles.py | 644 ++++++++++++++------------ bbconf/modules/bedsets.py | 195 ++++---- bbconf/modules/objects.py | 88 ++-- 10 files changed, 764 insertions(+), 619 deletions(-) diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index ee573af2..0e90058c 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -48,12 +48,12 @@ def __init__( config: Path | str, init_ml: bool = True, ): - """ - Initialize connection to the pep_db database. You can use the basic connection parameters + """Initialize connection to the pep_db database. You can use the basic connection parameters or libpq connection string. - :param config: path to the configuration file - :param init_ml: initialize ML models for search (default: True) + Args: + config: Path to the configuration file. + init_ml: Initialize ML models for search (default: True). """ self.config = BedBaseConfig(config, init_ml) @@ -82,10 +82,10 @@ def __repr__(self) -> str: return repr def get_stats(self) -> StatsReturn: - """ - Get statistics for a bed file + """Get statistics for a bed file. - :return: statistics + Returns: + Statistics. """ with Session(self.config.db_engine.engine) as session: number_of_bed = session.execute(select(func.count(Bed.id))).one()[0] @@ -102,11 +102,13 @@ def get_stats(self) -> StatsReturn: ) def get_detailed_stats(self, concise: bool = False) -> FileStats: - """ - Get comprehensive statistics for all bed files + """Get comprehensive statistics for all bed files. 
- :param concise: if True, return only top 20 items for each category - :return: FileStats object containing detailed statistics + Args: + concise: If True, return only top 20 items for each category. + + Returns: + FileStats object containing detailed statistics. """ _LOGGER.info("Getting detailed statistics for all bed files") @@ -266,11 +268,11 @@ def get_detailed_stats(self, concise: bool = False) -> FileStats: ) def get_detailed_usage(self) -> UsageStats: - """ - Get detailed usage statistics for the bedbase platform. + """Get detailed usage statistics for the bedbase platform. This method will only return top 20 items for each category. - :return: UsageStats object containing detailed usage statistics + Returns: + UsageStats object containing detailed usage statistics. """ _LOGGER.info("Getting detailed usage statistics.") @@ -338,10 +340,10 @@ def get_detailed_usage(self) -> UsageStats: ) def get_list_genomes(self) -> list[str]: - """ - Get list of genomes from the database + """Get list of genomes from the database. - :return: list of genomes + Returns: + List of genomes. """ statement = ( select(Bed.genome_alias) @@ -353,10 +355,10 @@ def get_list_genomes(self) -> list[str]: return [result[0] for result in genomes if result[0]] def get_list_assays(self) -> list[str]: - """ - Get list of genomes from the database + """Get list of genomes from the database. - :return: list of genomes + Returns: + List of genomes. """ with Session(self.config.db_engine.engine) as session: @@ -370,10 +372,10 @@ def get_list_assays(self) -> list[str]: @cached_property def list_of_licenses(self) -> list[str]: - """ - Get list of licenses from the database + """Get list of licenses from the database. - :return: list of licenses + Returns: + List of licenses. 
""" statement = select(License.id) with Session(self.config.db_engine.engine) as session: @@ -495,12 +497,13 @@ def add_usage(self, stats: UsageModel) -> None: session.commit() def _stats_comments(self, sa_session: Session) -> dict[str, int]: - """ - Get statistics about comments that are present in bed files. + """Get statistics about comments that are present in bed files. - :param sa_session: SQLAlchemy session + Args: + sa_session: SQLAlchemy session. - :return: Dict[str, int] + Returns: + Dict mapping comment type to count. """ _LOGGER.info("Analyzing bed table for comments in bed files...") @@ -549,11 +552,13 @@ def _stats_comments(self, sa_session: Session) -> dict[str, int]: } def _stats_geo_status(self, sa_session: Session) -> dict[str, int]: - """ - Get statistics about status of GEO bed file processing. + """Get statistics about status of GEO bed file processing. + + Args: + sa_session: SQLAlchemy session. - :param sa_session: SQLAlchemy session - :return Dict[str, int] + Returns: + Dict mapping status type to count. """ success_statement = select( @@ -598,23 +603,10 @@ def _stats_geo_status(self, sa_session: Session) -> dict[str, int]: } def bed_files_info(self) -> AllFilesInfo: - """ - Get information about all bed files in bedbase. - - :param sa_session: SQLAlchemy session - :return AllFilesInfo: - { - "total": int," - "files": [ - { id: str - bed_compliance: str - data_format: str - mean_region_width: float - file_size: int - number_of_regions: int - }, - ... ] - } + """Get information about all bed files in bedbase. + + Returns: + AllFilesInfo containing total count and list of file info objects. """ all_files_statement = ( @@ -660,11 +652,13 @@ def bed_files_info(self) -> AllFilesInfo: ) def _bin_number_of_regions(self, number_of_regions: list) -> BinValues: - """ - Create bins for number of regions in bed files + """Create bins for number of regions in bed files. + + Args: + number_of_regions: List of number of regions in bed files. 
- :param number_of_regions: list of number of regions in bed files - :return: BinValues object containing bins and values + Returns: + BinValues object containing bins and values. """ max_value_threshold = 400_000 # set a threshold for maximum value to avoid outliers in the histogram @@ -688,11 +682,13 @@ def _bin_number_of_regions(self, number_of_regions: list) -> BinValues: ) def _bin_mean_region_width(self, mean_region_widths: list) -> BinValues: - """ - Create bins for number of regions in bed files + """Create bins for mean region widths in bed files. - :param mean_region_widths: list of mean region widths in bed files - :return: BinValues object containing bins and values + Args: + mean_region_widths: List of mean region widths in bed files. + + Returns: + BinValues object containing bins and values. """ max_value_threshold = 5_000 # set a threshold for maximum value to avoid outliers in the histogram @@ -716,11 +712,13 @@ def _bin_mean_region_width(self, mean_region_widths: list) -> BinValues: ) def _bin_file_size(self, list_file_size: list) -> BinValues: - """ - Create bins for number of regions in bed files + """Create bins for file sizes of bed files. + + Args: + list_file_size: List of bed file sizes in bytes. - :param list_file_size: list of bed file sizes in bytes - :return: BinValues object containing bins and values + Returns: + BinValues object containing bins and values. """ max_value_threshold = 10 * 1024 * 1024 @@ -745,10 +743,10 @@ def _bin_file_size(self, list_file_size: list) -> BinValues: ) def _get_geo_stats(self, sa_session: Session) -> GEOStatistics: - """ - Get GEO statistics for the bedbase platform. + """Get GEO statistics for the bedbase platform. - :return: GEOStatistics + Returns: + GEOStatistics.
""" _LOGGER.info("Getting GEO statistics.") @@ -799,10 +797,10 @@ def _get_geo_stats(self, sa_session: Session) -> GEOStatistics: ) def get_reference_genomes(self) -> dict[str, str]: - """ - Get mapping of genome aliases to reference genome names. + """Get mapping of genome aliases to reference genome names. - :return: dict mapping genome_alias -> reference_genome_name + Returns: + Dict mapping genome_alias to reference_genome_name. """ genomes = {} diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 8b76d683..2f43676d 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -54,11 +54,11 @@ class BedBaseConfig: """ def __init__(self, config: Path | str, init_ml: bool = True): - """ - Initialize BedBaseConfig object + """Initialize BedBaseConfig object. - :param config: path to the configuration file - :param init_ml: initialize machine learning models used for search + Args: + config: Path to the configuration file. + init_ml: Initialize machine learning models used for search. """ self.cfg_path = get_bedbase_cfg(config) @@ -119,12 +119,16 @@ def __init__(self, config: Path | str, init_ml: bool = True): @staticmethod def _read_config_file(config_path: str) -> ConfigFile: - """ - Read configuration file and insert default values if not set + """Read configuration file and insert default values if not set. + + Args: + config_path: Configuration file path. - :param config_path: configuration file path - :return: None - :raises: raise_missing_key (if config key is missing) + Returns: + Parsed ConfigFile object. + + Raises: + raise_missing_key: If config key is missing. """ _config = yacman.YAMLConfigManager(filepath=config_path).exp @@ -143,46 +147,46 @@ def _read_config_file(config_path: str) -> ConfigFile: @property def config(self) -> ConfigFile: - """ - Get configuration + """Get configuration. - :return: configuration object + Returns: + Configuration object.
""" return self._config @property def db_engine(self) -> BaseEngine: - """ - Get database engine + """Get database engine. - :return: database engine + Returns: + Database engine. """ return self._db_engine @property def phc(self) -> PEPHubClient: - """ - Get PEPHub client + """Get PEPHub client. - :return: PEPHub client + Returns: + PEPHub client. """ return self._phc @property def boto3_client(self) -> boto3.client: - """ - Get boto3 client + """Get boto3 client. - :return: boto3 client + Returns: + Boto3 client. """ return self._boto3_client @property def zarr_root(self) -> Z_GROUP | None: - """ - Get zarr root object (Group) + """Get zarr root object (Group). - :return: zarr root group object + Returns: + Zarr root group object. """ try: @@ -244,11 +248,13 @@ def _init_qdrant_client(self) -> QdrantClient: def _init_qdrant_file_backend( self, qdrant_cl: QdrantClient ) -> QdrantBackend | None: - """ - Create qdrant client object using credentials provided in config file + """Create qdrant client object using credentials provided in config file. - :param: qdrant_cl: QdrantClient object - :return: QdrantClient + Args: + qdrant_cl: QdrantClient object. + + Returns: + QdrantBackend or None. """ _LOGGER.info("Initializing qdrant bivec file backend...") @@ -271,12 +277,14 @@ def _init_qdrant_file_backend( def _init_qdrant_text_backend( self, qdrant_cl: QdrantClient, dense_encoder: TextEmbedding ) -> QdrantBackend | None: - """ - Create qdrant client text embedding object using credentials provided in config file + """Create qdrant client text embedding object using credentials provided in config file. - :param: qdrant_cl: QdrantClient object - :param: dense_encoder: TextEmbedding model for encoding text queries - :return: QdrantClient + Args: + qdrant_cl: QdrantClient object. + dense_encoder: TextEmbedding model for encoding text queries. + + Returns: + QdrantBackend or None.
""" _LOGGER.info("Initializing qdrant bivec text backend...") @@ -306,12 +314,14 @@ def _init_qdrant_text_backend( def _init_qdrant_hybrid( self, qdrant_cl: QdrantClient, dense_encoder: TextEmbedding ) -> None: - """ - Create qdrant client with sparse and text embedding object using credentials provided in config file + """Create qdrant client with sparse and text embedding object using credentials provided in config file. - :param: qdrant_cl: QdrantClient object - :param: dense_encoder: TextEmbedding model for encoding text queries - :return: QdrantClient + Args: + qdrant_cl: QdrantClient object. + dense_encoder: TextEmbedding model for encoding text queries. + + Returns: + None. """ _LOGGER.info("Initializing qdrant sparse collection...") @@ -386,13 +396,15 @@ def _init_bivec_interface( qdrant_text_backend: QdrantBackend, text_encoder: TextEmbedding, ) -> BiVectorSearchInterface | None: - """ - Create BiVectorSearchInterface object using credentials provided in config file + """Create BiVectorSearchInterface object using credentials provided in config file. - :param: qdrant_file_backend: QdrantBackend for file vectors - :param: qdrant_text_backend: QdrantBackend for text vectors - :param: text_encoder: TextEmbedding model for encoding text queries - :return: BiVectorSearchInterface + Args: + qdrant_file_backend: QdrantBackend for file vectors. + qdrant_text_backend: QdrantBackend for text vectors. + text_encoder: TextEmbedding model for encoding text queries. + + Returns: + BiVectorSearchInterface. """ _LOGGER.info("Initializing BiVectorBackend...") @@ -411,10 +423,10 @@ def _init_b2b_search_interface( qdrant_file_backend: QdrantBackend, region_encoder: Region2VecExModel | str, ) -> BED2BEDSearchInterface | None: - """ - Create Bed 2 BED search interface and return this object + """Create Bed 2 BED search interface and return this object. - :return: Bed2BEDSearchInterface object + Returns: + BED2BEDSearchInterface object.
""" try: _LOGGER.info("Initializing search bed 2 bed search interfaces...") @@ -514,10 +526,10 @@ def _init_umap_model(self) -> UMAP | None: def _init_boto3_client( self, ) -> BaseClient | None: - """ - Create Pephub client object using credentials provided in config file + """Create boto3 client object using credentials provided in config file. - :return: PephubClient + Returns: + Boto3 client. """ try: return boto3.client( @@ -532,12 +544,14 @@ def _init_boto3_client( return None def upload_s3(self, file_path: str, s3_path: Path | str) -> None: - """ - Upload file to s3. + """Upload file to s3. - :param file_path: local path to the file - :param s3_path: path to the file in s3 with file name - :return: None + Args: + file_path: Local path to the file. + s3_path: Path to the file in s3 with file name. + + Returns: + None. """ if not self._boto3_client: _LOGGER.warning( @@ -558,14 +572,16 @@ def upload_files_s3( base_path: str, type: Literal["files", "plots", "bedsets"] = "files", ) -> BedFiles | BedPlots | BedSetPlots: - """ - Upload files to s3. + """Upload files to s3. - :param identifier: bed file identifier - :param files: dictionary with files to upload - :param base_path: local path to the output files - :param type: type of files to upload [files, plots, bedsets] - :return: None + Args: + identifier: Bed file identifier. + files: Dictionary with files to upload. + base_path: Local path to the output files. + type: Type of files to upload [files, plots, bedsets]. + + Returns: + The files object (BedFiles, BedPlots or BedSetPlots). """ if type == "files": @@ -611,11 +627,13 @@ def upload_files_s3( return files def delete_s3(self, s3_path: str) -> None: - """ - Delete file from s3. + """Delete file from s3. - :param s3_path: path to the file in s3 - :return: None + Args: + s3_path: Path to the file in s3. + + Returns: + None.
""" if not self._boto3_client: _LOGGER.warning( @@ -635,11 +653,13 @@ def delete_s3(self, s3_path: str) -> None: ) def delete_files_s3(self, files: list[FileModel]) -> None: - """ - Delete files from s3. + """Delete files from s3. - :param files: list of file objects - :return: None + Args: + files: List of file objects. + + Returns: + None. """ for file in files: self.delete_s3(file.path) @@ -649,10 +669,10 @@ def delete_files_s3(self, files: list[FileModel]) -> None: @staticmethod def _init_pephubclient() -> PEPHubClient | None: - """ - Create Pephub client object using credentials provided in config file + """Create Pephub client object using credentials provided in config file. - :return: PephubClient + Returns: + PephubClient. """ # try: @@ -665,12 +685,14 @@ def _init_pephubclient() -> PEPHubClient | None: return None def get_prefixed_uri(self, postfix: str, access_id: str) -> str: - """ - Return uri with correct prefix (schema) + """Return uri with correct prefix (schema). + + Args: + postfix: Postfix of the uri (or everything after uri schema). + access_id: Access method name, e.g. http, s3, etc. - :param postfix: postfix of the uri (or everything after uri schema) - :param access_id: access method name, e.g. http, s3, etc. - :return: full uri path + Returns: + Full uri path. """ try: @@ -681,11 +703,13 @@ def get_prefixed_uri(self, postfix: str, access_id: str) -> str: raise BadAccessMethodError(f"Access method {access_id} is not defined.") def construct_access_method_list(self, rel_path: str) -> list[AccessMethod]: - """ - Construct access method list for a given record + """Construct access method list for a given record. + + Args: + rel_path: Relative path to the record. - :param rel_path: relative path to the record - :return: list of access methods + Returns: + List of access methods. 
""" access_methods = [] for access_id in self.config.access_methods.model_dump().keys(): diff --git a/bbconf/config_parser/models.py b/bbconf/config_parser/models.py index 6b5e9f61..d4499f7e 100644 --- a/bbconf/config_parser/models.py +++ b/bbconf/config_parser/models.py @@ -41,10 +41,10 @@ class ConfigDB(BaseModel): @computed_field @property def url(self) -> str: - """ - The URL of the database. + """The URL of the database. - :return str: The URL of the database. + Returns: + The URL of the database. """ return f"{self.dialect}+{self.driver}://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}" @@ -105,10 +105,10 @@ def validate_aws_credentials(cls, value): @computed_field @property def modify_access(self) -> bool: - """ - If the AWS credentials are provided, set the modify access to True. (create = True) + """If the AWS credentials are provided, set the modify access to True. (create = True) - :return str: The URL of the database. + Returns: + True if AWS credentials are provided, False otherwise. """ if self.aws_access_key_id and self.aws_secret_access_key: return True @@ -137,11 +137,12 @@ class ConfigFile(BaseModel): @classmethod def from_yaml(cls, path: Path): - """ - Load the database configuration from a YAML file. + """Load the database configuration from a YAML file. - :param path: The path to the YAML file. + Args: + path: The path to the YAML file. - :returns: DatabaseConfig: The database configuration. + Returns: + The database configuration. """ return cls.model_validate(load_yaml(path.as_posix())) diff --git a/bbconf/config_parser/utils.py b/bbconf/config_parser/utils.py index 2e255dfc..f489f5e7 100644 --- a/bbconf/config_parser/utils.py +++ b/bbconf/config_parser/utils.py @@ -12,12 +12,16 @@ def config_analyzer(config_path: str) -> bool: - """ - Read configuration file and insert default values if not set + """Read configuration file and insert default values if not set. + + Args: + config_path: Configuration file path. 
+ + Returns: + True if the config is valid, False otherwise. - :param config_path: configuration file path - :return: None - :raises: raise_missing_key (if config key is missing) + Raises: + raise_missing_key: If config key is missing. """ config_path = get_bedbase_cfg(config_path) diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 9baba10a..f453000e 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -613,18 +613,18 @@ def __init__( dsn: str | None = None, echo: bool = False, ): - """ - Initialize connection to the bedbase database. You can use The basic connection parameters + """Initialize connection to the bedbase database. You can use the basic connection parameters or libpq connection string. - :param host: database server address e.g., localhost or an IP address. - :param port: the port number that defaults to 5432 if it is not provided. - :param database: the name of the database that you want to connect. - :param user: the username used to authenticate. - :param password: password used to authenticate. - :param drivername: driver used in - :param dsn: libpq connection string using the dsn parameter - (e.g. 'postgresql://user_name:password@host_name:port/db_name') + Args: + host: Database server address e.g., localhost or an IP address. + port: The port number that defaults to 5432 if it is not provided. + database: The name of the database that you want to connect. + user: The username used to authenticate. + password: Password used to authenticate. + drivername: Driver used in connection. + dsn: Libpq connection string using the dsn parameter + (e.g. 'postgresql://user_name:password@host_name:port/db_name'). """ if not dsn: dsn = URL.create( @@ -641,11 +641,13 @@ def __init__( self.check_db_connection() def create_schema(self, engine=None): - """ - Create sql schema in the database. + """Create sql schema in the database. - :param engine: sqlalchemy engine [Default: None] - :return: None + Args: + engine: Sqlalchemy engine [Default: None]. 
+ + Returns: + None. """ if not engine: engine = self._engine @@ -660,11 +662,13 @@ def create_schema(self, engine=None): pass def delete_schema(self, engine=None) -> None: - """ - Delete sql schema in the database. + """Delete sql schema in the database. - :param engine: sqlalchemy engine [Default: None] - :return: None + Args: + engine: Sqlalchemy engine [Default: None]. + + Returns: + None. """ if not engine: engine = self._engine @@ -672,12 +676,14 @@ def delete_schema(self, engine=None) -> None: return None def session_execute(self, statement: Select) -> Result: - """ - Execute statement using sqlalchemy statement + """Execute statement using sqlalchemy statement. + + Args: + statement: SQL query or a SQL expression that is constructed using + SQLAlchemy's SQL expression language. - :param statement: SQL query or a SQL expression that is constructed using - SQLAlchemy's SQL expression language - :return: query result represented with declarative base + Returns: + Query result represented with declarative base. """ _LOGGER.debug(f"Executing statement: {statement}") with Session(self._engine) as session: @@ -687,15 +693,19 @@ def session_execute(self, statement: Select) -> Result: @property def session(self): - """ - :return: started sqlalchemy session + """Get a started sqlalchemy session. + + Returns: + Started sqlalchemy session. """ return self._start_session() @property def engine(self) -> Engine: - """ - :return: sqlalchemy engine + """Get sqlalchemy engine. + + Returns: + Sqlalchemy engine. """ return self._engine @@ -715,11 +725,13 @@ def check_db_connection(self): raise SchemaError() def create_schema_graph(self, output_file: str = "schema.svg"): - """ - Create schema graph of the database. + """Create schema graph of the database. + + Args: + output_file: Path to the output file. - :param output_file: path to the output file - :return: None + Returns: + None. 
""" graph = create_schema_graph(engine=self.engine, metadata=Base.metadata) graph.write(output_file, format="svg", prog="dot") diff --git a/bbconf/helpers.py b/bbconf/helpers.py index 76eade26..2d0cf8fd 100644 --- a/bbconf/helpers.py +++ b/bbconf/helpers.py @@ -12,15 +12,17 @@ def get_bedbase_cfg(cfg: str = None) -> str: - """ - Determine path to the bedbase configuration file + """Determine path to the bedbase configuration file. The path can be either explicitly provided - or read from a $BEDBASE environment variable + or read from a $BEDBASE environment variable. + + Args: + cfg: Path to the config file. + Optional, the $BEDBASE config env var will be used if not provided. - :param str cfg: path to the config file. - Optional, the $BEDBASE config env var will be used if not provided - :return str: absolute configuration file path + Returns: + Absolute configuration file path. """ _LOGGER.info(f"Loading configuration file: {cfg}") @@ -41,13 +43,14 @@ def get_bedbase_cfg(cfg: str = None) -> str: def get_absolute_path(path: str, base_path: str) -> str: - """ - Get absolute path to the file and create it if it doesn't exist + """Get absolute path to the file and create it if it doesn't exist. - :param path: path to the file (abs or relative) - :param base_path: base path to the file (will be added to the relative path) + Args: + path: Path to the file (abs or relative). + base_path: Base path to the file (will be added to the relative path). - :return: absolute path to the file + Returns: + Absolute path to the file. """ if not os.path.isabs(path) or not os.path.exists(path): return os.path.join(base_path, path) diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index 04e0ef46..d16d0c51 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -139,10 +139,10 @@ class StandardMeta(BaseModel): ) global_sample_id: list[str] | None = Field( - "", description="Global sample identifier. e.g. 
GSM000" + None, description="Global sample identifier. e.g. GSM000" ) # excluded in training global_experiment_id: list[str] | None = Field( - "", description="Global experiment identifier. e.g. GSE000" + None, description="Global experiment identifier. e.g. GSE000" ) # excluded in training original_file_name: str = Field("", description="Original file name") diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index fee42312..f38a2a45 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -81,9 +81,11 @@ class BedAgentBedFile: """ def __init__(self, config: BedBaseConfig, bbagent_obj=None): - """ - :param config: config object with database and qdrant engine and credentials - :param bbagent_obj: BedBaseAgent object (Parent object) + """Initialize BedAgentBedFile. + + Args: + config: Config object with database and qdrant engine and credentials. + bbagent_obj: BedBaseAgent object (Parent object). """ self._sa_engine = config.db_engine.engine self._db_engine = config.db_engine @@ -92,12 +94,14 @@ def __init__(self, config: BedBaseConfig, bbagent_obj=None): self.bb_agent = bbagent_obj def get(self, identifier: str, full: bool = False) -> BedMetadataAll: - """ - Get file metadata by identifier. + """Get file metadata by identifier. - :param identifier: bed file identifier - :param full: if True, return full metadata, including statistics, files, and raw metadata from pephub - :return: project metadata + Args: + identifier: Bed file identifier. + full: If True, return full metadata, including statistics, files, and raw metadata from pephub. + + Returns: + Project metadata. """ statement = select(Bed).where(and_(Bed.id == identifier)) @@ -209,12 +213,13 @@ def get(self, identifier: str, full: bool = False) -> BedMetadataAll: ) def get_stats(self, identifier: str) -> BedStatsModel: - """ - Get file statistics by identifier. + """Get file statistics by identifier. 
- :param identifier: bed file identifier + Args: + identifier: Bed file identifier. - :return: project statistics as BedStats object + Returns: + Project statistics as BedStats object. """ statement = select(BedStats).where(and_(BedStats.id == identifier)) @@ -227,11 +232,13 @@ def get_stats(self, identifier: str) -> BedStatsModel: return bed_stats def get_plots(self, identifier: str) -> BedPlots: - """ - Get file plots by identifier. + """Get file plots by identifier. + + Args: + identifier: Bed file identifier. - :param identifier: bed file identifier - :return: project plots + Returns: + Project plots. """ statement = select(Bed).where(and_(Bed.id == identifier)) @@ -258,14 +265,15 @@ def get_plots(self, identifier: str) -> BedPlots: def get_neighbours( self, identifier: str, limit: int = 10, offset: int = 0 ) -> BedListSearchResult: - """ - Get nearest neighbours of bed file from qdrant. + """Get nearest neighbours of bed file from qdrant. - :param identifier: bed file identifier - :param limit: number of results to return - :param offset: offset to start from + Args: + identifier: Bed file identifier. + limit: Number of results to return. + offset: Offset to start from. - :return: list of nearest neighbours + Returns: + List of nearest neighbours. """ if not self.exists(identifier): raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") @@ -302,11 +310,13 @@ def get_neighbours( ) def get_files(self, identifier: str) -> BedFiles: - """ - Get file files by identifier. + """Get file files by identifier. - :param identifier: bed file identifier - :return: project files + Args: + identifier: Bed file identifier. + + Returns: + Project files. """ statement = select(Bed).where(and_(Bed.id == identifier)) @@ -332,11 +342,13 @@ def get_files(self, identifier: str) -> BedFiles: return bed_files def get_raw_metadata(self, identifier: str) -> BedPEPHub: - """ - Get file metadata by identifier. + """Get file metadata by identifier. 
- :param identifier: bed file identifier - :return: project metadata + Args: + identifier: Bed file identifier. + + Returns: + Project metadata. """ try: bed_metadata = self.config.phc.sample.get( @@ -351,11 +363,13 @@ def get_raw_metadata(self, identifier: str) -> BedPEPHub: return BedPEPHubRestrict(**bed_metadata) def get_classification(self, identifier: str) -> BedClassification: - """ - Get file classification by identifier. + """Get file classification by identifier. - :param identifier: bed file identifier - :return: project classification + Args: + identifier: Bed file identifier. + + Returns: + Project classification. """ statement = select(Bed).where(and_(Bed.id == identifier)) @@ -368,11 +382,13 @@ def get_classification(self, identifier: str) -> BedClassification: return bed_classification def get_objects(self, identifier: str) -> dict[str, FileModel]: - """ - Get all object related to bedfile + """Get all objects related to the bed file. - :param identifier: bed file identifier - :return: project objects dict + Args: + identifier: Bed file identifier. + + Returns: + Project objects dict. """ statement = select(Bed).where(and_(Bed.id == identifier)) return_dict = {} @@ -387,11 +403,13 @@ def get_objects(self, identifier: str) -> dict[str, FileModel]: return return_dict def get_embedding(self, identifier: str) -> BedEmbeddingResult: - """ - Get bed file embedding of bed file from qdrant. + """Get embedding of bed file from qdrant. - :param identifier: bed file identifier - :return: bed file embedding + Args: + identifier: Bed file identifier. + + Returns: + Bed file embedding. """ if not self.exists(identifier): raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") @@ -416,15 +434,16 @@ def get_ids_list( genome: str = None, bed_compliance: str = None, ) -> BedListResult: - """ - Get list of bed file identifiers. + """Get list of bed file identifiers.
- :param limit: number of results to return - :param offset: offset to start from - :param genome: filter by genome - :param bed_compliance: filter by bed type. e.g. 'bed6+4' + Args: + limit: Number of results to return. + offset: Offset to start from. + genome: Filter by genome. + bed_compliance: Filter by bed type. e.g. 'bed6+4'. - :return: list of bed file identifiers + Returns: + List of bed file identifiers. """ statement = select(Bed) count_statement = select(func.count(Bed.id)) @@ -463,11 +482,13 @@ def get_ids_list( ) def get_reference_validation(self, identifier: str) -> RefGenValidReturnModel: - """ - Get results of reference genome validation for the bed file. + """Get results of reference genome validation for the bed file. - :param identifier: bed file identifier - :return: reference genome validation results + Args: + identifier: Bed file identifier. + + Returns: + Reference genome validation results. """ if not self.exists(identifier): @@ -531,26 +552,28 @@ def add( nofail: bool = False, processed: bool = True, ) -> None: - """ - Add bed file to the database. - - :param identifier: bed file identifier - :param stats: bed file results {statistics, plots, files, metadata} - :param metadata: bed file metadata (will be saved in pephub) - :param plots: bed file plots - :param files: bed file files - :param classification: bed file classification - :param ref_validation: reference validation data. RefGenValidModel - :param license_id: bed file license id (default: 'DUO:0000042'). 
Full list of licenses: - https://raw.githubusercontent.com/EBISPOT/DUO/master/duo.csv - :param upload_qdrant: add bed file to qdrant indexs - :param upload_pephub: add bed file to pephub - :param upload_s3: upload files to s3 - :param local_path: local path to the output files - :param overwrite: overwrite bed file if it already exists - :param nofail: do not raise an error for error in pephub/s3/qdrant or record exsist and not overwrite - :param processed: true if bedfile was processed and statistics and plots were calculated - :return: None + """Add bed file to the database. + + Args: + identifier: Bed file identifier. + stats: Bed file results {statistics, plots, files, metadata}. + metadata: Bed file metadata (will be saved in pephub). + plots: Bed file plots. + files: Bed file files. + classification: Bed file classification. + ref_validation: Reference validation data. RefGenValidModel. + license_id: Bed file license id (default: 'DUO:0000042'). Full list of licenses: + https://raw.githubusercontent.com/EBISPOT/DUO/master/duo.csv + upload_qdrant: Add bed file to qdrant indexes. + upload_pephub: Add bed file to pephub. + upload_s3: Upload files to s3. + local_path: Local path to the output files. + overwrite: Overwrite bed file if it already exists. + nofail: Do not raise an error for error in pephub/s3/qdrant or record exists and not overwrite. + processed: True if bedfile was processed and statistics and plots were calculated. + + Returns: + None. """ _LOGGER.info(f"Adding bed file to database. bed_id: {identifier}") @@ -729,25 +752,27 @@ def update( nofail: bool = False, processed: bool = False, ) -> None: - """ - Update bed file to the database. 
- - :param identifier: bed file identifier - :param stats: bed file results {statistics, plots, files, metadata} - :param metadata: bed file metadata (will be saved in pephub) - :param plots: bed file plots - :param files: bed file files - :param classification: bed file classification - :param ref_validation: reference validation data. RefGenValidModel - :param license_id: bed file license id (default: 'DUO:0000042'). - :param upload_qdrant: add bed file to qdrant indexs - :param upload_pephub: add bed file to pephub - :param upload_s3: upload files to s3 - :param local_path: local path to the output files - :param overwrite: overwrite bed file if it already exists - :param nofail: do not raise an error for error in pephub/s3/qdrant or record exsist and not overwrite - :param processed: true if bedfile was processed and statistics and plots were calculated - :return: None + """Update bed file to the database. + + Args: + identifier: Bed file identifier. + stats: Bed file results {statistics, plots, files, metadata}. + metadata: Bed file metadata (will be saved in pephub). + plots: Bed file plots. + files: Bed file files. + classification: Bed file classification. + ref_validation: Reference validation data. RefGenValidModel. + license_id: Bed file license id (default: 'DUO:0000042'). + upload_qdrant: Add bed file to qdrant indexes. + upload_pephub: Add bed file to pephub. + upload_s3: Upload files to s3. + local_path: Local path to the output files. + overwrite: Overwrite bed file if it already exists. + nofail: Do not raise an error for error in pephub/s3/qdrant or record exists and not overwrite. + processed: True if bedfile was processed and statistics and plots were calculated. + + Returns: + None. 
""" if not self.exists(identifier): raise BEDFileNotFoundError( @@ -844,14 +869,15 @@ def update( def _update_classification( sa_session: Session, bed_object: Bed, classification: BedClassification ) -> None: - """ - Update bed file classification + """Update bed file classification. - :param sa_session: sqlalchemy session - :param bed_object: bed sqlalchemy object - :param classification: bed file classification as BedClassification object + Args: + sa_session: Sqlalchemy session. + bed_object: Bed sqlalchemy object. + classification: Bed file classification as BedClassification object. - :return: None + Returns: + None. """ classification_dict = classification.model_dump( exclude_defaults=True, exclude_none=True, exclude_unset=True @@ -865,13 +891,15 @@ def _update_classification( def _update_stats( sa_session: Session, bed_object: Bed, stats: BedStatsModel ) -> None: - """ - Update bed file statistics + """Update bed file statistics. + + Args: + sa_session: Sqlalchemy session. + bed_object: Bed sqlalchemy object. + stats: Bed file statistics as BedStatsModel object. - :param sa_session: sqlalchemy session - :param bed_object: bed sqlalchemy object - :param stats: bed file statistics as BedStatsModel object - :return: None + Returns: + None. """ stats_dict = stats.model_dump( @@ -889,14 +917,15 @@ def _update_stats( def _update_metadata( self, sa_session: Session, bed_object: Bed, bed_metadata: StandardMeta ) -> None: - """ - Update bed file metadata + """Update bed file metadata. - :param sa_session: sqlalchemy session - :param bed_object: bed sqlalchemy object - :param bed_metadata: bed file metadata as StandardMeta object + Args: + sa_session: Sqlalchemy session. + bed_object: Bed sqlalchemy object. + bed_metadata: Bed file metadata as StandardMeta object. - :return: None + Returns: + None. 
""" self._update_sources( @@ -929,13 +958,13 @@ def _update_plots( plots: BedPlots, local_path: str = None, ) -> None: - """ - Update bed file plots + """Update bed file plots. - :param sa_session: sqlalchemy session - :param bed_object: bed sqlalchemy object - :param plots: bed file plots - :param local_path: local path to the output files + Args: + sa_session: Sqlalchemy session. + bed_object: Bed sqlalchemy object. + plots: Bed file plots. + local_path: Local path to the output files. """ _LOGGER.info("Updating bed file plots..") @@ -978,12 +1007,12 @@ def _update_files( files: BedFiles, local_path: str = None, ) -> None: - """ - Update bed files + """Update bed files. - :param sa_session: sqlalchemy session - :param bed_object: bed sqlalchemy object - :param files: bed file files + Args: + sa_session: Sqlalchemy session. + bed_object: Bed sqlalchemy object. + files: Bed file files. """ _LOGGER.info("Updating bed files..") @@ -1026,15 +1055,15 @@ def _update_ref_validation( ref_validation: dict[str, BaseModel], provided_genome: str = "", ) -> None: - """ - Update reference validation data + """Update reference validation data. - ! This function won't update the reference validation data, if it exists, it will skip it. + This function won't update the reference validation data, if it exists, it will skip it. - :param sa_session: sqlalchemy session - :param bed_id: bed sqlalchemy object - :param ref_validation: bed file metadata - :param provided_genome: genome reference that was provided by user + Args: + sa_session: Sqlalchemy session. + bed_id: Bed sqlalchemy object. + ref_validation: Bed file metadata. + provided_genome: Genome reference that was provided by user. """ if not ref_validation: @@ -1085,11 +1114,13 @@ def _create_ref_validation_models( return new_gen_refs def delete(self, identifier: str) -> None: - """ - Delete bed file from the database. + """Delete bed file from the database. + + Args: + identifier: Bed file identifier. 
- :param identifier: bed file identifier - :return: None + Returns: + None. """ _LOGGER.info(f"Deleting bed file from database. bed_id: {identifier}") if not self.exists(identifier): @@ -1143,10 +1174,10 @@ def update_pephub( _LOGGER.warning(f"Could not update pephub. Error: {e}") def delete_pephub_sample(self, identifier: str): - """ - Delete sample from pephub + """Delete sample from pephub. - :param identifier: bed file identifier + Args: + identifier: Bed file identifier. """ try: self.config.phc.sample.remove( @@ -1164,15 +1195,17 @@ def upload_file_qdrant( bed_file: str | GRegionSet, payload: dict = None, ) -> None: - """ - Convert bed file to vector and add it to qdrant database + """Convert bed file to vector and add it to qdrant database. - !Warning: only hg38 genome can be added to qdrant! + Warning: only hg38 genome can be added to qdrant! - :param bed_id: bed file id - :param bed_file: path to the bed file, or RegionSet object - :param payload: additional metadata to store alongside vectors - :return: None + Args: + bed_id: Bed file id. + bed_file: Path to the bed file, or RegionSet object. + payload: Additional metadata to store alongside vectors. + + Returns: + None. """ _LOGGER.debug(f"Adding bed file to qdrant. bed_id: {bed_id}") @@ -1190,13 +1223,13 @@ def upload_file_qdrant( return None def _embed_file(self, bed_file: str | GRegionSet) -> np.ndarray: - """ - Create embedding for bed file + """Create embedding for bed file. - :param bed_file: bed file path or region set - :param bed_file: path to the bed file, or RegionSet object + Args: + bed_file: Path to the bed file, or RegionSet object. - :return np array of embeddings + Returns: + Numpy array of embeddings. 
""" if self.config.qdrant_file_backend is None: raise QdrantInstanceNotInitializedError @@ -1222,10 +1255,10 @@ def _embed_file(self, bed_file: str | GRegionSet) -> np.ndarray: return bed_embedding.reshape(1, vec_dim) def _get_umap_file(self, bed_file: str | GRegionSet) -> np.ndarray: - """ - Create UMAP for bed file + """Create UMAP for bed file. - :param bed_file: bed file path or region set + Args: + bed_file: Bed file path or region set. """ if self.config.umap_encoder is None: @@ -1242,16 +1275,17 @@ def text_to_bed_search( offset: int = 0, with_metadata: bool = True, ) -> BedListSearchResult: - """ - Search for bed files by text query in qdrant database - This is bivec_search + """Search for bed files by text query in qdrant database. + This is bivec_search. - :param query: text query - :param limit: number of results to return - :param offset: offset to start from - :param with_metadata: if True, will return metadata for each result + Args: + query: Text query. + limit: Number of results to return. + offset: Offset to start from. + with_metadata: If True, will return metadata for each result. - :return: list of bed file metadata + Returns: + List of bed file metadata. """ _LOGGER.info(f"Looking for: {query}") @@ -1296,14 +1330,15 @@ def bed_to_bed_search( limit: int = 10, offset: int = 0, ) -> BedListSearchResult: - """ - Search for bed files by using region set in qdrant database. + """Search for bed files by using region set in qdrant database. - :param region_set: RegionSet object to search for (bed file) - :param limit: number of results to return - :param offset: offset to start from + Args: + region_set: RegionSet object to search for (bed file). + limit: Number of results to return. + offset: Offset to start from. - :return: BedListSetResults + Returns: + BedListSetResults. 
""" results = self.config.b2b_search_interface.query_search( region_set, limit=limit, offset=offset @@ -1335,17 +1370,18 @@ def sql_search( limit: int = 10, offset: int = 0, ) -> BedListSearchResult: - """ - Search for bed files by using sql exact search. - This search will search files by id, name, and description + """Search for bed files by using sql exact search. + This search will search files by id, name, and description. - :param query: text query - :param genome: genome alias to filter results - :param assay: filter by assay type - :param limit: number of results to return - :param offset: offset to start from + Args: + query: Text query. + genome: Genome alias to filter results. + assay: Filter by assay type. + limit: Number of results to return. + offset: Offset to start from. - :return: list of bed file metadata + Returns: + List of bed file metadata. """ _LOGGER.debug(f"Looking for: {query}") @@ -1407,12 +1443,13 @@ def sql_search( ) def _sql_search_count(self, condition_statement) -> int: - """ - Get number of total found files in the database. + """Get number of total found files in the database. - :param condition_statement: sql alchemy condition statement to filter results + Args: + condition_statement: Sql alchemy condition statement to filter results. - :return: number of found files + Returns: + Number of found files. """ with Session(self._sa_engine) as session: @@ -1426,15 +1463,14 @@ def _sql_search_count(self, condition_statement) -> int: return count[0] def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: - """ - Re-upload all files to quadrant. - !Warning: only hg38 genome can be added to qdrant! + """Re-upload all files to quadrant. + Warning: only hg38 genome can be added to qdrant! If you want to fully reindex/reupload to qdrant, first delete collection and create new one. - Upload all files to qdrant. - :param batch: number of files to upload in one batch + Args: + batch: Number of files to upload in one batch. 
""" bb_client = BBClient() @@ -1539,11 +1575,13 @@ def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: return None def delete_qdrant_point(self, identifier: str) -> None: - """ - Delete bed file from qdrant. + """Delete bed file from qdrant. + + Args: + identifier: Bed file identifier. - :param identifier: bed file identifier - :return: None + Returns: + None. """ result = self.config.qdrant_file_backend.qd_client.delete( @@ -1559,11 +1597,13 @@ def delete_qdrant_point(self, identifier: str) -> None: return None def exists(self, identifier: str) -> bool: - """ - Check if bed file exists in the database. + """Check if bed file exists in the database. + + Args: + identifier: Bed file identifier. - :param identifier: bed file identifier - :return: True if bed file exists, False otherwise + Returns: + True if bed file exists, False otherwise. """ statement = select(Bed).where(and_(Bed.id == identifier)) @@ -1574,12 +1614,13 @@ def exists(self, identifier: str) -> bool: return True def exists_universe(self, identifier: str) -> bool: - """ - Check if universe exists in the database. + """Check if universe exists in the database. - :param identifier: universe identifier + Args: + identifier: Universe identifier. - :return: True if universe exists, False otherwise + Returns: + True if universe exists, False otherwise. """ statement = select(Universes).where(and_(Universes.id == identifier)) @@ -1592,14 +1633,15 @@ def exists_universe(self, identifier: str) -> bool: def add_universe( self, bedfile_id: str, bedset_id: str = None, construct_method: str = None ) -> str: - """ - Add universe to the database. + """Add universe to the database. - :param bedfile_id: bed file identifier - :param bedset_id: bedset identifier - :param construct_method: method used to construct the universe + Args: + bedfile_id: Bed file identifier. + bedset_id: Bedset identifier. + construct_method: Method used to construct the universe. - :return: universe identifier. 
+ Returns: + Universe identifier. """ if not self.exists(bedfile_id): @@ -1615,11 +1657,13 @@ def add_universe( return bedfile_id def delete_universe(self, identifier: str) -> None: - """ - Delete universe from the database. + """Delete universe from the database. + + Args: + identifier: Universe identifier. - :param identifier: universe identifier - :return: None + Returns: + None. """ if not self.exists_universe(identifier): raise UniverseNotFoundError(f"Universe not found. id: {identifier}") @@ -1632,15 +1676,16 @@ def delete_universe(self, identifier: str) -> None: def add_tokenized( self, bed_id: str, universe_id: str, token_vector: list, overwrite: bool = False ) -> str: - """ - Add tokenized bed file to the database + """Add tokenized bed file to the database. - :param bed_id: bed file identifier - :param universe_id: universe identifier - :param token_vector: list of tokens - :param overwrite: overwrite tokenized file if it already exists + Args: + bed_id: Bed file identifier. + universe_id: Universe identifier. + token_vector: List of tokens. + overwrite: Overwrite tokenized file if it already exists. - :return: token path + Returns: + Token path. """ with Session(self._sa_engine) as session: @@ -1679,14 +1724,15 @@ def _add_zarr_s3( tokenized_vector: list, overwrite: bool = False, ) -> str: - """ - Add zarr file to the database + """Add zarr file to the database. - :param universe_id: universe identifier - :param bed_id: bed file identifier - :param tokenized_vector: tokenized vector + Args: + universe_id: Universe identifier. + bed_id: Bed file identifier. + tokenized_vector: Tokenized vector. - :return: zarr path + Returns: + Zarr path. """ univers_group = self.config.zarr_root.require_group(universe_id) @@ -1707,13 +1753,14 @@ def _add_zarr_s3( return str(os.path.join(ZARR_TOKENIZED_FOLDER, path)) def get_tokenized(self, bed_id: str, universe_id: str) -> TokenizedBedResponse: - """ - Get zarr file from the database + """Get zarr file from the database. 
- :param bed_id: bed file identifier - :param universe_id: universe identifier + Args: + bed_id: Bed file identifier. + universe_id: Universe identifier. - :return: zarr path + Returns: + Zarr path. """ if not self.exist_tokenized(bed_id, universe_id): @@ -1727,13 +1774,14 @@ def get_tokenized(self, bed_id: str, universe_id: str) -> TokenizedBedResponse: ) def delete_tokenized(self, bed_id: str, universe_id: str) -> None: - """ - Delete tokenized bed file from the database + """Delete tokenized bed file from the database. - :param bed_id: bed file identifier - :param universe_id: universe identifier + Args: + bed_id: Bed file identifier. + universe_id: Universe identifier. - :return: None + Returns: + None. """ if not self.exist_tokenized(bed_id, universe_id): raise TokenizeFileNotExistError("Tokenized file not found in the database.") @@ -1754,13 +1802,14 @@ def delete_tokenized(self, bed_id: str, universe_id: str) -> None: return None def _get_tokenized_path(self, bed_id: str, universe_id: str) -> str: - """ - Get tokenized path to tokenized file + """Get tokenized path to tokenized file. - :param bed_id: bed file identifier - :param universe_id: universe identifier + Args: + bed_id: Bed file identifier. + universe_id: Universe identifier. - :return: token path + Returns: + Token path. """ if not self.exist_tokenized(bed_id, universe_id): raise TokenizeFileNotExistError("Tokenized file not found in the database.") @@ -1776,13 +1825,14 @@ def _get_tokenized_path(self, bed_id: str, universe_id: str) -> str: return str(tokenized_object.path) def exist_tokenized(self, bed_id: str, universe_id: str) -> bool: - """ - Check if tokenized bed file exists in the database + """Check if tokenized bed file exists in the database. - :param bed_id: bed file identifier - :param universe_id: universe identifier + Args: + bed_id: Bed file identifier. + universe_id: Universe identifier. - :return: bool + Returns: + True if tokenized bed file exists, False otherwise. 
""" with Session(self._sa_engine) as session: statement = select(TokenizedBed).where( @@ -1799,14 +1849,17 @@ def exist_tokenized(self, bed_id: str, universe_id: str) -> bool: def get_tokenized_link( self, bed_id: str, universe_id: str ) -> TokenizedPathResponse: - """ - Get tokenized link to tokenized file + """Get tokenized link to tokenized file. + + Args: + bed_id: Bed file identifier. + universe_id: Universe identifier. - :param bed_id: bed file identifier - :param universe_id: universe identifier + Returns: + Token link. - :return: token link - :raises: TokenizeFileNotExistError + Raises: + TokenizeFileNotExistError: If the tokenized file does not exist. """ file_path = self._get_tokenized_path(bed_id, universe_id) @@ -1820,14 +1873,15 @@ def get_tokenized_link( def get_missing_plots( self, plot_name: str, limit: int = 1000, offset: int = 0 ) -> list[str]: - """ - Get list of bed files that are missing plot + """Get list of bed files that are missing plot. - :param plot_name: plot name - :param limit: number of results to return - :param offset: offset to start from + Args: + plot_name: Plot name. + limit: Number of results to return. + offset: Offset to start from. - :return: list of bed file identifiers + Returns: + List of bed file identifiers. """ if plot_name not in list(BedPlots.model_fields.keys()): raise BedBaseConfError( @@ -1858,13 +1912,14 @@ def get_missing_plots( return results def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> list[str]: - """ - Get list of bed files that are missing statistics + """Get list of bed files that are missing statistics. - :param limit: number of results to return - :param offset: offset to start from + Args: + limit: Number of results to return. + offset: Offset to start from. - :return: list of bed file identifiers + Returns: + List of bed file identifiers. 
""" with Session(self._sa_engine) as session: @@ -1882,13 +1937,14 @@ def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> list[str]: return results def get_missing_files(self, limit: int = 1000, offset: int = 0) -> list[str]: - """ - Get list of bed files that are missing files (bigBed files) + """Get list of bed files that are missing files (bigBed files). - :param limit: number of results to return - :param offset: offset to start from + Args: + limit: Number of results to return. + offset: Offset to start from. - :return: list of bed file identifiers + Returns: + List of bed file identifiers. """ with Session(self._sa_engine) as session: @@ -1915,14 +1971,16 @@ def get_missing_files(self, limit: int = 1000, offset: int = 0) -> list[str]: def get_unprocessed( self, limit: int = 1000, offset: int = 0, genome: str | list | None = None ) -> BedListResult: - """ - Get bed files that are not processed. + """Get bed files that are not processed. - :param limit: number of results to return - :param offset: offset to start from - :param genome: genome alias or list of genome aliases to filter by. e.g. "hg38" or ["hg38", "mm10"]. by default None, which means no filtering by genome. + Args: + limit: Number of results to return. + offset: Offset to start from. + genome: Genome alias or list of genome aliases to filter by. e.g. "hg38" or + ["hg38", "mm10"]. By default None, which means no filtering by genome. - :return: list of bed file identifiers + Returns: + List of bed file identifiers. """ if isinstance(genome, str): @@ -1982,14 +2040,15 @@ def _update_sources( global_sample_id: list[str] | None = None, global_experiment_id: list[str] | None = None, ) -> None: - """ - Add global sample and experiment ids to the bed file if they are missing + """Add global sample and experiment ids to the bed file if they are missing. 
- :param identifier: bed file identifier - :param global_sample_id: list of global sample ids - :param global_experiment_id: list of global experiment ids + Args: + identifier: Bed file identifier. + global_sample_id: List of global sample ids. + global_experiment_id: List of global experiment ids. - :return: None + Returns: + None. """ _LOGGER.info(f"Updating sources for bed file: {identifier}") @@ -2019,13 +2078,14 @@ def _update_sources( session.commit() def reindex_hybrid_search(self, batch: int = 1000, purge: bool = False) -> None: - """ - Reindex all bed files for semantic database + """Reindex all bed files for semantic database. - :param batch: number of files to upload in one batch - :param purge: resets indexed in database for all files to False + Args: + batch: Number of files to upload in one batch. + purge: Resets indexed in database for all files to False. - :return: None + Returns: + None. """ # Add column that will indicate if this file is indexed or not @@ -2149,18 +2209,19 @@ def hybrid_search( offset: int = 0, with_metadata: bool = True, ) -> BedListSearchResult: - """ - Run semantic search for bed files using qdrant. + """Run semantic search for bed files using qdrant. This is not bivec search, but usual qdrant search with sparse and dense embeddings. - :param query: text query to search for - :param genome_alias: genome alias to filter results - :param assay: filter by assay type - :param limit: number of results to return - :param offset: offset to start from - :param with_metadata: if True, metadata will be returned in the results. Default is True. + Args: + query: Text query to search for. + genome_alias: Genome alias to filter results. + assay: Filter by assay type. + limit: Number of results to return. + offset: Offset to start from. + with_metadata: If True, metadata will be returned in the results. Default is True. - :return: list of bed file metadata + Returns: + List of bed file metadata. 
""" must_statement = [] @@ -2253,14 +2314,15 @@ def hybrid_search( ) def search_external_file(self, source: str, accession: str) -> BedListSearchResult: - """ - Search for bed files by external source and accession number. - e.g. source='geo', accession='GSE12345' + """Search for bed files by external source and accession number. + e.g. source='geo', accession='GSE12345'. - :param source: external source, e.g. 'geo' or 'encode' - :param accession: accession number, e.g. 'GSE12345' or 'ENCSR12345' + Args: + source: External source, e.g. 'geo' or 'encode'. + accession: Accession number, e.g. 'GSE12345' or 'ENCSR12345'. - :return: list of bed file metadata + Returns: + List of bed file metadata. """ if source not in ["geo", "encode"]: raise BedBaseConfError( diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index c7758a17..335d251b 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -38,19 +38,23 @@ class BedAgentBedSet: """ def __init__(self, config: BedBaseConfig): - """ - :param config: config object + """Initialize BedAgentBedSet. + + Args: + config: Config object. """ self.config = config self._db_engine = self.config.db_engine def get(self, identifier: str, full: bool = False) -> BedSetMetadata: - """ - Get file metadata by identifier. + """Get file metadata by identifier. + + Args: + identifier: Bed file identifier. + full: Return full record with stats, plots, files and metadata. - :param identifier: bed file identifier - :param full: return full record with stats, plots, files and metadata - :return: project metadata + Returns: + Project metadata. """ statement = select(BedSets).where(BedSets.id == identifier) @@ -92,11 +96,13 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata: return bedset_metadata def get_plots(self, identifier: str) -> BedSetPlots: - """ - Get plots for bedset by identifier. + """Get plots for bedset by identifier. + + Args: + identifier: Bedset identifier. 
- :param identifier: bedset identifier - :return: bedset plots + Returns: + Bedset plots. """ statement = select(BedSets).where(BedSets.id == identifier) @@ -121,11 +127,13 @@ def get_plots(self, identifier: str) -> BedSetPlots: return bedset_files def get_objects(self, identifier: str) -> dict[str, FileModel]: - """ - Get objects for bedset by identifier. + """Get objects for bedset by identifier. - :param identifier: bedset identifier - :return: bedset objects + Args: + identifier: Bedset identifier. + + Returns: + Bedset objects. """ statement = select(BedSets).where(BedSets.id == identifier) return_dict = {} @@ -146,11 +154,13 @@ def get_objects(self, identifier: str) -> dict[str, FileModel]: return return_dict def get_statistics(self, identifier: str) -> BedSetStats: - """ - Get statistics for bedset by identifier. + """Get statistics for bedset by identifier. - :param identifier: bedset identifier - :return: bedset statistics + Args: + identifier: Bedset identifier. + + Returns: + Bedset statistics. """ statement = select(BedSets).where(BedSets.id == identifier) with Session(self._db_engine.engine) as session: @@ -163,11 +173,13 @@ def get_statistics(self, identifier: str) -> BedSetStats: ) def get_bedset_pep(self, identifier: str) -> dict: - """ - Create pep file for a bedset. + """Create pep file for a bedset. - :param identifier: bedset identifier - :return: pep dict + Args: + identifier: Bedset identifier. + + Returns: + Pep dict. """ statement = select(BedFileBedSetRelation).where( @@ -220,11 +232,13 @@ def get_bedset_pep(self, identifier: str) -> dict: } def get_track_hub_file(self, identifier: str) -> str: - """ - Get track hub file for bedset. + """Get track hub file for bedset. + + Args: + identifier: Bedset identifier. - :param identifier: bedset identifier - :return: track hub file + Returns: + Track hub file. 
""" statement = select(BedFileBedSetRelation).where( BedFileBedSetRelation.bedset_id == identifier @@ -291,23 +305,25 @@ def create( overwrite: bool = False, processed: bool = True, ) -> None: - """ - Create bedset in the database. - - :param identifier: bedset identifier - :param name: bedset name - :param description: bedset description - :param bedid_list: list of bed file identifiers - :param statistics: calculate statistics for bedset - :param annotation: bedset annotation (author, source) - :param plots: dictionary with plots - :param upload_pephub: upload bedset to pephub (create view in pephub) - :param upload_s3: upload bedset to s3 - :param local_path: local path to the output files - :param no_fail: do not raise an error if bedset already exists - :param overwrite: overwrite the record in the database - :param processed: flag to indicate that bedset is processed. [Default: True] - :return: None + """Create bedset in the database. + + Args: + identifier: Bedset identifier. + name: Bedset name. + description: Bedset description. + bedid_list: List of bed file identifiers. + statistics: Calculate statistics for bedset. + annotation: Bedset annotation (author, source). + plots: Dictionary with plots. + upload_pephub: Upload bedset to pephub (create view in pephub). + upload_s3: Upload bedset to s3. + local_path: Local path to the output files. + no_fail: Do not raise an error if bedset already exists. + overwrite: Overwrite the record in the database. + processed: Flag to indicate that bedset is processed. [Default: True]. + + Returns: + None. """ _LOGGER.info(f"Creating bedset '{identifier}'") @@ -392,11 +408,13 @@ def create( return None def _calculate_statistics(self, bed_ids: list[str]) -> BedSetStats: - """ - Calculate statistics for bedset. + """Calculate statistics for bedset. - :param bed_ids: list of bed file identifiers - :return: statistics + Args: + bed_ids: List of bed file identifiers. + + Returns: + Statistics. 
""" _LOGGER.info("Calculating bedset statistics") @@ -439,15 +457,16 @@ def _create_pephub_view( bed_ids: list = None, nofail: bool = False, ) -> None: - """ - Create view in pephub for bedset. + """Create view in pephub for bedset. - :param bedset_id: bedset identifier - :param description: bedset description - :param bed_ids: list of bed file identifiers - :param nofail: do not raise an error if sample not found + Args: + bedset_id: Bedset identifier. + description: Bedset description. + bed_ids: List of bed file identifiers. + nofail: Do not raise an error if sample not found. - :return: None + Returns: + None. """ _LOGGER.info(f"Creating view in pephub for bedset '{bedset_id}'") @@ -469,13 +488,15 @@ def _create_pephub_view( def get_ids_list( self, query: str = None, limit: int = 10, offset: int = 0 ) -> BedSetListResult: - """ - Get list of bedsets from the database. + """Get list of bedsets from the database. + + Args: + query: Search query. + limit: Limit of results. + offset: Offset of results. - :param query: search query - :param limit: limit of results - :param offset: offset of results - :return: list of bedsets + Returns: + List of bedsets. """ statement = select(BedSets.id) count_statement = select(func.count(BedSets.id)) @@ -510,12 +531,13 @@ def get_ids_list( ) def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles: - """ - Get list of bedfiles in bedset. + """Get list of bedfiles in bedset. - :param identifier: bedset identifier + Args: + identifier: Bedset identifier. - :return: list of bedfiles + Returns: + List of bedfiles. """ sub_statement = select(BedFileBedSetRelation.bedfile_id).where( BedFileBedSetRelation.bedset_id == identifier @@ -544,11 +566,13 @@ def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles: ) def delete(self, identifier: str) -> None: - """ - Delete bed file from the database. + """Delete bed file from the database. + + Args: + identifier: Bedset identifier. 
- :param identifier: bedset identifier - :return: None + Returns: + None. """ if not self.exists(identifier): raise BedSetNotFoundError(identifier) @@ -569,12 +593,14 @@ def delete(self, identifier: str) -> None: self.config.delete_files_s3(files) def delete_phc_view(self, identifier: str, nofail: bool = False) -> None: - """ - Delete view in pephub. + """Delete view in pephub. + + Args: + identifier: Bedset identifier. + nofail: Do not raise an error if view not found. - :param identifier: bedset identifier - :param nofail: do not raise an error if view not found - :return: None + Returns: + None. """ _LOGGER.info(f"Deleting view in pephub for bedset '{identifier}'") try: @@ -591,11 +617,13 @@ def delete_phc_view(self, identifier: str, nofail: bool = False) -> None: return None def exists(self, identifier: str) -> bool: - """ - Check if bedset exists in the database. + """Check if bedset exists in the database. + + Args: + identifier: Bedset identifier. - :param identifier: bedset identifier - :return: True if bedset exists, False otherwise + Returns: + True if bedset exists, False otherwise. """ statement = select(BedSets).where(BedSets.id == identifier) with Session(self._db_engine.engine) as session: @@ -605,13 +633,14 @@ def exists(self, identifier: str) -> bool: return False def get_unprocessed(self, limit: int = 100, offset: int = 0) -> BedSetListResult: - """ - Get unprocessed bedset from the database. + """Get unprocessed bedset from the database. - :param limit: limit of results - :param offset: offset of results + Args: + limit: Limit of results. + offset: Offset of results. - :return: bedset metadata + Returns: + Bedset metadata. 
""" with Session(self._db_engine.engine) as session: diff --git a/bbconf/modules/objects.py b/bbconf/modules/objects.py index ace529f3..94e5e00f 100644 --- a/bbconf/modules/objects.py +++ b/bbconf/modules/objects.py @@ -21,8 +21,10 @@ class BBObjects: """ """ def __init__(self, config: BedBaseConfig): - """ - :param config: config object + """Initialize BBObjects. + + Args: + config: Config object. """ self.config = config self.bed = BedAgentBedFile(self.config) @@ -35,14 +37,16 @@ def get_thumbnail_uri( result_id: str, access_id: str = "http", ) -> str: - """ - Create URL to access a bed- or bedset-associated thumbnail + """Create URL to access a bed- or bedset-associated thumbnail. + + Args: + record_type: Table name ["bed", "bedset"]. + record_id: Record identifier. + result_id: Column name (result name). + access_id: Access id (e.g. http, s3, etc.). - :param record_type: table_name ["bed", "bedset"] - :param record_id: record identifier - :param result_id: column name (result name) - :param access_id: access id (e.g. http, s3, etc.) - :return: string with thumbnail + Returns: + String with thumbnail. """ result = self._get_result(record_type, record_id, result_id) if result.path_thumbnail: @@ -63,14 +67,16 @@ def get_object_uri( result_id: str, access_id: str, ) -> str: - """ - Create URL to access a bed- or bedset-associated file + """Create URL to access a bed- or bedset-associated file. + + Args: + record_type: Table name ["bed", "bedset"]. + record_id: Record identifier. + result_id: Column name (result name). + access_id: Access id (e.g. http, s3, etc.). - :param record_type: table_name ["bed", "bedset"] - :param record_id: record identifier - :param result_id: column name (result name) - :param access_id: access id (e.g. http, s3, etc.) - :return: + Returns: + URI string for the object. 
""" result = self._get_result(record_type, record_id, result_id) return self.config.get_prefixed_uri(result.path, access_id) @@ -81,13 +87,15 @@ def _get_result( record_id: str, result_id: str | list[str], ) -> FileModel: - """ - Generic getter that can return a result from either bed or bedset + """Generic getter that can return a result from either bed or bedset. - :param record_type: table_name ["bed", "bedset"] - :param record_id: record identifier - :param result_id: column name (result name). e.g. "bigbedfile", "bed_file", "open_chromatin" - :return: pipestat result + Args: + record_type: Table name ["bed", "bedset"]. + record_id: Record identifier. + result_id: Column name (result name). e.g. "bigbedfile", "bed_file", "open_chromatin". + + Returns: + Pipestat result. """ if record_type == "bed": try: @@ -122,14 +130,16 @@ def get_drs_metadata( result_id: str, base_uri: str, ) -> DRSModel: - """ - Get DRS metadata for a bed- or bedset-associated file + """Get DRS metadata for a bed- or bedset-associated file. + + Args: + record_type: Bed or bedset. + record_id: Record identifier. + result_id: Name of the result file to get metadata for. + base_uri: Base uri to use for the self_uri field (server hostname of DRS broker). - :param record_type: bed or bedset - :param record_id: record identifier - :param result_id: name of the result file to get metadata for - :param base_uri: base uri to use for the self_uri field (server hostname of DRS broker) - :return: DRS metadata + Returns: + DRS metadata. 
""" object_id = f"{record_type}.{record_id}.{result_id}" @@ -160,15 +170,17 @@ def construct_drs_metadata( created_time: datetime.datetime | None = None, modified_time: datetime.datetime | None = None, ) -> DRSModel: - """ - Construct DRS metadata object - - :param base_uri: base uri to use for the self_uri field (server hostname of DRS broker) - :param object_id: record identifier - :param record_metadata: metadata of the record - :param created_time: time of creation - :param modified_time: time of last modification - :return: DRS metadata + """Construct DRS metadata object. + + Args: + base_uri: Base uri to use for the self_uri field (server hostname of DRS broker). + object_id: Record identifier. + record_metadata: Metadata of the record. + created_time: Time of creation. + modified_time: Time of last modification. + + Returns: + DRS metadata. """ access_methods = self.config.construct_access_method_list(record_metadata.path) drs_dict = DRSModel( From 69b4aa4394f590b12221543397aa5e0e83507c22 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Feb 2026 23:11:49 -0500 Subject: [PATCH 5/9] more docstrings adjustment --- .github/workflows/{black.yml => format.yml} | 0 bbconf/bbagent.py | 45 ++++-- bbconf/config_parser/bedbaseconfig.py | 60 +++++--- bbconf/config_parser/models.py | 9 +- bbconf/config_parser/utils.py | 3 +- bbconf/db_utils.py | 24 ++-- bbconf/exceptions.py | 42 ++++-- bbconf/helpers.py | 6 +- bbconf/modules/bedfiles.py | 150 +++++++++++++------- bbconf/modules/bedsets.py | 48 ++++--- bbconf/modules/objects.py | 18 ++- 11 files changed, 270 insertions(+), 135 deletions(-) rename .github/workflows/{black.yml => format.yml} (100%) diff --git a/.github/workflows/black.yml b/.github/workflows/format.yml similarity index 100% rename from .github/workflows/black.yml rename to .github/workflows/format.yml diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 0e90058c..b7d07eb6 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -48,7 +48,8 @@ 
def __init__( config: Path | str, init_ml: bool = True, ): - """Initialize connection to the pep_db database. You can use the basic connection parameters + """ + Initialize connection to the pep_db database. You can use the basic connection parameters or libpq connection string. Args: @@ -82,7 +83,8 @@ def __repr__(self) -> str: return repr def get_stats(self) -> StatsReturn: - """Get statistics for a bed file. + """ + Get statistics for a bed file. Returns: Statistics. @@ -102,7 +104,8 @@ def get_stats(self) -> StatsReturn: ) def get_detailed_stats(self, concise: bool = False) -> FileStats: - """Get comprehensive statistics for all bed files. + """ + Get comprehensive statistics for all bed files. Args: concise: If True, return only top 20 items for each category. @@ -268,7 +271,8 @@ def get_detailed_stats(self, concise: bool = False) -> FileStats: ) def get_detailed_usage(self) -> UsageStats: - """Get detailed usage statistics for the bedbase platform. + """ + Get detailed usage statistics for the bedbase platform. This method will only return top 20 items for each category. Returns: @@ -340,7 +344,8 @@ def get_detailed_usage(self) -> UsageStats: ) def get_list_genomes(self) -> list[str]: - """Get list of genomes from the database. + """ + Get list of genomes from the database. Returns: List of genomes. @@ -355,7 +360,8 @@ def get_list_genomes(self) -> list[str]: return [result[0] for result in genomes if result[0]] def get_list_assays(self) -> list[str]: - """Get list of genomes from the database. + """ + Get list of genomes from the database. Returns: List of genomes. @@ -372,7 +378,8 @@ def get_list_assays(self) -> list[str]: @cached_property def list_of_licenses(self) -> list[str]: - """Get list of licenses from the database. + """ + Get list of licenses from the database. Returns: List of licenses. 
@@ -497,7 +504,8 @@ def add_usage(self, stats: UsageModel) -> None: session.commit() def _stats_comments(self, sa_session: Session) -> dict[str, int]: - """Get statistics about comments that are present in bed files. + """ + Get statistics about comments that are present in bed files. Args: sa_session: SQLAlchemy session. @@ -552,7 +560,8 @@ def _stats_comments(self, sa_session: Session) -> dict[str, int]: } def _stats_geo_status(self, sa_session: Session) -> dict[str, int]: - """Get statistics about status of GEO bed file processing. + """ + Get statistics about status of GEO bed file processing. Args: sa_session: SQLAlchemy session. @@ -603,7 +612,8 @@ def _stats_geo_status(self, sa_session: Session) -> dict[str, int]: } def bed_files_info(self) -> AllFilesInfo: - """Get information about all bed files in bedbase. + """ + Get information about all bed files in bedbase. Returns: AllFilesInfo containing total count and list of file info objects. @@ -652,7 +662,8 @@ def bed_files_info(self) -> AllFilesInfo: ) def _bin_number_of_regions(self, number_of_regions: list) -> BinValues: - """Create bins for number of regions in bed files. + """ + Create bins for number of regions in bed files. Args: number_of_regions: List of number of regions in bed files. @@ -682,7 +693,8 @@ def _bin_number_of_regions(self, number_of_regions: list) -> BinValues: ) def _bin_mean_region_width(self, mean_region_widths: list) -> BinValues: - """Create bins for number of regions in bed files. + """ + Create bins for number of regions in bed files. Args: mean_region_widths: List of mean region widths in bed files. @@ -712,7 +724,8 @@ def _bin_mean_region_width(self, mean_region_widths: list) -> BinValues: ) def _bin_file_size(self, list_file_size: list) -> BinValues: - """Create bins for number of regions in bed files. + """ + Create bins for number of regions in bed files. Args: list_file_size: List of bed file sizes in bytes. 
@@ -743,7 +756,8 @@ def _bin_file_size(self, list_file_size: list) -> BinValues: ) def _get_geo_stats(self, sa_session: Session) -> GEOStatistics: - """Get GEO statistics for the bedbase platform. + """ + Get GEO statistics for the bedbase platform. Returns: GEOStatistics. @@ -797,7 +811,8 @@ def _get_geo_stats(self, sa_session: Session) -> GEOStatistics: ) def get_reference_genomes(self) -> dict[str, str]: - """Get mapping of genome aliases to reference genome names. + """ + Get mapping of genome aliases to reference genome names. Returns: Dict mapping genome_alias to reference_genome_name. diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 2f43676d..f61ecad0 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -54,7 +54,8 @@ class BedBaseConfig: """ def __init__(self, config: Path | str, init_ml: bool = True): - """Initialize BedBaseConfig object. + """ + Initialize BedBaseConfig object. Args: config: Path to the configuration file. @@ -119,7 +120,8 @@ def __init__(self, config: Path | str, init_ml: bool = True): @staticmethod def _read_config_file(config_path: str) -> ConfigFile: - """Read configuration file and insert default values if not set. + """ + Read configuration file and insert default values if not set. Args: config_path: Configuration file path. @@ -147,7 +149,8 @@ def _read_config_file(config_path: str) -> ConfigFile: @property def config(self) -> ConfigFile: - """Get configuration. + """ + Get configuration. Returns: Configuration object. @@ -156,7 +159,8 @@ def config(self) -> ConfigFile: @property def db_engine(self) -> BaseEngine: - """Get database engine. + """ + Get database engine. Returns: Database engine. @@ -165,7 +169,8 @@ def db_engine(self) -> BaseEngine: @property def phc(self) -> PEPHubClient: - """Get PEPHub client. + """ + Get PEPHub client. Returns: PEPHub client. 
@@ -174,7 +179,8 @@ def phc(self) -> PEPHubClient: @property def boto3_client(self) -> boto3.client: - """Get boto3 client. + """ + Get boto3 client. Returns: Boto3 client. @@ -183,7 +189,8 @@ def boto3_client(self) -> boto3.client: @property def zarr_root(self) -> Z_GROUP | None: - """Get zarr root object (Group). + """ + Get zarr root object (Group). Returns: Zarr root group object. @@ -248,7 +255,8 @@ def _init_qdrant_client(self) -> QdrantClient: def _init_qdrant_file_backend( self, qdrant_cl: QdrantClient ) -> QdrantBackend | None: - """Create qdrant client object using credentials provided in config file. + """ + Create qdrant client object using credentials provided in config file. Args: qdrant_cl: QdrantClient object. @@ -277,7 +285,8 @@ def _init_qdrant_file_backend( def _init_qdrant_text_backend( self, qdrant_cl: QdrantClient, dense_encoder: TextEmbedding ) -> QdrantBackend | None: - """Create qdrant client text embedding object using credentials provided in config file. + """ + Create qdrant client text embedding object using credentials provided in config file. Args: qdrant_cl: QdrantClient object. @@ -314,7 +323,8 @@ def _init_qdrant_text_backend( def _init_qdrant_hybrid( self, qdrant_cl: QdrantClient, dense_encoder: TextEmbedding ) -> None: - """Create qdrant client with sparse and text embedding object using credentials provided in config file. + """ + Create qdrant client with sparse and text embedding object using credentials provided in config file. Args: qdrant_cl: QdrantClient object. @@ -396,7 +406,8 @@ def _init_bivec_interface( qdrant_text_backend: QdrantBackend, text_encoder: TextEmbedding, ) -> BiVectorSearchInterface | None: - """Create BiVectorSearchInterface object using credentials provided in config file. + """ + Create BiVectorSearchInterface object using credentials provided in config file. Args: qdrant_file_backend: QdrantBackend for file vectors. 
@@ -423,7 +434,8 @@ def _init_b2b_search_interface( qdrant_file_backend: QdrantBackend, region_encoder: Region2VecExModel | str, ) -> BED2BEDSearchInterface | None: - """Create Bed 2 BED search interface and return this object. + """ + Create Bed 2 BED search interface and return this object. Returns: Bed2BEDSearchInterface object. @@ -526,7 +538,8 @@ def _init_umap_model(self) -> UMAP | None: def _init_boto3_client( self, ) -> BaseClient | None: - """Create boto3 client object using credentials provided in config file. + """ + Create boto3 client object using credentials provided in config file. Returns: Boto3 client. @@ -544,7 +557,8 @@ def _init_boto3_client( return None def upload_s3(self, file_path: str, s3_path: Path | str) -> None: - """Upload file to s3. + """ + Upload file to s3. Args: file_path: Local path to the file. @@ -572,7 +586,8 @@ def upload_files_s3( base_path: str, type: Literal["files", "plots", "bedsets"] = "files", ) -> BedFiles | BedPlots | BedSetPlots: - """Upload files to s3. + """ + Upload files to s3. Args: identifier: Bed file identifier. @@ -627,7 +642,8 @@ def upload_files_s3( return files def delete_s3(self, s3_path: str) -> None: - """Delete file from s3. + """ + Delete file from s3. Args: s3_path: Path to the file in s3. @@ -653,7 +669,8 @@ def delete_s3(self, s3_path: str) -> None: ) def delete_files_s3(self, files: list[FileModel]) -> None: - """Delete files from s3. + """ + Delete files from s3. Args: files: List of file objects. @@ -669,7 +686,8 @@ def delete_files_s3(self, files: list[FileModel]) -> None: @staticmethod def _init_pephubclient() -> PEPHubClient | None: - """Create Pephub client object using credentials provided in config file. + """ + Create Pephub client object using credentials provided in config file. Returns: PephubClient. 
@@ -685,7 +703,8 @@ def _init_pephubclient() -> PEPHubClient | None: return None def get_prefixed_uri(self, postfix: str, access_id: str) -> str: - """Return uri with correct prefix (schema). + """ + Return uri with correct prefix (schema). Args: postfix: Postfix of the uri (or everything after uri schema). @@ -703,7 +722,8 @@ def get_prefixed_uri(self, postfix: str, access_id: str) -> str: raise BadAccessMethodError(f"Access method {access_id} is not defined.") def construct_access_method_list(self, rel_path: str) -> list[AccessMethod]: - """Construct access method list for a given record. + """ + Construct access method list for a given record. Args: rel_path: Relative path to the record. diff --git a/bbconf/config_parser/models.py b/bbconf/config_parser/models.py index d4499f7e..8f1d37ec 100644 --- a/bbconf/config_parser/models.py +++ b/bbconf/config_parser/models.py @@ -41,7 +41,8 @@ class ConfigDB(BaseModel): @computed_field @property def url(self) -> str: - """The URL of the database. + """ + The URL of the database. Returns: The URL of the database. @@ -105,7 +106,8 @@ def validate_aws_credentials(cls, value): @computed_field @property def modify_access(self) -> bool: - """If the AWS credentials are provided, set the modify access to True. (create = True) + """ + If the AWS credentials are provided, set the modify access to True. (create = True) Returns: True if AWS credentials are provided, False otherwise. @@ -137,7 +139,8 @@ class ConfigFile(BaseModel): @classmethod def from_yaml(cls, path: Path): - """Load the database configuration from a YAML file. + """ + Load the database configuration from a YAML file. Args: path: The path to the YAML file. diff --git a/bbconf/config_parser/utils.py b/bbconf/config_parser/utils.py index f489f5e7..cba2e364 100644 --- a/bbconf/config_parser/utils.py +++ b/bbconf/config_parser/utils.py @@ -12,7 +12,8 @@ def config_analyzer(config_path: str) -> bool: - """Read configuration file and insert default values if not set. 
+ """ + Read configuration file and insert default values if not set. Args: config_path: Configuration file path. diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index f453000e..050c7e61 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -36,7 +36,8 @@ class SchemaError(Exception): def __init__(self): super().__init__( - """The database schema is incorrect, can't connect to the database!""" + """ + The database schema is incorrect, can't connect to the database!""" ) @@ -613,7 +614,8 @@ def __init__( dsn: str | None = None, echo: bool = False, ): - """Initialize connection to the bedbase database. You can use the basic connection parameters + """ + Initialize connection to the bedbase database. You can use the basic connection parameters or libpq connection string. Args: @@ -641,7 +643,8 @@ def __init__( self.check_db_connection() def create_schema(self, engine=None): - """Create sql schema in the database. + """ + Create sql schema in the database. Args: engine: Sqlalchemy engine [Default: None]. @@ -662,7 +665,8 @@ def create_schema(self, engine=None): pass def delete_schema(self, engine=None) -> None: - """Delete sql schema in the database. + """ + Delete sql schema in the database. Args: engine: Sqlalchemy engine [Default: None]. @@ -676,7 +680,8 @@ def delete_schema(self, engine=None) -> None: return None def session_execute(self, statement: Select) -> Result: - """Execute statement using sqlalchemy statement. + """ + Execute statement using sqlalchemy statement. Args: statement: SQL query or a SQL expression that is constructed using @@ -693,7 +698,8 @@ def session_execute(self, statement: Select) -> Result: @property def session(self): - """Get a started sqlalchemy session. + """ + Get a started sqlalchemy session. Returns: Started sqlalchemy session. @@ -702,7 +708,8 @@ def session(self): @property def engine(self) -> Engine: - """Get sqlalchemy engine. + """ + Get sqlalchemy engine. Returns: Sqlalchemy engine. 
@@ -725,7 +732,8 @@ def check_db_connection(self): raise SchemaError() def create_schema_graph(self, output_file: str = "schema.svg"): - """Create schema graph of the database. + """ + Create schema graph of the database. Args: output_file: Path to the output file. diff --git a/bbconf/exceptions.py b/bbconf/exceptions.py index 3ce7c717..3ad393cc 100644 --- a/bbconf/exceptions.py +++ b/bbconf/exceptions.py @@ -2,7 +2,8 @@ class BedBaseConfError(Exception): - """Base exception type for this package""" + """ + Base exception type for this package""" __metaclass__ = abc.ABCMeta @@ -14,78 +15,91 @@ class BedbaseS3ConnectionError(BedBaseConfError): class BadAccessMethodError(BedBaseConfError): - """Access ID is not well defined""" + """ + Access ID is not well defined""" pass class BedBaseConnectionError(BedBaseConfError): - """Error type for DB connection problems""" + """ + Error type for DB connection problems""" pass class MissingThumbnailError(BedBaseConfError): - """Error type for missing thumbnail""" + """ + Error type for missing thumbnail""" pass class BedFIleExistsError(BedBaseConfError): - """Error where files exists, and should not be overwritten""" + """ + Error where files exists, and should not be overwritten""" pass class MissingObjectError(BedBaseConfError): - """Error type for missing object""" + """ + Error type for missing object""" pass class BEDFileNotFoundError(BedBaseConfError): - """Error type for missing bedfile""" + """ + Error type for missing bedfile""" pass class BedSetNotFoundError(BedBaseConfError): - """Error type for missing bedset""" + """ + Error type for missing bedset""" pass class BedSetExistsError(BedBaseConfError): - """Error type for existing bedset""" + """ + Error type for existing bedset""" pass class UniverseNotFoundError(BedBaseConfError): - """Error type for missing universe""" + """ + Error type for missing universe""" pass class TokenizeFileExistsError(BedBaseConfError): - """Error type for existing tokenize file""" + """ 
+ Error type for existing tokenize file""" pass class TokenizeFileNotExistError(BedBaseConfError): - """Error type for missing tokenize file""" + """ + Error type for missing tokenize file""" pass class QdrantInstanceNotInitializedError(BedBaseConfError): - """Error type for missing qdrant instance""" + """ + Error type for missing qdrant instance""" pass class BedSetTrackHubLimitError(BedBaseConfError): - """Limit for visualizing trackhub exceeded""" + """ + Limit for visualizing trackhub exceeded""" pass diff --git a/bbconf/helpers.py b/bbconf/helpers.py index 2d0cf8fd..47dd056b 100644 --- a/bbconf/helpers.py +++ b/bbconf/helpers.py @@ -12,7 +12,8 @@ def get_bedbase_cfg(cfg: str = None) -> str: - """Determine path to the bedbase configuration file. + """ + Determine path to the bedbase configuration file. The path can be either explicitly provided or read from a $BEDBASE environment variable. @@ -43,7 +44,8 @@ def get_bedbase_cfg(cfg: str = None) -> str: def get_absolute_path(path: str, base_path: str) -> str: - """Get absolute path to the file and create it if it doesn't exist. + """ + Get absolute path to the file and create it if it doesn't exist. Args: path: Path to the file (abs or relative). diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index f38a2a45..6f3e093b 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -81,7 +81,8 @@ class BedAgentBedFile: """ def __init__(self, config: BedBaseConfig, bbagent_obj=None): - """Initialize BedAgentBedFile. + """ + Initialize BedAgentBedFile. Args: config: Config object with database and qdrant engine and credentials. @@ -94,7 +95,8 @@ def __init__(self, config: BedBaseConfig, bbagent_obj=None): self.bb_agent = bbagent_obj def get(self, identifier: str, full: bool = False) -> BedMetadataAll: - """Get file metadata by identifier. + """ + Get file metadata by identifier. Args: identifier: Bed file identifier. 
@@ -213,7 +215,8 @@ def get(self, identifier: str, full: bool = False) -> BedMetadataAll: ) def get_stats(self, identifier: str) -> BedStatsModel: - """Get file statistics by identifier. + """ + Get file statistics by identifier. Args: identifier: Bed file identifier. @@ -232,7 +235,8 @@ def get_stats(self, identifier: str) -> BedStatsModel: return bed_stats def get_plots(self, identifier: str) -> BedPlots: - """Get file plots by identifier. + """ + Get file plots by identifier. Args: identifier: Bed file identifier. @@ -265,7 +269,8 @@ def get_plots(self, identifier: str) -> BedPlots: def get_neighbours( self, identifier: str, limit: int = 10, offset: int = 0 ) -> BedListSearchResult: - """Get nearest neighbours of bed file from qdrant. + """ + Get nearest neighbours of bed file from qdrant. Args: identifier: Bed file identifier. @@ -310,7 +315,8 @@ def get_neighbours( ) def get_files(self, identifier: str) -> BedFiles: - """Get file files by identifier. + """ + Get file files by identifier. Args: identifier: Bed file identifier. @@ -342,7 +348,8 @@ def get_files(self, identifier: str) -> BedFiles: return bed_files def get_raw_metadata(self, identifier: str) -> BedPEPHub: - """Get file metadata by identifier. + """ + Get file metadata by identifier. Args: identifier: Bed file identifier. @@ -363,7 +370,8 @@ def get_raw_metadata(self, identifier: str) -> BedPEPHub: return BedPEPHubRestrict(**bed_metadata) def get_classification(self, identifier: str) -> BedClassification: - """Get file classification by identifier. + """ + Get file classification by identifier. Args: identifier: Bed file identifier. @@ -382,7 +390,8 @@ def get_classification(self, identifier: str) -> BedClassification: return bed_classification def get_objects(self, identifier: str) -> dict[str, FileModel]: - """Get all object related to bedfile. + """ + Get all object related to bedfile. Args: identifier: Bed file identifier. 
@@ -403,7 +412,8 @@ def get_objects(self, identifier: str) -> dict[str, FileModel]: return return_dict def get_embedding(self, identifier: str) -> BedEmbeddingResult: - """Get bed file embedding of bed file from qdrant. + """ + Get bed file embedding of bed file from qdrant. Args: identifier: Bed file identifier. @@ -434,7 +444,8 @@ def get_ids_list( genome: str = None, bed_compliance: str = None, ) -> BedListResult: - """Get list of bed file identifiers. + """ + Get list of bed file identifiers. Args: limit: Number of results to return. @@ -482,7 +493,8 @@ def get_ids_list( ) def get_reference_validation(self, identifier: str) -> RefGenValidReturnModel: - """Get results of reference genome validation for the bed file. + """ + Get results of reference genome validation for the bed file. Args: identifier: Bed file identifier. @@ -552,7 +564,8 @@ def add( nofail: bool = False, processed: bool = True, ) -> None: - """Add bed file to the database. + """ + Add bed file to the database. Args: identifier: Bed file identifier. @@ -752,7 +765,8 @@ def update( nofail: bool = False, processed: bool = False, ) -> None: - """Update bed file to the database. + """ + Update bed file to the database. Args: identifier: Bed file identifier. @@ -869,7 +883,8 @@ def update( def _update_classification( sa_session: Session, bed_object: Bed, classification: BedClassification ) -> None: - """Update bed file classification. + """ + Update bed file classification. Args: sa_session: Sqlalchemy session. @@ -891,7 +906,8 @@ def _update_classification( def _update_stats( sa_session: Session, bed_object: Bed, stats: BedStatsModel ) -> None: - """Update bed file statistics. + """ + Update bed file statistics. Args: sa_session: Sqlalchemy session. @@ -917,7 +933,8 @@ def _update_stats( def _update_metadata( self, sa_session: Session, bed_object: Bed, bed_metadata: StandardMeta ) -> None: - """Update bed file metadata. + """ + Update bed file metadata. Args: sa_session: Sqlalchemy session. 
@@ -958,7 +975,8 @@ def _update_plots( plots: BedPlots, local_path: str = None, ) -> None: - """Update bed file plots. + """ + Update bed file plots. Args: sa_session: Sqlalchemy session. @@ -1007,7 +1025,8 @@ def _update_files( files: BedFiles, local_path: str = None, ) -> None: - """Update bed files. + """ + Update bed files. Args: sa_session: Sqlalchemy session. @@ -1055,7 +1074,8 @@ def _update_ref_validation( ref_validation: dict[str, BaseModel], provided_genome: str = "", ) -> None: - """Update reference validation data. + """ + Update reference validation data. This function won't update the reference validation data, if it exists, it will skip it. @@ -1114,7 +1134,8 @@ def _create_ref_validation_models( return new_gen_refs def delete(self, identifier: str) -> None: - """Delete bed file from the database. + """ + Delete bed file from the database. Args: identifier: Bed file identifier. @@ -1174,7 +1195,8 @@ def update_pephub( _LOGGER.warning(f"Could not update pephub. Error: {e}") def delete_pephub_sample(self, identifier: str): - """Delete sample from pephub. + """ + Delete sample from pephub. Args: identifier: Bed file identifier. @@ -1195,7 +1217,8 @@ def upload_file_qdrant( bed_file: str | GRegionSet, payload: dict = None, ) -> None: - """Convert bed file to vector and add it to qdrant database. + """ + Convert bed file to vector and add it to qdrant database. Warning: only hg38 genome can be added to qdrant! @@ -1223,7 +1246,8 @@ def upload_file_qdrant( return None def _embed_file(self, bed_file: str | GRegionSet) -> np.ndarray: - """Create embedding for bed file. + """ + Create embedding for bed file. Args: bed_file: Path to the bed file, or RegionSet object. @@ -1255,7 +1279,8 @@ def _embed_file(self, bed_file: str | GRegionSet) -> np.ndarray: return bed_embedding.reshape(1, vec_dim) def _get_umap_file(self, bed_file: str | GRegionSet) -> np.ndarray: - """Create UMAP for bed file. + """ + Create UMAP for bed file. 
Args: bed_file: Bed file path or region set. @@ -1275,7 +1300,8 @@ def text_to_bed_search( offset: int = 0, with_metadata: bool = True, ) -> BedListSearchResult: - """Search for bed files by text query in qdrant database. + """ + Search for bed files by text query in qdrant database. This is bivec_search. Args: @@ -1330,7 +1356,8 @@ def bed_to_bed_search( limit: int = 10, offset: int = 0, ) -> BedListSearchResult: - """Search for bed files by using region set in qdrant database. + """ + Search for bed files by using region set in qdrant database. Args: region_set: RegionSet object to search for (bed file). @@ -1370,7 +1397,8 @@ def sql_search( limit: int = 10, offset: int = 0, ) -> BedListSearchResult: - """Search for bed files by using sql exact search. + """ + Search for bed files by using sql exact search. This search will search files by id, name, and description. Args: @@ -1443,7 +1471,8 @@ def sql_search( ) def _sql_search_count(self, condition_statement) -> int: - """Get number of total found files in the database. + """ + Get number of total found files in the database. Args: condition_statement: Sql alchemy condition statement to filter results. @@ -1463,7 +1492,8 @@ def _sql_search_count(self, condition_statement) -> int: return count[0] def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: - """Re-upload all files to quadrant. + """ + Re-upload all files to quadrant. Warning: only hg38 genome can be added to qdrant! If you want to fully reindex/reupload to qdrant, first delete collection and create new one. @@ -1575,7 +1605,8 @@ def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: return None def delete_qdrant_point(self, identifier: str) -> None: - """Delete bed file from qdrant. + """ + Delete bed file from qdrant. Args: identifier: Bed file identifier. 
@@ -1597,7 +1628,8 @@ def delete_qdrant_point(self, identifier: str) -> None: return None def exists(self, identifier: str) -> bool: - """Check if bed file exists in the database. + """ + Check if bed file exists in the database. Args: identifier: Bed file identifier. @@ -1614,7 +1646,8 @@ def exists(self, identifier: str) -> bool: return True def exists_universe(self, identifier: str) -> bool: - """Check if universe exists in the database. + """ + Check if universe exists in the database. Args: identifier: Universe identifier. @@ -1633,7 +1666,8 @@ def exists_universe(self, identifier: str) -> bool: def add_universe( self, bedfile_id: str, bedset_id: str = None, construct_method: str = None ) -> str: - """Add universe to the database. + """ + Add universe to the database. Args: bedfile_id: Bed file identifier. @@ -1657,7 +1691,8 @@ def add_universe( return bedfile_id def delete_universe(self, identifier: str) -> None: - """Delete universe from the database. + """ + Delete universe from the database. Args: identifier: Universe identifier. @@ -1676,7 +1711,8 @@ def delete_universe(self, identifier: str) -> None: def add_tokenized( self, bed_id: str, universe_id: str, token_vector: list, overwrite: bool = False ) -> str: - """Add tokenized bed file to the database. + """ + Add tokenized bed file to the database. Args: bed_id: Bed file identifier. @@ -1724,7 +1760,8 @@ def _add_zarr_s3( tokenized_vector: list, overwrite: bool = False, ) -> str: - """Add zarr file to the database. + """ + Add zarr file to the database. Args: universe_id: Universe identifier. @@ -1753,7 +1790,8 @@ def _add_zarr_s3( return str(os.path.join(ZARR_TOKENIZED_FOLDER, path)) def get_tokenized(self, bed_id: str, universe_id: str) -> TokenizedBedResponse: - """Get zarr file from the database. + """ + Get zarr file from the database. Args: bed_id: Bed file identifier. 
@@ -1774,7 +1812,8 @@ def get_tokenized(self, bed_id: str, universe_id: str) -> TokenizedBedResponse: ) def delete_tokenized(self, bed_id: str, universe_id: str) -> None: - """Delete tokenized bed file from the database. + """ + Delete tokenized bed file from the database. Args: bed_id: Bed file identifier. @@ -1802,7 +1841,8 @@ def delete_tokenized(self, bed_id: str, universe_id: str) -> None: return None def _get_tokenized_path(self, bed_id: str, universe_id: str) -> str: - """Get tokenized path to tokenized file. + """ + Get tokenized path to tokenized file. Args: bed_id: Bed file identifier. @@ -1825,7 +1865,8 @@ def _get_tokenized_path(self, bed_id: str, universe_id: str) -> str: return str(tokenized_object.path) def exist_tokenized(self, bed_id: str, universe_id: str) -> bool: - """Check if tokenized bed file exists in the database. + """ + Check if tokenized bed file exists in the database. Args: bed_id: Bed file identifier. @@ -1849,7 +1890,8 @@ def exist_tokenized(self, bed_id: str, universe_id: str) -> bool: def get_tokenized_link( self, bed_id: str, universe_id: str ) -> TokenizedPathResponse: - """Get tokenized link to tokenized file. + """ + Get tokenized link to tokenized file. Args: bed_id: Bed file identifier. @@ -1873,7 +1915,8 @@ def get_tokenized_link( def get_missing_plots( self, plot_name: str, limit: int = 1000, offset: int = 0 ) -> list[str]: - """Get list of bed files that are missing plot. + """ + Get list of bed files that are missing plot. Args: plot_name: Plot name. @@ -1912,7 +1955,8 @@ def get_missing_plots( return results def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> list[str]: - """Get list of bed files that are missing statistics. + """ + Get list of bed files that are missing statistics. Args: limit: Number of results to return. 
@@ -1937,7 +1981,8 @@ def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> list[str]: return results def get_missing_files(self, limit: int = 1000, offset: int = 0) -> list[str]: - """Get list of bed files that are missing files (bigBed files). + """ + Get list of bed files that are missing files (bigBed files). Args: limit: Number of results to return. @@ -1971,7 +2016,8 @@ def get_missing_files(self, limit: int = 1000, offset: int = 0) -> list[str]: def get_unprocessed( self, limit: int = 1000, offset: int = 0, genome: str | list | None = None ) -> BedListResult: - """Get bed files that are not processed. + """ + Get bed files that are not processed. Args: limit: Number of results to return. @@ -2040,7 +2086,8 @@ def _update_sources( global_sample_id: list[str] | None = None, global_experiment_id: list[str] | None = None, ) -> None: - """Add global sample and experiment ids to the bed file if they are missing. + """ + Add global sample and experiment ids to the bed file if they are missing. Args: identifier: Bed file identifier. @@ -2078,7 +2125,8 @@ def _update_sources( session.commit() def reindex_hybrid_search(self, batch: int = 1000, purge: bool = False) -> None: - """Reindex all bed files for semantic database. + """ + Reindex all bed files for semantic database. Args: batch: Number of files to upload in one batch. @@ -2209,7 +2257,8 @@ def hybrid_search( offset: int = 0, with_metadata: bool = True, ) -> BedListSearchResult: - """Run semantic search for bed files using qdrant. + """ + Run semantic search for bed files using qdrant. This is not bivec search, but usual qdrant search with sparse and dense embeddings. Args: @@ -2314,7 +2363,8 @@ def hybrid_search( ) def search_external_file(self, source: str, accession: str) -> BedListSearchResult: - """Search for bed files by external source and accession number. + """ + Search for bed files by external source and accession number. e.g. source='geo', accession='GSE12345'. 
Args: diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 335d251b..346f9717 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -38,7 +38,8 @@ class BedAgentBedSet: """ def __init__(self, config: BedBaseConfig): - """Initialize BedAgentBedSet. + """ + Initialize BedAgentBedSet. Args: config: Config object. @@ -47,7 +48,8 @@ def __init__(self, config: BedBaseConfig): self._db_engine = self.config.db_engine def get(self, identifier: str, full: bool = False) -> BedSetMetadata: - """Get file metadata by identifier. + """ + Get file metadata by identifier. Args: identifier: Bed file identifier. @@ -96,7 +98,8 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata: return bedset_metadata def get_plots(self, identifier: str) -> BedSetPlots: - """Get plots for bedset by identifier. + """ + Get plots for bedset by identifier. Args: identifier: Bedset identifier. @@ -127,7 +130,8 @@ def get_plots(self, identifier: str) -> BedSetPlots: return bedset_files def get_objects(self, identifier: str) -> dict[str, FileModel]: - """Get objects for bedset by identifier. + """ + Get objects for bedset by identifier. Args: identifier: Bedset identifier. @@ -154,7 +158,8 @@ def get_objects(self, identifier: str) -> dict[str, FileModel]: return return_dict def get_statistics(self, identifier: str) -> BedSetStats: - """Get statistics for bedset by identifier. + """ + Get statistics for bedset by identifier. Args: identifier: Bedset identifier. @@ -173,7 +178,8 @@ def get_statistics(self, identifier: str) -> BedSetStats: ) def get_bedset_pep(self, identifier: str) -> dict: - """Create pep file for a bedset. + """ + Create pep file for a bedset. Args: identifier: Bedset identifier. @@ -232,7 +238,8 @@ def get_bedset_pep(self, identifier: str) -> dict: } def get_track_hub_file(self, identifier: str) -> str: - """Get track hub file for bedset. + """ + Get track hub file for bedset. Args: identifier: Bedset identifier. 
@@ -305,7 +312,8 @@ def create( overwrite: bool = False, processed: bool = True, ) -> None: - """Create bedset in the database. + """ + Create bedset in the database. Args: identifier: Bedset identifier. @@ -408,7 +416,8 @@ def create( return None def _calculate_statistics(self, bed_ids: list[str]) -> BedSetStats: - """Calculate statistics for bedset. + """ + Calculate statistics for bedset. Args: bed_ids: List of bed file identifiers. @@ -457,7 +466,8 @@ def _create_pephub_view( bed_ids: list = None, nofail: bool = False, ) -> None: - """Create view in pephub for bedset. + """ + Create view in pephub for bedset. Args: bedset_id: Bedset identifier. @@ -488,7 +498,8 @@ def _create_pephub_view( def get_ids_list( self, query: str = None, limit: int = 10, offset: int = 0 ) -> BedSetListResult: - """Get list of bedsets from the database. + """ + Get list of bedsets from the database. Args: query: Search query. @@ -531,7 +542,8 @@ def get_ids_list( ) def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles: - """Get list of bedfiles in bedset. + """ + Get list of bedfiles in bedset. Args: identifier: Bedset identifier. @@ -566,7 +578,8 @@ def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles: ) def delete(self, identifier: str) -> None: - """Delete bed file from the database. + """ + Delete bed file from the database. Args: identifier: Bedset identifier. @@ -593,7 +606,8 @@ def delete(self, identifier: str) -> None: self.config.delete_files_s3(files) def delete_phc_view(self, identifier: str, nofail: bool = False) -> None: - """Delete view in pephub. + """ + Delete view in pephub. Args: identifier: Bedset identifier. @@ -617,7 +631,8 @@ def delete_phc_view(self, identifier: str, nofail: bool = False) -> None: return None def exists(self, identifier: str) -> bool: - """Check if bedset exists in the database. + """ + Check if bedset exists in the database. Args: identifier: Bedset identifier. 
@@ -633,7 +648,8 @@ def exists(self, identifier: str) -> bool: return False def get_unprocessed(self, limit: int = 100, offset: int = 0) -> BedSetListResult: - """Get unprocessed bedset from the database. + """ + Get unprocessed bedset from the database. Args: limit: Limit of results. diff --git a/bbconf/modules/objects.py b/bbconf/modules/objects.py index 94e5e00f..bf01a246 100644 --- a/bbconf/modules/objects.py +++ b/bbconf/modules/objects.py @@ -21,7 +21,8 @@ class BBObjects: """ """ def __init__(self, config: BedBaseConfig): - """Initialize BBObjects. + """ + Initialize BBObjects. Args: config: Config object. @@ -37,7 +38,8 @@ def get_thumbnail_uri( result_id: str, access_id: str = "http", ) -> str: - """Create URL to access a bed- or bedset-associated thumbnail. + """ + Create URL to access a bed- or bedset-associated thumbnail. Args: record_type: Table name ["bed", "bedset"]. @@ -67,7 +69,8 @@ def get_object_uri( result_id: str, access_id: str, ) -> str: - """Create URL to access a bed- or bedset-associated file. + """ + Create URL to access a bed- or bedset-associated file. Args: record_type: Table name ["bed", "bedset"]. @@ -87,7 +90,8 @@ def _get_result( record_id: str, result_id: str | list[str], ) -> FileModel: - """Generic getter that can return a result from either bed or bedset. + """ + Generic getter that can return a result from either bed or bedset. Args: record_type: Table name ["bed", "bedset"]. @@ -130,7 +134,8 @@ def get_drs_metadata( result_id: str, base_uri: str, ) -> DRSModel: - """Get DRS metadata for a bed- or bedset-associated file. + """ + Get DRS metadata for a bed- or bedset-associated file. Args: record_type: Bed or bedset. @@ -170,7 +175,8 @@ def construct_drs_metadata( created_time: datetime.datetime | None = None, modified_time: datetime.datetime | None = None, ) -> DRSModel: - """Construct DRS metadata object. + """ + Construct DRS metadata object. 
Args: base_uri: Base uri to use for the self_uri field (server hostname of DRS broker). From c315bf3464d8ba9bd5fe49b7a81f38e936add843 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 26 Feb 2026 13:12:13 -0500 Subject: [PATCH 6/9] Updated incorrect docstrings --- bbconf/bbagent.py | 4 ++-- bbconf/config_parser/bedbaseconfig.py | 10 ++++------ bbconf/helpers.py | 2 +- bbconf/modules/bedfiles.py | 6 +++--- bbconf/modules/bedsets.py | 2 +- bbconf/modules/objects.py | 2 +- 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index b7d07eb6..6248f6d8 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -361,10 +361,10 @@ def get_list_genomes(self) -> list[str]: def get_list_assays(self) -> list[str]: """ - Get list of genomes from the database. + Get list of assays from the database. Returns: - List of genomes. + List of assays. """ with Session(self.config.db_engine.engine) as session: diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index f61ecad0..e30397e2 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -128,10 +128,8 @@ def _read_config_file(config_path: str) -> ConfigFile: Returns: Parsed ConfigFile object. - - Raises: - raise_missing_key: If config key is missing. """ + _config = yacman.YAMLConfigManager(filepath=config_path).exp config_dict = {} @@ -262,7 +260,7 @@ def _init_qdrant_file_backend( qdrant_cl: QdrantClient object. Returns: - QdrantClient. + QdrantClient or None """ _LOGGER.info("Initializing qdrant bivec file backend...") @@ -293,7 +291,7 @@ def _init_qdrant_text_backend( dense_encoder: TextEmbedding model for encoding text queries. Returns: - QdrantClient. + QdrantClient or None """ _LOGGER.info("Initializing qdrant bivec text backend...") @@ -331,7 +329,7 @@ def _init_qdrant_hybrid( dense_encoder: TextEmbedding model for encoding text queries. Returns: - QdrantClient. 
+ None. Initializes or creates the hybrid collection on the provided QdrantClient. """ _LOGGER.info("Initializing qdrant sparse collection...") diff --git a/bbconf/helpers.py b/bbconf/helpers.py index 47dd056b..37565223 100644 --- a/bbconf/helpers.py +++ b/bbconf/helpers.py @@ -45,7 +45,7 @@ def get_bedbase_cfg(cfg: str = None) -> str: def get_absolute_path(path: str, base_path: str) -> str: """ - Get absolute path to the file and create it if it doesn't exist. + Get absolute path to the file. Args: path: Path to the file (abs or relative). diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 6f3e093b..0d3fc9b7 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -103,7 +103,7 @@ def get(self, identifier: str, full: bool = False) -> BedMetadataAll: full: If True, return full metadata, including statistics, files, and raw metadata from pephub. Returns: - Project metadata. + BED file metadata. """ statement = select(Bed).where(and_(Bed.id == identifier)) @@ -355,7 +355,7 @@ def get_raw_metadata(self, identifier: str) -> BedPEPHub: identifier: Bed file identifier. Returns: - Project metadata. + BED file raw metadata. """ try: bed_metadata = self.config.phc.sample.get( @@ -1493,7 +1493,7 @@ def _sql_search_count(self, condition_statement) -> int: def reindex_qdrant(self, batch: int = 100, purge: bool = False) -> None: """ - Re-upload all files to quadrant. + Re-upload all files to qdrant. Warning: only hg38 genome can be added to qdrant! If you want to fully reindex/reupload to qdrant, first delete collection and create new one. diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 346f9717..83566ac3 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -56,7 +56,7 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata: full: Return full record with stats, plots, files and metadata. Returns: - Project metadata. + Bedset metadata. 
""" statement = select(BedSets).where(BedSets.id == identifier) diff --git a/bbconf/modules/objects.py b/bbconf/modules/objects.py index bf01a246..14dd66d7 100644 --- a/bbconf/modules/objects.py +++ b/bbconf/modules/objects.py @@ -99,7 +99,7 @@ def _get_result( result_id: Column name (result name). e.g. "bigbedfile", "bed_file", "open_chromatin". Returns: - Pipestat result. + FileModel instance representing the requested bed/bedset file object. """ if record_type == "bed": try: From a068e39ae39cd61ee554bbe3abb3747c23de1e14 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 26 Feb 2026 13:59:33 -0500 Subject: [PATCH 7/9] Updated changelog and removed old files --- bbconf/_version.py | 0 docs/changelog.md | 27 ++++++++++++++++++++++++++- pyproject.toml | 3 ++- requirements/requirements-all.txt | 0 4 files changed, 28 insertions(+), 2 deletions(-) delete mode 100644 bbconf/_version.py delete mode 100644 requirements/requirements-all.txt diff --git a/bbconf/_version.py b/bbconf/_version.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/changelog.md b/docs/changelog.md index b7a2d3c1..4a164cce 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,26 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
+### [0.14.8] - 2026-02-17 +### Changed: +- Updated versions of dependencies + +### [0.14.7] - 2026-02-16 +### Changed: +- Updated requirements + +### [0.14.6] - 2026-02-06 +### Fixed: +- Fixed qdrant upload exception catching + +### [0.14.5] - 2026-02-05 +### Changed: +- Updated reindexing script + +### [0.14.4] - 2026-02-04 +### Changed: +- Updated reindexing of bed files to use only verified genome digests + ### [0.14.3] - 2026-01-31 ### Added: - Cell line to detailed bedbase statistics @@ -11,7 +31,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### [0.14.2] - 2026-01-21 ### Added: -- Added method that fetches available reference genomes +- Added method that fetches available reference genomes + +### [0.14.1] - 2025-12-22 +### Fixed: +- Fixed hybrid search reindexing +- Updated limits in reindexing ### [0.14.0] - 2025-12-18 ### Fixed: diff --git a/pyproject.toml b/pyproject.toml index 3c93c470..bc07ce84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bbconf" -version = "0.14.7" +version = "0.14.8" description = "Configuration and data management tool for BEDbase" readme = "README.md" license = "BSD-2-Clause" @@ -36,6 +36,7 @@ dependencies = [ "pybiocfilecache == 0.6.1", "umap-learn >= 0.5.8", "qdrant_client >= 1.16.1", + "setuptools < 70.0.0", ] [project.urls] diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index e69de29b..00000000 From c2efd919e359217d2f88d6b15df83726641cbcef Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 26 Feb 2026 14:06:30 -0500 Subject: [PATCH 8/9] Updated changelog and version --- docs/changelog.md | 8 ++++++++ pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 4a164cce..39ca0b25 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,14 @@ This project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. + +### [0.14.9] - 2026-02-26 +### Changed: +- Modernized docstrings +- Updated type annotations for Python 3.10+ +- Updated requirements +- Switched package installation to pyproject.toml with hatchling + ### [0.14.8] - 2026-02-17 ### Changed: - Updated versions of dependencies diff --git a/pyproject.toml b/pyproject.toml index bc07ce84..3d331252 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bbconf" -version = "0.14.8" +version = "0.14.9" description = "Configuration and data management tool for BEDbase" readme = "README.md" license = "BSD-2-Clause" From da100144c15fd84b8eb3b6f06f44676e1779eeca Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 26 Feb 2026 14:21:54 -0500 Subject: [PATCH 9/9] Updated README --- README.md | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9b1358e0..227e40f1 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,13 @@ ## What is this? -`bbconf` is a configuration and management tool for BEDbase, facilitating the reading of configuration files, -setting up connections to PostgreSQL, PEPhub, S3, and Qdrant databases, managing file paths, and storing transformer models. -It formalizes communication pathways for pipelines and downstream tools, ensuring seamless interaction." +`bbconf` is a configuration and data management library for the [BEDbase](https://bedbase.org) platform. 
It serves as the central backbone for all BEDbase tools and pipelines by: + +- Reading and validating YAML configuration files +- Setting up and managing connections to PostgreSQL, Qdrant, S3, and PEPhub +- Loading ML models (Region2Vec, text embedders, sparse encoders, UMAP) used for BED file search +- Providing high-level Python interfaces for querying and managing BED files and BED sets +- Exposing a unified `BedBaseAgent` object that all downstream tools use to interact with the platform --- @@ -28,10 +32,27 @@ It formalizes communication pathways for pipelines and downstream tools, ensurin To install `bbclient` use this command: ``` -pip install bbclient +pip install bbconf ``` or install the latest version from the GitHub repository: ``` pip install git+https://github.com/databio/bbconf.git ``` + +## Quick start + +```python +from bbconf import BedBaseAgent + +agent = BedBaseAgent(config="config.yaml") + +# Access submodules +agent.bed # BED file operations +agent.bedset # BED set operations +agent.objects # Generic object/file operations + +# Get platform statistics +stats = agent.get_stats() +print(stats.bedfiles_number, stats.bedsets_number) +```