diff --git a/README.md b/README.md index 2e47a39..d1b8585 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Call ``convert_coordinate()``: ```python3 c.convert_coordinate("chr7", 140453136, 140453137, Strand.POSITIVE) -# returns [LiftoverResult(chrom='chr7', start=140753336, end=140753337, strand=)] +# returns [LiftoverResult(chrom='chr7', start=140753336, end=140753337, strand=, score=14633688187)] ``` ## Development diff --git a/analysis/requirements.txt b/analysis/requirements.txt new file mode 100644 index 0000000..6ff7e99 --- /dev/null +++ b/analysis/requirements.txt @@ -0,0 +1,48 @@ +-e file:///Users/jss009/code/agct +appnope==0.1.4 +asttokens==3.0.1 +certifi==2025.6.15 +charset-normalizer==3.4.2 +click==8.2.1 +comm==0.2.3 +coverage==7.9.1 +debugpy==1.8.20 +decorator==5.2.1 +executing==2.2.1 +idna==3.10 +iniconfig==2.1.0 +ipykernel==7.2.0 +ipython==9.11.0 +ipython-pygments-lexers==1.1.1 +jedi==0.19.2 +jupyter-client==8.8.0 +jupyter-core==5.9.1 +matplotlib-inline==0.2.1 +maturin==1.8.7 +nest-asyncio==1.6.0 +packaging==25.0 +parso==0.8.6 +pexpect==4.9.0 +platformdirs==4.9.4 +pluggy==1.6.0 +prek==0.2.24 +prompt-toolkit==3.0.52 +psutil==7.2.2 +ptyprocess==0.7.0 +pure-eval==0.2.3 +pygments==2.19.2 +pyliftover==0.4.1 +pytest==8.4.1 +pytest-cov==6.2.1 +python-dateutil==2.9.0.post0 +pyzmq==27.1.0 +requests==2.32.4 +ruff==0.14.10 +six==1.17.0 +stack-data==0.6.3 +tornado==6.5.5 +tqdm==4.67.1 +traitlets==5.14.3 +urllib3==2.5.0 +wags-tails==0.3.2 +wcwidth==0.6.0 diff --git a/analysis/speed_test.ipynb b/analysis/speed_test.ipynb index d5230f7..94991f8 100644 --- a/analysis/speed_test.ipynb +++ b/analysis/speed_test.ipynb @@ -5,19 +5,22 @@ "id": "e70c802c-9a54-41e2-a928-7ea685017195", "metadata": {}, "source": [ + "We don't necessarily intend to keep this up to date; this notebook can be reproduced as of the commit that it's associated with.\n", + "\n", "Don't forget to run `maturin develop --release`!" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "id": "856fbc60-6896-4824-8956-b466e29ddc69", "metadata": {}, "outputs": [], "source": [ "from pyliftover import LiftOver\n", "\n", - "from agct import Converter" + "from agct import Converter\n", + "from agct.seqref_registry import Assembly" ] }, { @@ -46,7 +49,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.11 s ± 26.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "498 ms ± 6.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -57,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "id": "2332acf9-c6eb-42ac-8426-fe46eba132a0", "metadata": {}, "outputs": [ @@ -65,13 +68,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "217 ms ± 9.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "50.9 ms ± 807 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", - "converter = Converter(\"hg38\", \"hg19\")" + "converter = Converter(Assembly.HG38, Assembly.HG19)" ] }, { @@ -84,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "id": "ae47dbbb-c9ad-46b4-9f4d-14823bbf26f2", "metadata": {}, "outputs": [ @@ -92,7 +95,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.09 s ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "512 ms ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -104,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "id": "729abc9a-6429-4cb6-aaca-4bac4cd762f8", "metadata": {}, "outputs": [ @@ -112,14 +115,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "215 ms ± 6.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "50.6 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", - "converter = Converter(\"hg38\", \"hg19\")\n", - "converter.convert_coordinate(\"chr5\", 1404391, \"+\")" + "converter = Converter(Assembly.HG38, Assembly.HG19)\n", + "converter.convert_coordinate(\"chr5\", 1404391, 1404391, \"+\")" ] }, { @@ -132,19 +135,19 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "id": "7c51c0cb-e997-4ece-8a68-8e946b9dde3d", "metadata": {}, "outputs": [], "source": [ "# load beforehand\n", "pyl = LiftOver(\"hg38\", \"hg19\")\n", - "converter = Converter(\"hg38\", \"hg19\")" + "converter = Converter(Assembly.HG38, Assembly.HG19)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 15, "id": "b68a6af6-1a89-414f-b4f1-2378f3c4f50d", "metadata": {}, "outputs": [ @@ -152,7 +155,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.97 µs ± 72.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" + "1.04 μs ± 27.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)\n" ] } ], @@ -163,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 16, "id": "e6643c38-f9f5-4ed6-a277-9642e731e6bc", "metadata": {}, "outputs": [ @@ -171,13 +174,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.77 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" + "713 ns ± 9.94 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)\n" ] } ], "source": [ "%%timeit\n", - "converter.convert_coordinate(\"chr5\", 1404391, \"+\")" + "converter.convert_coordinate(\"chr5\", 1404391, 1404391, \"+\")" ] }, { @@ -198,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "id": "181231d8-3f00-407b-86a3-1ae268d4d931", "metadata": {}, "outputs": [ @@ -206,7 +209,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "303 ms ± 13.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "149 ms ± 2.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -217,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "id": "7822b8e7-939d-4b6d-85f6-514999c55602", "metadata": {}, "outputs": [ @@ -225,13 +228,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "62.6 ms ± 2.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "8.95 ms ± 73.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit\n", - "converter = Converter(\"hg19\", \"hg38\")" + "converter = Converter(Assembly.HG19, Assembly.HG38)" ] }, { @@ -244,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 19, "id": "0ab6aeb8-88da-45e8-9efe-8b4a7825bcff", "metadata": {}, "outputs": [ @@ -252,7 +255,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "318 ms ± 15.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "151 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -264,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 21, "id": "a014955a-5451-4b10-83bd-4d4843ddc227", "metadata": {}, "outputs": [ @@ -272,14 +275,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "57.8 ms ± 742 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "8.68 ms ± 59.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit\n", - "converter = Converter(\"hg19\", \"hg38\")\n", - "converter.convert_coordinate(\"chr5\", 1404391, \"+\")" + "converter = Converter(Assembly.HG19, Assembly.HG38)\n", + "converter.convert_coordinate(\"chr5\", 1404391, 1404391, \"+\")" ] }, { @@ -292,19 +295,19 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 22, "id": "a516e0a8-02b1-4ce5-8e69-3d0badcaffa4", "metadata": {}, "outputs": [], "source": [ "# load beforehand\n", "pyl = LiftOver(\"hg19\", \"hg38\")\n", - "converter = Converter(\"hg19\", \"hg38\")" + "converter = Converter(Assembly.HG19, Assembly.HG38)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 23, "id": "d433fee3-9c40-4633-be5e-2649b723968e", "metadata": {}, "outputs": [ @@ -312,7 +315,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.16 µs ± 232 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)\n" + "1.06 μs ± 7.62 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)\n" ] } ], @@ -323,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 24, "id": "5bb82164-e123-47ae-a748-7d4c4898fcb2", "metadata": {}, "outputs": [ @@ -331,21 +334,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.87 µs ± 65 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" + "718 ns ± 1.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)\n" ] } ], "source": [ "%%timeit\n", - "converter.convert_coordinate(\"chr5\", 1404391, \"+\")" + "converter.convert_coordinate(\"chr5\", 1404391, 1404391, \"+\")" ] } ], "metadata": { "kernelspec": { - "display_name": "liftovertest", + "display_name": "agct", "language": "python", - "name": "liftovertest" + "name": "agct" }, "language_info": { "codemirror_mode": { @@ -357,7 +360,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.13.2" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 32765d3..fe24b7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agct" -version = "0.2.0" +version = "0.2.1" authors = [ {name = "James Stevenson"}, {name = "Kori Kuzma"}, diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 13c128e..b3acdfd 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -8,7 +8,7 @@ name = "agct" crate-type = ["cdylib"] [dependencies] -chainfile = "0.3.0" +chainfile = "0.4.0" directories = "5.0" -omics = { version = "0.2.0", features = ["coordinate"] } +omics = { version = "0.4.0", features = ["coordinate"] } pyo3 = { version = "0.23.3", features = ["abi3-py310"] } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d4fd58b..f7804dd 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,5 +1,6 @@ //! Provide Rust-based chainfile wrapping classes. use chainfile as chain; +use omics::coordinate::Contig; use omics::coordinate::{interbase::Coordinate, interval::interbase::Interval, Strand}; use pyo3::create_exception; use pyo3::exceptions::{PyException, PyFileNotFoundError, PyValueError}; @@ -43,10 +44,10 @@ impl Converter { pub fn lift( &self, chrom: &str, - start: u64, - end: u64, + start: u32, + end: u32, strand: &str, - ) -> PyResult>> { + ) -> PyResult> { let parsed_strand = if strand == "+" { Strand::Positive } else if strand == "-" { @@ -57,9 +58,14 @@ impl Converter { strand ))); }; - // safe to unwrap coordinates because `pos` is always an int - let start_coordinate = Coordinate::new(chrom, parsed_strand.clone(), start); - let end_coordinate = Coordinate::new(chrom, parsed_strand.clone(), end); + let chrom_contig = Contig::try_new(chrom).map_err(|_| { + PyValueError::new_err(format!( + "Unable to create contig from chrom name (must be nonempty): {}", + chrom + )) + })?; + let start_coordinate = Coordinate::new(chrom_contig.clone(), parsed_strand.clone(), start); + let end_coordinate = Coordinate::new(chrom_contig, parsed_strand.clone(), end); let Ok(interval) = Interval::try_new(start_coordinate.clone(), end_coordinate.clone()) else { @@ -71,13 +77,16 @@ impl Converter { if let Some(liftover_result) = self.machine.liftover(interval.clone()) { Ok(liftover_result .iter() - .map(|r| { - vec![ - r.query().contig().to_string(), - r.query().start().position().to_string(), - r.query().end().position().to_string(), - r.query().strand().to_string(), - ] + .flat_map(|chain_liftover| { + chain_liftover.segments().iter().map(|segment| { + ( + segment.query().contig().to_string(), + segment.query().start().position().get(), + segment.query().end().position().get(), + segment.query().strand().to_string(), + chain_liftover.chain().score(), + ) + }) }) .collect()) } else { diff --git a/src/agct/converter.py b/src/agct/converter.py index 14c6709..1b4ce96 100644 --- a/src/agct/converter.py +++ b/src/agct/converter.py @@ -2,7 +2,7 @@ import logging from collections.abc import Callable -from enum import Enum +from enum import StrEnum from functools import cache from pathlib import Path from typing import NamedTuple @@ -17,7 +17,7 @@ _logger = logging.getLogger(__name__) -class Strand(str, Enum): +class Strand(StrEnum): """Constrain strand values.""" POSITIVE = "+" @@ -31,6 +31,7 @@ class LiftoverResult(NamedTuple): start: int end: int strand: Strand + score: int class Converter: @@ -142,7 +143,8 @@ def convert_coordinate( :param strand: query strand (``"+"`` by default). :return: list of coordinate matches (possibly empty) :raise ValueError: if ``start`` > ``end`` and strandedness is positive, or - ``start`` < ``end`` and strandedness is negative + ``start`` < ``end`` and strandedness is negative, or if position is too large + to represent as a 32 bit unsigned int """ if start < end and strand == Strand.NEGATIVE: msg = f"`start` must be less than `end` on the negative strand: {start=}, {end=}" @@ -163,22 +165,10 @@ def convert_coordinate( strand, ) results = [] - formatted_results: list[LiftoverResult] = [] - for result in results: - try: - lifted_over_start, lifted_over_end = int(result[1]), int(result[2]) - except ValueError: - _logger.exception("Got invalid position value in %s", result) - continue - try: - strand = Strand(result[3]) - except ValueError: - _logger.exception("Got invalid Strand value in %s", result) - continue - formatted_results.append( - LiftoverResult(result[0], lifted_over_start, lifted_over_end, strand) - ) - return formatted_results + except OverflowError as e: + msg = f"Coordinates exceed representable bounds of a 32 bit unsigned int: {start=}, {end=} -- this is unsupported" + raise ValueError(msg) from e + return [LiftoverResult(*r) for r in results] @cache diff --git a/tests/test_liftover.py b/tests/test_liftover.py index 260da03..2d81879 100644 --- a/tests/test_liftover.py +++ b/tests/test_liftover.py @@ -13,27 +13,33 @@ def test_hg19_to_hg38(): result = converter.convert_coordinate("chr7", 140439611, 140439611) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 140739811, 140739811, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 140739811, 140739811, Strand.POSITIVE, 14633688187 + ) result = converter.convert_coordinate("chr7", 140439746, 140439746) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 140739946, 140739946, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 140739946, 140739946, Strand.POSITIVE, 14633688187 + ) result = converter.convert_coordinate("chr7", 140439703, 140439703) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 140739903, 140739903, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 140739903, 140739903, Strand.POSITIVE, 14633688187 + ) result = converter.convert_coordinate("chr7", 140453136, 140453136) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 140753336, 140753336, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 140753336, 140753336, Strand.POSITIVE, 14633688187 + ) result = converter.convert_coordinate("chr1", 206072707, 206072708) assert len(result) == 1 - assert result[0] == LiftoverResult("chr1", 206268644, 206268643, Strand.NEGATIVE) - - # coordinate exceeds bounds - result = converter.convert_coordinate("chr7", 14040053136, 14040053136) - assert result == [] + assert result[0] == LiftoverResult( + "chr1", 206268644, 206268643, Strand.NEGATIVE, 24611930 + ) def test_hg38_to_hg19(): @@ -42,23 +48,33 @@ def test_hg38_to_hg19(): result = converter.convert_coordinate("chr7", 140739811, 140739811) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 140439611, 140439611, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 140439611, 140439611, Strand.POSITIVE, 14633759100 + ) result = converter.convert_coordinate("chr7", 140759820, 140759820) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 140459620, 140459620, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 140459620, 140459620, Strand.POSITIVE, 14633759100 + ) result = converter.convert_coordinate("chr7", 60878240, 60878240) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 61646115, 61646115, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 61646115, 61646115, Strand.POSITIVE, 2791030 + ) result = converter.convert_coordinate("chr7", 60878240, 60878240) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 61646115, 61646115, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 61646115, 61646115, Strand.POSITIVE, 2791030 + ) result = converter.convert_coordinate("chr7", 60878240, 60878245) assert len(result) == 1 - assert result[0] == LiftoverResult("chr7", 61646115, 61646120, Strand.POSITIVE) + assert result[0] == LiftoverResult( + "chr7", 61646115, 61646120, Strand.POSITIVE, 2791030 + ) def test_interval_input(): @@ -82,3 +98,14 @@ def test_interval_input(): converter.convert_coordinate( "chr1", 206268644, 206268645, strand=Strand.NEGATIVE ) + + +def test_raises_for_int_overflow(): + converter = Converter(Assembly.HG19, Assembly.HG38) + with pytest.raises( + ValueError, + match=re.escape( + "Coordinates exceed representable bounds of a 32 bit unsigned int: start=14040053136, end=14040053136 -- this is unsupported" + ), + ): + converter.convert_coordinate("chr7", 14040053136, 14040053136)