From b6bffb5d57a80d201e3be42e7f444d236ff0e1ae Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 13 Mar 2026 15:53:54 -0400 Subject: [PATCH] feat!: use materialized view for tx-exon alignment lookup --- .../mappers/exon_genomic_coords.py | 12 ++++---- src/cool_seq_tool/mappers/mane_transcript.py | 14 +++++----- src/cool_seq_tool/sources/uta_database.py | 28 ++++++++++--------- tests/sources/test_uta_database.py | 24 ++++++++-------- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index f206040..e00a8bf 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -639,7 +639,7 @@ async def _get_all_exon_coords( if genomic_ac: query = f""" SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand - FROM {self.uta_db.schema}.tx_exon_aln_v + FROM {self.uta_db.schema}.tx_exon_aln_mv WHERE tx_ac = '{tx_ac}' AND alt_aln_method = 'splign' AND alt_ac = '{genomic_ac}' @@ -648,7 +648,7 @@ async def _get_all_exon_coords( else: query = f""" SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand - FROM {self.uta_db.schema}.tx_exon_aln_v as t + FROM {self.uta_db.schema}.tx_exon_aln_mv as t INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s ON t.alt_ac = s.ac WHERE s.descr = '' @@ -890,7 +890,7 @@ async def _genomic_to_tx_segment( # Run if gene is for a noncoding transcript query = f""" SELECT DISTINCT tx_ac - FROM {self.uta_db.schema}.tx_exon_aln_v + FROM {self.uta_db.schema}.tx_exon_aln_mv WHERE hgnc = '{gene}' AND alt_ac = '{genomic_ac}' """ # noqa: S608 @@ -955,7 +955,7 @@ async def _genomic_to_tx_segment( ) else: is_exonic = True - exon_data = await self.uta_db.get_tx_exon_aln_v_data( + exon_data = await self.uta_db.get_tx_exon_aln_data( transcript, genomic_pos, genomic_pos, @@ -1035,7 +1035,7 @@ async def _validate_genomic_breakpoint( SELECT 
MIN(alt_start_i) AS min_start, MAX(alt_end_i) AS max_end - FROM {self.uta_db.schema}.tx_exon_aln_v + FROM {self.uta_db.schema}.tx_exon_aln_mv WHERE tx_ac = '{tx_ac}' AND alt_ac = '{genomic_ac}' ) @@ -1060,7 +1060,7 @@ async def _get_tx_ac_gene( """ query = f""" SELECT DISTINCT hgnc - FROM {self.uta_db.schema}.tx_exon_aln_v + FROM {self.uta_db.schema}.tx_exon_aln_mv WHERE tx_ac = '{tx_ac}' ORDER BY hgnc LIMIT 1; diff --git a/src/cool_seq_tool/mappers/mane_transcript.py b/src/cool_seq_tool/mappers/mane_transcript.py index 18c05da..dc1a10c 100644 --- a/src/cool_seq_tool/mappers/mane_transcript.py +++ b/src/cool_seq_tool/mappers/mane_transcript.py @@ -234,7 +234,7 @@ async def _liftover_to_38(self, genomic_tx_data: GenomicTxMetadata) -> None: query = f""" SELECT DISTINCT alt_ac - FROM {self.uta_db.schema}.tx_exon_aln_v + FROM {self.uta_db.schema}.tx_exon_aln_mv WHERE tx_ac = '{genomic_tx_data.tx_ac}'; """ # noqa: S608 nc_acs = await self.uta_db.execute_query(query) @@ -462,15 +462,15 @@ async def _g_to_c( :param ensembl_c_ac: Ensembl transcript accession :param alt_ac: Genomic accession :param found_result: ``True`` if found result, so do not need to query - tx_exon_aln_v table. This is because the user did not need to liftover. - ``False`` if need to get result from tx_exon_aln_v table. + tx_exon_aln_mv table. This is because the user did not need to liftover. + ``False`` if need to get result from tx_exon_aln_mv table. 
:return: Transcript data """ if found_result: tx_g_pos = g.alt_pos_range tx_pos_range = g.tx_pos_range else: - result = await self.uta_db.get_tx_exon_aln_v_data( + result = await self.uta_db.get_tx_exon_aln_data( refseq_c_ac, g.alt_pos_change_range[0], g.alt_pos_change_range[1], @@ -820,7 +820,7 @@ def _get_protein_rep( if alt_ac is None: alt_ac = row["alt_ac"] - found_tx_exon_aln_v_result = False + found_tx_exon_aln_result = False if is_p_or_c_start_anno: # Go from c -> g annotation (liftover as well) g = await self._c_to_g(tx_ac, (c_start_pos, c_end_pos)) @@ -832,7 +832,7 @@ def _get_protein_rep( annotation_layer=AnnotationLayer.GENOMIC, alt_ac=alt_ac, ) - found_tx_exon_aln_v_result = True + found_tx_exon_aln_result = True if not g: continue @@ -842,7 +842,7 @@ def _get_protein_rep( g=g, refseq_c_ac=tx_ac, status=TranscriptPriority.LONGEST_COMPATIBLE_REMAINING, - found_result=found_tx_exon_aln_v_result, + found_result=found_tx_exon_aln_result, ) if not lcr_c_data: diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index 319324f..5f7ede7 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -44,7 +44,7 @@ class DbConnectionArgs(BaseModelForbidExtra): class GenomicAlnData(BaseModelForbidExtra): - """Represent genomic alignment data from UTA tx_exon_aln_v view""" + """Represent genomic alignment data from UTA tx_exon_aln_mv view""" hgnc: StrictStr = Field(..., description="HGNC gene symbol.") ord: StrictInt = Field(..., description="Exon number. 
0-based.") @@ -61,7 +61,7 @@ class GenomicAlnData(BaseModelForbidExtra): class TxExonAlnData(GenomicAlnData): - """Represent data from UTA tx_exon_aln_v view""" + """Represent data from UTA tx_exon_aln_mv view""" tx_ac: StrictStr = Field(..., description="Transcript accession.") tx_start_i: StrictInt = Field( @@ -285,7 +285,7 @@ async def get_alt_ac_start_or_end( query = f""" SELECT T.hgnc, T.alt_ac, T.alt_start_i, T.alt_end_i, T.alt_strand, T.ord FROM {self.schema}._cds_exons_fp_v as C - JOIN {self.schema}.tx_exon_aln_v as T ON T.tx_ac = C.tx_ac + JOIN {self.schema}.tx_exon_aln_mv as T ON T.tx_ac = C.tx_ac WHERE T.tx_ac = '{tx_ac}' {gene_query} AND {tx_exon_start} BETWEEN T.tx_start_i AND T.tx_end_i @@ -394,7 +394,7 @@ async def gene_exists(self, gene: str) -> bool: return result[0][0] async def transcript_exists(self, transcript: str) -> bool: - """Return whether or not a transcript exists in the UTA ``tx_exon_aln_v`` table + """Return whether or not a transcript exists in the UTA ``tx_exon_aln_mv`` table :param transcript: A transcript accession :return: ``True`` if transcript exists in UTA, ``False`` if not @@ -402,7 +402,7 @@ async def transcript_exists(self, transcript: str) -> bool: query = f""" SELECT EXISTS( SELECT tx_ac - FROM {self.schema}.tx_exon_aln_v + FROM {self.schema}.tx_exon_aln_mv WHERE tx_ac = '{transcript}' ); """ # noqa: S608 @@ -439,7 +439,7 @@ async def get_ac_descr(self, ac: str) -> str | None: result = None return result - async def get_tx_exon_aln_v_data( + async def get_tx_exon_aln_data( self, tx_ac: str, start_pos: int, @@ -448,7 +448,9 @@ async def get_tx_exon_aln_v_data( use_tx_pos: bool = True, like_tx_ac: bool = False, ) -> list[TxExonAlnData]: - """Return queried data from tx_exon_aln_v table. + """Get alignments between exons and reference sequences. + + This is a direct query against the UTA ``tx_exon_aln_mv`` view. :param tx_ac: accession on c. 
coordinate :param start_pos: Start position change @@ -491,7 +493,7 @@ async def get_tx_exon_aln_v_data( query = f""" SELECT hgnc, tx_ac, tx_start_i, tx_end_i, alt_ac, alt_start_i, alt_end_i, alt_strand, alt_aln_method, ord, tx_exon_id, alt_exon_id - FROM {self.schema}.tx_exon_aln_v + FROM {self.schema}.tx_exon_aln_mv {tx_q} {alt_ac_q} {aln_method} @@ -543,7 +545,7 @@ async def get_mane_c_genomic_data( self, ac: str, alt_ac: str | None, start_pos: int, end_pos: int ) -> GenomicTxMetadata | None: """Get MANE transcript and genomic data. Used when going from g. to MANE c. - representation. This function parses queried data from the tx_exon_aln_v + representation. This function parses queried data from the tx_exon_aln_mv table, and sorts the queried data by the most recent genomic build >>> import asyncio @@ -569,7 +571,7 @@ async def get_mane_c_genomic_data( :return: Metadata for MANE genomic and transcript accessions results if successful """ - results = await self.get_tx_exon_aln_v_data( + results = await self.get_tx_exon_aln_data( tx_ac=ac, start_pos=start_pos, end_pos=end_pos, @@ -636,7 +638,7 @@ async def get_genomic_tx_data( If ``alt_ac`` is provided, it will return the associated assembly. 
:return: Metadata for genomic and transcript accessions """ - results = await self.get_tx_exon_aln_v_data( + results = await self.get_tx_exon_aln_data( tx_ac, pos[0], pos[1], @@ -816,7 +818,7 @@ async def get_transcripts( SELECT AA.pro_ac, AA.tx_ac, ALIGN.alt_ac, T.cds_start_i FROM {self.schema}.associated_accessions as AA JOIN {self.schema}.transcript as T ON T.ac = AA.tx_ac - JOIN {self.schema}.tx_exon_aln_v as ALIGN ON T.ac = ALIGN.tx_ac + JOIN {self.schema}.tx_exon_aln_mv as ALIGN ON T.ac = ALIGN.tx_ac WHERE ALIGN.alt_aln_method = 'splign' {gene_cond} {alt_ac_cond} @@ -903,7 +905,7 @@ async def get_transcripts_from_genomic_pos( """ query = f""" SELECT distinct tx_ac - FROM {self.schema}.tx_exon_aln_v + FROM {self.schema}.tx_exon_aln_mv WHERE alt_ac = '{alt_ac}' AND {g_pos} BETWEEN alt_start_i AND alt_end_i AND tx_ac LIKE 'NM_%'; diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index c05cd63..2686ca1 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -14,8 +14,8 @@ @pytest.fixture(scope="module") -def tx_exon_aln_v_data(): - """Create test fixture for tx_aln_v_data test.""" +def tx_exon_aln_data(): + """Create test fixture for tx_exon_aln_data test.""" return TxExonAlnData( hgnc="BRAF", ord=14, @@ -121,19 +121,19 @@ async def test_get_ac_descr(test_db): @pytest.mark.asyncio -async def test_get_tx_exon_aln_v_data(test_db, tx_exon_aln_v_data): - """Test that get_tx_exon_aln_v_data""" - resp = await test_db.get_tx_exon_aln_v_data( +async def test_get_tx_exon_aln_data(test_db, tx_exon_aln_data): + """Test that get_tx_exon_aln_data works correctly.""" + resp = await test_db.get_tx_exon_aln_data( "NM_004333.4", 140453136, 140453136, alt_ac="NC_000007.13", use_tx_pos=False ) - assert resp == [tx_exon_aln_v_data] + assert resp == [tx_exon_aln_data] - resp = await test_db.get_tx_exon_aln_v_data( + resp = await test_db.get_tx_exon_aln_data( "NM_004333.4", 140453136, 140453136, alt_ac=None, use_tx_pos=False ) -
assert resp == [tx_exon_aln_v_data] + assert resp == [tx_exon_aln_data] - resp = await test_db.get_tx_exon_aln_v_data( + resp = await test_db.get_tx_exon_aln_data( "NM_004333.4", 1860, 1860, alt_ac=None, use_tx_pos=True ) assert resp == [ @@ -169,9 +169,9 @@ async def test_get_tx_exon_aln_v_data(test_db, tx_exon_aln_v_data): @pytest.mark.asyncio -async def test_data_from_result(test_db, tx_exon_aln_v_data, data_from_result): +async def test_data_from_result(test_db, tx_exon_aln_data, data_from_result): """Test that data_from_result works correctly.""" - resp = test_db.data_from_result(tx_exon_aln_v_data) + resp = test_db.data_from_result(tx_exon_aln_data) assert resp == data_from_result @@ -198,7 +198,7 @@ async def test_mane_c_genomic_data(test_db): } assert resp == GenomicTxMetadata(**expected_params) - # Test example where sorting of tx_exon_aln_v is needed + # Test example where sorting of tx_exon_aln_mv is needed resp = await test_db.get_mane_c_genomic_data( "NM_000077.5", "NC_000009.12", 21971186, 21971187 )