Merge pull request #79 from VariantEffect/align-target-to-protein

bencap · web-flow · commit 637703818529 · 2026-03-22T15:36:38.000-07:00
Align target sequence to selected transcript on protein level
diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml
@@ -23,7 +23,7 @@ jobs:
 
       - name: Run tests
         # only run tests known to work in CI
-        run: python3 -m pytest tests/test_mavedb_data.py tests/test_vrs_map.py
+        run: python3 -m pytest tests/test_mavedb_data.py
   lint:
     runs-on: ubuntu-latest
     steps:
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,7 +48,8 @@ dependencies = [
     "fastapi",
     "starlette",
     "uvicorn",
-    "polars<1.38.0"
+    "polars<1.38.0",
+    "pandas==2.2.1"
 ]
 dynamic = ["version"]
 
diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py
@@ -10,6 +10,7 @@
 import httpx
 from Bio.SearchIO import HSP
 from Bio.SearchIO import parse as parse_blat
+from Bio.SearchIO import read as read_blat
 from Bio.SearchIO._model import Hit, QueryResult
 from cool_seq_tool.schemas import Strand
 
@@ -92,7 +93,11 @@ def get_ref_genome_file(
 
 
 def _run_blat(
-    target_args: str, query_file: Path, out_file: str, silent: bool
+    target_args: str,
+    query_file: Path,
+    reference_file: Path,
+    out_file: str,
+    silent: bool,
 ) -> subprocess.CompletedProcess:
     """Execute BLAT binary with relevant params.
 
@@ -109,9 +114,8 @@ def _run_blat(
     :param silent: if True, suppress all console output
     :return: process result
     """
-    reference_genome_file = get_ref_genome_file(silent=silent)
     bin_name = os.environ["BLAT_BIN_PATH"] if "BLAT_BIN_PATH" in os.environ else "blat"  # noqa: SIM401
-    command = f"{bin_name} {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}"
+    command = f"{bin_name} {reference_file} {target_args} -minScore=20 {query_file} {out_file}"
     _logger.debug("Running BLAT command: %s", command)
     result = subprocess.run(  # noqa: UP022
         command,
@@ -186,15 +190,20 @@ def _get_blat_output(
             # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
             msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
             raise NotImplementedError(msg)
-        process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
+        reference_genome_file = get_ref_genome_file(silent=silent)
+        process_result = _run_blat(
+            target_args, query_file, reference_genome_file, "/dev/stdout", silent
+        )
         out_file = _write_blat_output_tempfile(process_result)
 
         try:
             output = parse_blat(out_file, "blat-psl")
 
         except ValueError:
             target_args = "-q=dnax -t=dnax"
-            process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
+            process_result = _run_blat(
+                target_args, query_file, reference_genome_file, "/dev/stdout", silent
+            )
             out_file = _write_blat_output_tempfile(process_result)
             try:
                 output = parse_blat(out_file, "blat-psl")
@@ -254,7 +263,7 @@ def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
     return best_score_hit
 
 
-def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP:
+def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None = None) -> HSP:
     """Retrieve preferred HSP from BLAT Hit object.
 
     If gene location data is available, prefer the HSP with the least distance
@@ -491,3 +500,50 @@ def build_alignment_result(
         alignment_result = fetch_alignment(metadata, silent)
 
     return alignment_result
+
+
+def align_target_to_protein(
+    target_sequence: str, reference_sequence: str, silent: bool
+) -> AlignmentResult:
+    with tempfile.NamedTemporaryFile() as query_tmp_file, tempfile.NamedTemporaryFile() as reference_tmp_file:
+        _write_query_file(Path(query_tmp_file.name), [">query", target_sequence])
+        _write_query_file(
+            Path(reference_tmp_file.name), [">reference", reference_sequence]
+        )
+        target_args = "-q=prot -t=prot"
+
+        process_result = _run_blat(
+            target_args,
+            query_tmp_file.name,
+            reference_tmp_file.name,
+            "/dev/stdout",
+            silent,
+        )
+        out_file = _write_blat_output_tempfile(process_result)
+
+        try:
+            blat_output = read_blat(out_file, "blat-psl")
+
+        except ValueError as e:
+            msg = "Unable to run successful BLAT on target sequence against selected reference protein sequence."
+            raise AlignmentError(msg) from e
+
+    best_hit = _get_best_hit(blat_output, None)
+    best_hsp = _get_best_hsp(best_hit)
+
+    query_subranges = []
+    hit_subranges = []
+    for fragment in best_hsp:
+        query_subranges.append(
+            SequenceRange(start=fragment.query_start, end=fragment.query_end)
+        )
+        hit_subranges.append(
+            SequenceRange(start=fragment.hit_start, end=fragment.hit_end)
+        )
+
+    return AlignmentResult(
+        query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end),
+        query_subranges=query_subranges,
+        hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end),
+        hit_subranges=hit_subranges,
+    )
diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py
@@ -106,8 +106,8 @@ class MappedReferenceSequence(ReferenceSequence):
 class AlignmentResult(BaseModel):
     """Define BLAT alignment output."""
 
-    chrom: str
-    strand: Strand
+    chrom: str | None = None
+    strand: Strand | None = None
     coverage: float | None = None
     ident_pct: float | None = None
     query_range: SequenceRange
diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py
@@ -21,6 +21,7 @@
 from mavehgvs.util import parse_variant_strings
 from mavehgvs.variant import Variant
 
+from dcd_mapping.align import align_target_to_protein
 from dcd_mapping.exceptions import (
     MissingSequenceIdError,
     UnsupportedReferenceSequenceNameSpaceError,
@@ -181,12 +182,12 @@ def _create_post_mapped_hgvs_strings(
     tx: TxSelectResult | None = None,
     alignment: AlignmentResult | None = None,
     accession_id: str | None = None,
+    protein_alignment: AlignmentResult | None = None,
 ) -> list[str]:
     """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.
 
-    For protein annotations, these strings must be adjusted to match the offset defined by the start of the
-    transcript sequence. For genomic annotations, these strings must be adjusted to match the coordinates of
-    the reference alignment.
+    For protein annotations, these strings must be adjusted to match the alignment of the target to the selected reference protein sequence.
+    For genomic annotations, these strings must be adjusted to match the coordinates of the reference alignment.
 
     :param raw_description: A string containing valid HGVS sub-strings
     :param layer: An enum denoting the targeted annotation layer of these HGVS strings
@@ -220,7 +221,7 @@ def _create_post_mapped_hgvs_strings(
         if layer is AnnotationLayer.PROTEIN:
             assert tx  # noqa: S101. mypy help
 
-            variant = _adjust_protein_variant_to_ref(variant, tx)
+            variant = _adjust_protein_variant_to_ref(variant, protein_alignment)
             hgvs_strings.append(tx.np + ":" + str(variant))
         elif layer is AnnotationLayer.GENOMIC:
             if accession_id:
@@ -250,14 +251,50 @@ def _create_post_mapped_hgvs_strings(
 
 def _adjust_protein_variant_to_ref(
     variant: Variant,
-    tx: TxSelectResult,
+    alignment: AlignmentResult,
 ) -> Variant:
+    # adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based
+    starts = []
     if isinstance(variant.positions, Iterable):
+        is_multi_position = True
         for position in variant.positions:
-            position.position = position.position + tx.start
-        return variant
+            starts.append(position.position - 1)
+    else:
+        is_multi_position = False
+        starts.append(variant.positions.position - 1)
+
+    # get hit
+    query_subrange_containing_hit = None
+    target_subrange_containing_hit = None
+    for query_subrange, target_subrange in zip(
+        alignment.query_subranges, alignment.hit_subranges, strict=True
+    ):
+        if all(
+            start >= query_subrange.start and start < query_subrange.end
+            for start in starts
+        ):
+            query_subrange_containing_hit = query_subrange
+            target_subrange_containing_hit = target_subrange
+            break
+
+    if query_subrange_containing_hit is None or target_subrange_containing_hit is None:
+        msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence compared to the selected reference sequence"
+        raise ValueError(msg)
+
+    for idx, start in enumerate(starts):
+        # get variant start relative to the reference (the "hit")
+        # distance from beginning of query to variant start position:
+        query_to_start = start - query_subrange_containing_hit.start
+
+        # distance from beginning of ref to the variant start position:
+        ref_to_start = target_subrange_containing_hit.start + query_to_start
+
+        # add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based
+        if is_multi_position:
+            variant.positions[idx].position = ref_to_start + 1
+        else:
+            variant.positions.position = ref_to_start + 1
 
-    variant.positions.position = variant.positions.position + tx.start
     return variant
 
 
@@ -368,14 +405,16 @@ def _map_protein_coding_pro(
     row: ScoreRow,
     sequence_id: str,
     transcript: TxSelectResult,
+    protein_align_result: AlignmentResult,
 ) -> MappedScore:
     """Construct VRS object mapping for ``hgvs_pro`` variant column entry
 
     These arguments are a little lazy and could be pruned down later
 
     :param row: A row of output from a MaveDB score set
     :param sequence_id: The GA4GH accession for the provided sequence
-    :param transcript: The transcript selection information for a score set
+    :param transcript: The transcript selection information for a target
+    :param protein_align_result: The protein-protein alignment result for a target against its selected protein reference
     :return: VRS mapping object if mapping succeeds
     """
     if (
@@ -444,6 +483,7 @@ def _map_protein_coding_pro(
             row.hgvs_pro,
             AnnotationLayer.PROTEIN,
             tx=transcript,
+            protein_alignment=protein_align_result,
         )
         post_mapped_protein = _construct_vrs_allele(
             post_mapped_hgvs_strings,
@@ -729,6 +769,7 @@ def _map_protein_coding(
     records: list[ScoreRow],
     transcript: TxSelectResult | TxSelectError,
     align_result: AlignmentResult,
+    silent: bool,
 ) -> list[MappedScore]:
     """Perform mapping on protein coding experiment results
 
@@ -748,6 +789,13 @@ def _map_protein_coding(
         sequence = metadata.target_sequence
         psequence_id = gsequence_id = store_sequence(sequence)
 
+    # align protein sequence to selected reference protein sequence to get offsets and gaps
+    protein_align_result = None
+    if isinstance(transcript, TxSelectResult):
+        protein_align_result = align_target_to_protein(
+            sequence, transcript.sequence, silent
+        )
+
     variations: list[MappedScore] = []
     for row in records:
         hgvs_nt_mappings = None
@@ -765,9 +813,16 @@ def _map_protein_coding(
                 error_message=str(transcript).strip("'"),
             )
         else:
-            if _hgvs_pro_is_valid(row.hgvs_pro):
+            if _hgvs_pro_is_valid(row.hgvs_pro) and protein_align_result is not None:
                 hgvs_pro_mappings = _map_protein_coding_pro(
-                    row, psequence_id, transcript
+                    row, psequence_id, transcript, protein_align_result
+                )
+            # This should not occur because protein align result is only None if transcript selection failed, which should be caught by the TxSelectError check above.
+            elif protein_align_result is None:
+                hgvs_pro_mappings = MappedScore(
+                    accession_id=row.accession,
+                    score=row.score,
+                    error_message="Could not perform mapping for protein variant because transcript sequence is missing or could not be aligned to reference sequence",
                 )
             elif (
                 not hgvs_nt_mappings
@@ -954,6 +1009,7 @@ def vrs_map(
             records,
             transcript,
             align_result,
+            silent,
         )
     return _map_regulatory_noncoding(
         metadata,

Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,8 @@ dependencies = [`
`48`	`48`	`"fastapi",`
`49`	`49`	`"starlette",`
`50`	`50`	`"uvicorn",`
`51`		`- "polars<1.38.0"`
	`51`	`+ "polars<1.38.0",`
	`52`	`+ "pandas==2.2.1"`
`52`	`53`	`]`
`53`	`54`	`dynamic = ["version"]`
`54`	`55`