Skip to content

Commit 6377038

Browse files
authored
Merge pull request #79 from VariantEffect/align-target-to-protein
Align target sequence to selected transcript on protein level
2 parents b53eef1 + e5cff44 commit 6377038

5 files changed

Lines changed: 134 additions & 21 deletions

File tree

.github/workflows/checks.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
2424
- name: Run tests
2525
# only run tests known to work in CI
26-
run: python3 -m pytest tests/test_mavedb_data.py tests/test_vrs_map.py
26+
run: python3 -m pytest tests/test_mavedb_data.py
2727
lint:
2828
runs-on: ubuntu-latest
2929
steps:

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ dependencies = [
4848
"fastapi",
4949
"starlette",
5050
"uvicorn",
51-
"polars<1.38.0"
51+
"polars<1.38.0",
52+
"pandas==2.2.1"
5253
]
5354
dynamic = ["version"]
5455

src/dcd_mapping/align.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import httpx
1111
from Bio.SearchIO import HSP
1212
from Bio.SearchIO import parse as parse_blat
13+
from Bio.SearchIO import read as read_blat
1314
from Bio.SearchIO._model import Hit, QueryResult
1415
from cool_seq_tool.schemas import Strand
1516

@@ -92,7 +93,11 @@ def get_ref_genome_file(
9293

9394

9495
def _run_blat(
95-
target_args: str, query_file: Path, out_file: str, silent: bool
96+
target_args: str,
97+
query_file: Path,
98+
reference_file: Path,
99+
out_file: str,
100+
silent: bool,
96101
) -> subprocess.CompletedProcess:
97102
"""Execute BLAT binary with relevant params.
98103
@@ -109,9 +114,8 @@ def _run_blat(
109114
:param silent: if True, suppress all console output
110115
:return: process result
111116
"""
112-
reference_genome_file = get_ref_genome_file(silent=silent)
113117
bin_name = os.environ["BLAT_BIN_PATH"] if "BLAT_BIN_PATH" in os.environ else "blat" # noqa: SIM401
114-
command = f"{bin_name} {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}"
118+
command = f"{bin_name} {reference_file} {target_args} -minScore=20 {query_file} {out_file}"
115119
_logger.debug("Running BLAT command: %s", command)
116120
result = subprocess.run( # noqa: UP022
117121
command,
@@ -186,15 +190,20 @@ def _get_blat_output(
186190
# TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
187191
msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
188192
raise NotImplementedError(msg)
189-
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
193+
reference_genome_file = get_ref_genome_file(silent=silent)
194+
process_result = _run_blat(
195+
target_args, query_file, reference_genome_file, "/dev/stdout", silent
196+
)
190197
out_file = _write_blat_output_tempfile(process_result)
191198

192199
try:
193200
output = parse_blat(out_file, "blat-psl")
194201

195202
except ValueError:
196203
target_args = "-q=dnax -t=dnax"
197-
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
204+
process_result = _run_blat(
205+
target_args, query_file, reference_genome_file, "/dev/stdout", silent
206+
)
198207
out_file = _write_blat_output_tempfile(process_result)
199208
try:
200209
output = parse_blat(out_file, "blat-psl")
@@ -254,7 +263,7 @@ def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
254263
return best_score_hit
255264

256265

257-
def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP:
266+
def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None = None) -> HSP:
258267
"""Retrieve preferred HSP from BLAT Hit object.
259268
260269
If gene location data is available, prefer the HSP with the least distance
@@ -491,3 +500,50 @@ def build_alignment_result(
491500
alignment_result = fetch_alignment(metadata, silent)
492501

493502
return alignment_result
503+
504+
505+
def align_target_to_protein(
506+
target_sequence: str, reference_sequence: str, silent: bool
507+
) -> AlignmentResult:
508+
with tempfile.NamedTemporaryFile() as query_tmp_file, tempfile.NamedTemporaryFile() as reference_tmp_file:
509+
_write_query_file(Path(query_tmp_file.name), [">query", target_sequence])
510+
_write_query_file(
511+
Path(reference_tmp_file.name), [">reference", reference_sequence]
512+
)
513+
target_args = "-q=prot -t=prot"
514+
515+
process_result = _run_blat(
516+
target_args,
517+
query_tmp_file.name,
518+
reference_tmp_file.name,
519+
"/dev/stdout",
520+
silent,
521+
)
522+
out_file = _write_blat_output_tempfile(process_result)
523+
524+
try:
525+
blat_output = read_blat(out_file, "blat-psl")
526+
527+
except ValueError as e:
528+
msg = "Unable to run successful BLAT on target sequence against selected reference protein sequence."
529+
raise AlignmentError(msg) from e
530+
531+
best_hit = _get_best_hit(blat_output, None)
532+
best_hsp = _get_best_hsp(best_hit)
533+
534+
query_subranges = []
535+
hit_subranges = []
536+
for fragment in best_hsp:
537+
query_subranges.append(
538+
SequenceRange(start=fragment.query_start, end=fragment.query_end)
539+
)
540+
hit_subranges.append(
541+
SequenceRange(start=fragment.hit_start, end=fragment.hit_end)
542+
)
543+
544+
return AlignmentResult(
545+
query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end),
546+
query_subranges=query_subranges,
547+
hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end),
548+
hit_subranges=hit_subranges,
549+
)

src/dcd_mapping/schemas.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,8 @@ class MappedReferenceSequence(ReferenceSequence):
106106
class AlignmentResult(BaseModel):
107107
"""Define BLAT alignment output."""
108108

109-
chrom: str
110-
strand: Strand
109+
chrom: str | None = None
110+
strand: Strand | None = None
111111
coverage: float | None = None
112112
ident_pct: float | None = None
113113
query_range: SequenceRange

src/dcd_mapping/vrs_map.py

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from mavehgvs.util import parse_variant_strings
2222
from mavehgvs.variant import Variant
2323

24+
from dcd_mapping.align import align_target_to_protein
2425
from dcd_mapping.exceptions import (
2526
MissingSequenceIdError,
2627
UnsupportedReferenceSequenceNameSpaceError,
@@ -181,12 +182,12 @@ def _create_post_mapped_hgvs_strings(
181182
tx: TxSelectResult | None = None,
182183
alignment: AlignmentResult | None = None,
183184
accession_id: str | None = None,
185+
protein_alignment: AlignmentResult | None = None,
184186
) -> list[str]:
185187
"""Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.
186188
187-
For protein annotations, these strings must be adjusted to match the offset defined by the start of the
188-
transcript sequence. For genomic annotations, these strings must be adjusted to match the coordinates of
189-
the reference alignment.
189+
For protein annotations, these strings must be adjusted to match the alignment of the target to the selected reference protein sequence.
190+
For genomic annotations, these strings must be adjusted to match the coordinates of the reference alignment.
190191
191192
:param raw_description: A string containing valid HGVS sub-strings
192193
:param layer: An enum denoting the targeted annotation layer of these HGVS strings
@@ -220,7 +221,7 @@ def _create_post_mapped_hgvs_strings(
220221
if layer is AnnotationLayer.PROTEIN:
221222
assert tx # noqa: S101. mypy help
222223

223-
variant = _adjust_protein_variant_to_ref(variant, tx)
224+
variant = _adjust_protein_variant_to_ref(variant, protein_alignment)
224225
hgvs_strings.append(tx.np + ":" + str(variant))
225226
elif layer is AnnotationLayer.GENOMIC:
226227
if accession_id:
@@ -250,14 +251,50 @@ def _create_post_mapped_hgvs_strings(
250251

251252
def _adjust_protein_variant_to_ref(
252253
variant: Variant,
253-
tx: TxSelectResult,
254+
alignment: AlignmentResult,
254255
) -> Variant:
256+
# adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based
257+
starts = []
255258
if isinstance(variant.positions, Iterable):
259+
is_multi_position = True
256260
for position in variant.positions:
257-
position.position = position.position + tx.start
258-
return variant
261+
starts.append(position.position - 1)
262+
else:
263+
is_multi_position = False
264+
starts.append(variant.positions.position - 1)
265+
266+
# get hit
267+
query_subrange_containing_hit = None
268+
target_subrange_containing_hit = None
269+
for query_subrange, target_subrange in zip(
270+
alignment.query_subranges, alignment.hit_subranges, strict=True
271+
):
272+
if all(
273+
start >= query_subrange.start and start < query_subrange.end
274+
for start in starts
275+
):
276+
query_subrange_containing_hit = query_subrange
277+
target_subrange_containing_hit = target_subrange
278+
break
279+
280+
if query_subrange_containing_hit is None or target_subrange_containing_hit is None:
281+
msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence compared to the selected reference sequence"
282+
raise ValueError(msg)
283+
284+
for idx, start in enumerate(starts):
285+
# get variant start relative to the reference (the "hit")
286+
# distance from beginning of query to variant start position:
287+
query_to_start = start - query_subrange_containing_hit.start
288+
289+
# distance from beginning of ref to the variant start position:
290+
ref_to_start = target_subrange_containing_hit.start + query_to_start
291+
292+
# add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based
293+
if is_multi_position:
294+
variant.positions[idx].position = ref_to_start + 1
295+
else:
296+
variant.positions.position = ref_to_start + 1
259297

260-
variant.positions.position = variant.positions.position + tx.start
261298
return variant
262299

263300

@@ -368,14 +405,16 @@ def _map_protein_coding_pro(
368405
row: ScoreRow,
369406
sequence_id: str,
370407
transcript: TxSelectResult,
408+
protein_align_result: AlignmentResult,
371409
) -> MappedScore:
372410
"""Construct VRS object mapping for ``hgvs_pro`` variant column entry
373411
374412
These arguments are a little lazy and could be pruned down later
375413
376414
:param row: A row of output from a MaveDB score set
377415
:param sequence_id: The GA4GH accession for the provided sequence
378-
:param transcript: The transcript selection information for a score set
416+
:param transcript: The transcript selection information for a target
417+
:param protein_align_result: The protein-protein alignment result for a target against its selected protein reference
379418
:return: VRS mapping object if mapping succeeds
380419
"""
381420
if (
@@ -444,6 +483,7 @@ def _map_protein_coding_pro(
444483
row.hgvs_pro,
445484
AnnotationLayer.PROTEIN,
446485
tx=transcript,
486+
protein_alignment=protein_align_result,
447487
)
448488
post_mapped_protein = _construct_vrs_allele(
449489
post_mapped_hgvs_strings,
@@ -729,6 +769,7 @@ def _map_protein_coding(
729769
records: list[ScoreRow],
730770
transcript: TxSelectResult | TxSelectError,
731771
align_result: AlignmentResult,
772+
silent: bool,
732773
) -> list[MappedScore]:
733774
"""Perform mapping on protein coding experiment results
734775
@@ -748,6 +789,13 @@ def _map_protein_coding(
748789
sequence = metadata.target_sequence
749790
psequence_id = gsequence_id = store_sequence(sequence)
750791

792+
# align protein sequence to selected reference protein sequence to get offsets and gaps
793+
protein_align_result = None
794+
if isinstance(transcript, TxSelectResult):
795+
protein_align_result = align_target_to_protein(
796+
sequence, transcript.sequence, silent
797+
)
798+
751799
variations: list[MappedScore] = []
752800
for row in records:
753801
hgvs_nt_mappings = None
@@ -765,9 +813,16 @@ def _map_protein_coding(
765813
error_message=str(transcript).strip("'"),
766814
)
767815
else:
768-
if _hgvs_pro_is_valid(row.hgvs_pro):
816+
if _hgvs_pro_is_valid(row.hgvs_pro) and protein_align_result is not None:
769817
hgvs_pro_mappings = _map_protein_coding_pro(
770-
row, psequence_id, transcript
818+
row, psequence_id, transcript, protein_align_result
819+
)
820+
# This should not occur because protein align result is only None if transcript selection failed, which should be caught by the TxSelectError check above.
821+
elif protein_align_result is None:
822+
hgvs_pro_mappings = MappedScore(
823+
accession_id=row.accession,
824+
score=row.score,
825+
error_message="Could not perform mapping for protein variant because transcript sequence is missing or could not be aligned to reference sequence",
771826
)
772827
elif (
773828
not hgvs_nt_mappings
@@ -954,6 +1009,7 @@ def vrs_map(
9541009
records,
9551010
transcript,
9561011
align_result,
1012+
silent,
9571013
)
9581014
return _map_regulatory_noncoding(
9591015
metadata,

0 commit comments

Comments
 (0)