2121from mavehgvs .util import parse_variant_strings
2222from mavehgvs .variant import Variant
2323
24+ from dcd_mapping .align import align_target_to_protein
2425from dcd_mapping .exceptions import (
2526 MissingSequenceIdError ,
2627 UnsupportedReferenceSequenceNameSpaceError ,
@@ -181,12 +182,12 @@ def _create_post_mapped_hgvs_strings(
181182 tx : TxSelectResult | None = None ,
182183 alignment : AlignmentResult | None = None ,
183184 accession_id : str | None = None ,
185+ protein_alignment : AlignmentResult | None = None ,
184186) -> list [str ]:
185187 """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.
186188
187- For protein annotations, these strings must be adjusted to match the offset defined by the start of the
188- transcript sequence. For genomic annotations, these strings must be adjusted to match the coordinates of
189- the reference alignment.
189+ For protein annotations, these strings must be adjusted to match the alignment of the target to the selected reference protein sequence.
190+ For genomic annotations, these strings must be adjusted to match the coordinates of the reference alignment.
190191
191192 :param raw_description: A string containing valid HGVS sub-strings
192193 :param layer: An enum denoting the targeted annotation layer of these HGVS strings
@@ -220,7 +221,7 @@ def _create_post_mapped_hgvs_strings(
220221 if layer is AnnotationLayer .PROTEIN :
221222 assert tx # noqa: S101. mypy help
222223
223- variant = _adjust_protein_variant_to_ref (variant , tx )
224+ variant = _adjust_protein_variant_to_ref (variant , protein_alignment )
224225 hgvs_strings .append (tx .np + ":" + str (variant ))
225226 elif layer is AnnotationLayer .GENOMIC :
226227 if accession_id :
@@ -250,14 +251,50 @@ def _create_post_mapped_hgvs_strings(
250251
251252def _adjust_protein_variant_to_ref (
252253 variant : Variant ,
253- tx : TxSelectResult ,
254+ alignment : AlignmentResult ,
254255) -> Variant :
256+ # adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based
257+ starts = []
255258 if isinstance (variant .positions , Iterable ):
259+ is_multi_position = True
256260 for position in variant .positions :
257- position .position = position .position + tx .start
258- return variant
261+ starts .append (position .position - 1 )
262+ else :
263+ is_multi_position = False
264+ starts .append (variant .positions .position - 1 )
265+
266+ # get hit
267+ query_subrange_containing_hit = None
268+ target_subrange_containing_hit = None
269+ for query_subrange , target_subrange in zip (
270+ alignment .query_subranges , alignment .hit_subranges , strict = True
271+ ):
272+ if all (
273+ start >= query_subrange .start and start < query_subrange .end
274+ for start in starts
275+ ):
276+ query_subrange_containing_hit = query_subrange
277+ target_subrange_containing_hit = target_subrange
278+ break
279+
280+ if query_subrange_containing_hit is None or target_subrange_containing_hit is None :
281+ msg = f"Cannot process variant { variant } because it is not fully contained within the aligned portion of the query sequence compared to the selected reference sequence"
282+ raise ValueError (msg )
283+
284+ for idx , start in enumerate (starts ):
285+ # get variant start relative to the reference (the "hit")
286+ # distance from beginning of query to variant start position:
287+ query_to_start = start - query_subrange_containing_hit .start
288+
289+ # distance from beginning of ref to the variant start position:
290+ ref_to_start = target_subrange_containing_hit .start + query_to_start
291+
292+ # add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based
293+ if is_multi_position :
294+ variant .positions [idx ].position = ref_to_start + 1
295+ else :
296+ variant .positions .position = ref_to_start + 1
259297
260- variant .positions .position = variant .positions .position + tx .start
261298 return variant
262299
263300
@@ -368,14 +405,16 @@ def _map_protein_coding_pro(
368405 row : ScoreRow ,
369406 sequence_id : str ,
370407 transcript : TxSelectResult ,
408+ protein_align_result : AlignmentResult ,
371409) -> MappedScore :
372410 """Construct VRS object mapping for ``hgvs_pro`` variant column entry
373411
374412 These arguments are a little lazy and could be pruned down later
375413
376414 :param row: A row of output from a MaveDB score set
377415 :param sequence_id: The GA4GH accession for the provided sequence
378- :param transcript: The transcript selection information for a score set
416+ :param transcript: The transcript selection information for a target
417+ :param protein_align_result: The protein-protein alignment result for a target against its selected protein reference
379418 :return: VRS mapping object if mapping succeeds
380419 """
381420 if (
@@ -444,6 +483,7 @@ def _map_protein_coding_pro(
444483 row .hgvs_pro ,
445484 AnnotationLayer .PROTEIN ,
446485 tx = transcript ,
486+ protein_alignment = protein_align_result ,
447487 )
448488 post_mapped_protein = _construct_vrs_allele (
449489 post_mapped_hgvs_strings ,
@@ -729,6 +769,7 @@ def _map_protein_coding(
729769 records : list [ScoreRow ],
730770 transcript : TxSelectResult | TxSelectError ,
731771 align_result : AlignmentResult ,
772+ silent : bool ,
732773) -> list [MappedScore ]:
733774 """Perform mapping on protein coding experiment results
734775
@@ -748,6 +789,13 @@ def _map_protein_coding(
748789 sequence = metadata .target_sequence
749790 psequence_id = gsequence_id = store_sequence (sequence )
750791
792+ # align protein sequence to selected reference protein sequence to get offsets and gaps
793+ protein_align_result = None
794+ if isinstance (transcript , TxSelectResult ):
795+ protein_align_result = align_target_to_protein (
796+ sequence , transcript .sequence , silent
797+ )
798+
751799 variations : list [MappedScore ] = []
752800 for row in records :
753801 hgvs_nt_mappings = None
@@ -765,9 +813,16 @@ def _map_protein_coding(
765813 error_message = str (transcript ).strip ("'" ),
766814 )
767815 else :
768- if _hgvs_pro_is_valid (row .hgvs_pro ):
816+ if _hgvs_pro_is_valid (row .hgvs_pro ) and protein_align_result is not None :
769817 hgvs_pro_mappings = _map_protein_coding_pro (
770- row , psequence_id , transcript
818+ row , psequence_id , transcript , protein_align_result
819+ )
820+ # This should not occur because protein align result is only None if transcript selection failed, which should be caught by the TxSelectError check above.
821+ elif protein_align_result is None :
822+ hgvs_pro_mappings = MappedScore (
823+ accession_id = row .accession ,
824+ score = row .score ,
825+ error_message = "Could not perform mapping for protein variant because transcript sequence is missing or could not be aligned to reference sequence" ,
771826 )
772827 elif (
773828 not hgvs_nt_mappings
@@ -954,6 +1009,7 @@ def vrs_map(
9541009 records ,
9551010 transcript ,
9561011 align_result ,
1012+ silent ,
9571013 )
9581014 return _map_regulatory_noncoding (
9591015 metadata ,
0 commit comments