forked from ave-dcd/dcd_mapping
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvrs_map.py
More file actions
1018 lines (886 loc) · 38.5 KB
/
vrs_map.py
File metadata and controls
1018 lines (886 loc) · 38.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Map transcripts to VRS objects."""
import logging
import os
from collections.abc import Iterable
from itertools import cycle
from Bio.Seq import Seq
from bioutils.accessions import infer_namespace
from cool_seq_tool.schemas import AnnotationLayer, Strand
from ga4gh.core import ga4gh_identify, sha512t24u
from ga4gh.vrs._internal.models import (
Allele,
Haplotype,
LiteralSequenceExpression,
ReferenceLengthExpression,
SequenceLocation,
SequenceString,
)
from ga4gh.vrs.normalize import normalize
from mavehgvs.util import parse_variant_strings
from mavehgvs.variant import Variant
from dcd_mapping.align import align_target_to_protein
from dcd_mapping.exceptions import (
MissingSequenceIdError,
UnsupportedReferenceSequenceNameSpaceError,
UnsupportedReferenceSequencePrefixError,
)
from dcd_mapping.lookup import (
cdot_rest,
get_chromosome_identifier,
get_seqrepo,
translate_hgvs_to_vrs,
)
from dcd_mapping.resource_utils import is_missing_value, request_with_backoff
from dcd_mapping.schemas import (
AlignmentResult,
MappedScore,
ScoreRow,
TargetGene,
TargetSequenceType,
TargetType,
TxSelectResult,
)
from dcd_mapping.transcripts import TxSelectError
__all__ = ["vrs_map"]
_logger = logging.getLogger(__name__)
CLINGEN_API_URL = os.environ.get("CLINGEN_API_URL", "https://reg.genome.network/allele")
def _hgvs_variant_is_valid(hgvs_string: str) -> bool:
return not hgvs_string.endswith((".=", ")", "X"))
def _process_any_aa_code(hgvs_pro_string: str) -> str:
"""Substitute "Xaa" for "?" in variation expression.
Some expressions seem to use the single-character "?" wildcard in the context of
three-letter amino acid codes. This is weird, and the proper replacement is "Xaa".
Note that we currently do NOT make any alterations to nucleotide strings that use
weird apparently-wildcard characters like "X" -- we just treat them as invalid (see
_hgvs_variant_is_valid()).
:param hgvs_string: MAVE HGVS expression
:return: processed variation (equivalent to input if no wildcard code found)
"""
if "?" in hgvs_pro_string:
_logger.debug("Substituting Xaa for ? in %s", hgvs_pro_string)
hgvs_pro_string = hgvs_pro_string.replace("?", "Xaa")
return hgvs_pro_string
def is_intronic_variant(variant: Variant) -> bool:
"""Return True if given Variant is intronic, otherwise return False.
Supports single or multi-position variants.
"""
if isinstance(variant.positions, Iterable):
if any(position.is_intronic() for position in variant.positions):
return True
else:
if variant.positions.is_intronic():
return True
return False
def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None:
"""Fetch the genomic HGVS string from ClinGen.
:param hgvs: The HGVS string to fetch
:return: The genomic HGVS string on GRCh38, or None if not found
"""
if CLINGEN_API_URL is None:
msg = "CLINGEN_API_URL environment variable is not set and default is unavailable."
_logger.error(msg)
raise ValueError(msg)
response = request_with_backoff(url=f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30)
if response.status_code == 200:
data = response.json()
for allele in data.get("genomicAlleles", []):
if allele.get("referenceGenome") == "GRCh38":
for coordinates in allele.get("hgvs", []):
if coordinates.startswith("NC_"):
return coordinates
return None
def _create_pre_mapped_hgvs_strings(
raw_description: str,
layer: AnnotationLayer,
tx: TxSelectResult | None = None,
alignment: AlignmentResult | None = None,
accession_id: str | None = None,
) -> list[str]:
"""Generate a list of (pre-mapped) HGVS strings from one long string containing many valid HGVS substrings
Currently, the provided transcript is used as the reference for the hgvs string, but this is inaccurate
because pre-mapped variants should be relative to the user-provided target sequence, not an external accession.
Any offset between the transcript and target sequence is not taken into account here (the variant position
is relative to the target sequence).
:param raw_description: A string containing valid HGVS sub-strings
:param layer: An enum denoting the targeted annotation layer of these HGVS strings
:param tx: A TxSelectResult object defining the transcript we are mapping to (or None).
:param alignment: An AlignmentResult object defining the alignment we are mapping to (or None).
:param accession_id: An accession id describing the reference sequence (for accession-based target gene variants)
:return: A list of HGVS strings prior to being mapped to the `tx` or `alignment`
"""
if layer is AnnotationLayer.PROTEIN and tx is None:
msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)."
raise ValueError(msg)
if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None:
msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}`)."
raise ValueError(msg)
raw_variant_strings = _parse_raw_variant_str(raw_description)
variants, errors = parse_variant_strings(raw_variant_strings)
hgvs_strings = []
for variant, error in zip(variants, errors, strict=True):
if error is not None:
msg = f"Variant could not be parsed by mavehgvs: {error}"
raise ValueError(msg)
# ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called
# therefore skip them there
if is_intronic_variant(variant):
msg = f"Variant is intronic and cannot be processed: {variant}"
raise ValueError(msg)
if accession_id:
hgvs_strings.append(accession_id + ":" + str(variant))
# Ideally we would create an HGVS string namespaced to GA4GH. The line below
# creates such a string, but it is not able to be parsed by the GA4GH VRS translator.
# hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))
elif layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help
hgvs_strings.append(tx.np + ":" + str(variant))
elif layer is AnnotationLayer.GENOMIC:
assert alignment # noqa: S101. mypy help
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)
else:
msg = (
f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}"
)
raise ValueError(msg)
return hgvs_strings
def _create_post_mapped_hgvs_strings(
raw_description: str,
layer: AnnotationLayer,
tx: TxSelectResult | None = None,
alignment: AlignmentResult | None = None,
accession_id: str | None = None,
protein_alignment: AlignmentResult | None = None,
) -> list[str]:
"""Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.
For protein annotations, these strings must be adjusted to match the alignment of the target to the selected reference protein sequence.
For genomic annotations, these strings must be adjusted to match the coordinates of the reference alignment.
:param raw_description: A string containing valid HGVS sub-strings
:param layer: An enum denoting the targeted annotation layer of these HGVS strings
:param tx: A TxSelectResult object defining the transcript we are mapping to (or None)
:param alignment: An AlignmentResult object defining the alignment we are mapping to (or None)
:param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants.
:return: A list of HGVS strings relative to the `tx` or `alignment`
"""
if layer is AnnotationLayer.PROTEIN and tx is None:
msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)."
raise ValueError(msg)
if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None:
msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)."
raise ValueError(msg)
raw_variants = _parse_raw_variant_str(raw_description)
variants, errors = parse_variant_strings(raw_variants)
hgvs_strings = []
for variant, error in zip(variants, errors, strict=True):
if error is not None:
msg = f"Variant could not be parsed by mavehgvs: {error}"
raise ValueError(msg)
# ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called
# therefore skip them there
if is_intronic_variant(variant):
msg = f"Variant is intronic and cannot be processed: {variant}"
raise ValueError(msg)
if layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help
variant = _adjust_protein_variant_to_ref(variant, protein_alignment)
hgvs_strings.append(tx.np + ":" + str(variant))
elif layer is AnnotationLayer.GENOMIC:
if accession_id:
pre_mapped_hgvs = accession_id + ":" + str(variant)
# use ClinGen to align accession-based variants to genomic reference
genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs)
if genomic_hgvs:
hgvs_strings.append(genomic_hgvs)
else:
msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}"
raise ValueError(msg)
else:
assert alignment # noqa: S101. mypy help
variant = _adjust_genomic_variant_to_ref(variant, alignment)
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)
else:
msg = (
f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}"
)
raise ValueError(msg)
return hgvs_strings
def _adjust_protein_variant_to_ref(
variant: Variant,
alignment: AlignmentResult,
) -> Variant:
# adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based
starts = []
if isinstance(variant.positions, Iterable):
is_multi_position = True
for position in variant.positions:
starts.append(position.position - 1)
else:
is_multi_position = False
starts.append(variant.positions.position - 1)
# get hit
query_subrange_containing_hit = None
target_subrange_containing_hit = None
for query_subrange, target_subrange in zip(
alignment.query_subranges, alignment.hit_subranges, strict=True
):
if all(
start >= query_subrange.start and start < query_subrange.end
for start in starts
):
query_subrange_containing_hit = query_subrange
target_subrange_containing_hit = target_subrange
break
if query_subrange_containing_hit is None or target_subrange_containing_hit is None:
msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence compared to the selected reference sequence"
raise ValueError(msg)
for idx, start in enumerate(starts):
# get variant start relative to the reference (the "hit")
# distance from beginning of query to variant start position:
query_to_start = start - query_subrange_containing_hit.start
# distance from beginning of ref to the variant start position:
ref_to_start = target_subrange_containing_hit.start + query_to_start
# add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based
if is_multi_position:
variant.positions[idx].position = ref_to_start + 1
else:
variant.positions.position = ref_to_start + 1
return variant
def _adjust_genomic_variant_to_ref(
variant: Variant,
alignment: AlignmentResult,
) -> Variant:
"""Adjust a variant relative to a provided alignment.
:param variant: A variant object relative to a scoreset's target sequence
:param alignment: An AlignmentResult object denoting the alignment we are mapping to
:return: A variant object that describes the variant relative to the provided alignment result
"""
# adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based
starts = []
if isinstance(variant.positions, Iterable):
is_multi_position = True
for position in variant.positions:
starts.append(position.position - 1)
else:
is_multi_position = False
starts.append(variant.positions.position - 1)
# get hit
query_subrange_containing_hit = None
target_subrange_containing_hit = None
for query_subrange, target_subrange in zip(
alignment.query_subranges, alignment.hit_subranges, strict=True
):
if all(
start >= query_subrange.start and start < query_subrange.end
for start in starts
):
query_subrange_containing_hit = query_subrange
target_subrange_containing_hit = target_subrange
break
if query_subrange_containing_hit is None or target_subrange_containing_hit is None:
msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence"
raise ValueError(msg)
for idx, start in enumerate(starts):
if alignment.strand is Strand.POSITIVE:
# get variant start relative to the reference (the "hit")
# distance from beginning of query to variant start position:
query_to_start = start - query_subrange_containing_hit.start
# distance from beginning of ref to the variant start position:
ref_to_start = target_subrange_containing_hit.start + query_to_start
else:
# picture the rev comp of the query/variant as mapping to the positive strand of the ref
# the start of the reverse complement of the variant is the end of the "original" variant
# so we need to know where the end of the original variant is, relative to the query molecule
end = start
# subtract 1 from end of hit range, because blat ranges are 0-based [start, end)
ref_to_start = (target_subrange_containing_hit.end - 1) - (
end - query_subrange_containing_hit.start
)
# add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based
if is_multi_position:
variant.positions[idx].position = ref_to_start + 1
else:
variant.positions.position = ref_to_start + 1
# get reverse complement of sequence if the target maps to the negative strand of the reference
if alignment.strand is Strand.NEGATIVE:
# variant._sequences can be a string or an iterable
if isinstance(variant._sequences, str):
variant._sequences = str(Seq(variant._sequences).reverse_complement())
elif variant._sequences is not None:
revcomp_sequences_list = []
for sequence in variant._sequences:
revcomp_sequences_list.append(str(Seq(sequence).reverse_complement()))
variant._sequences = revcomp_sequences_list
# reverse order of positions tuple
if is_multi_position:
variant._positions = tuple(reversed(list(variant.positions)))
# change prefix from c. to g. since variant is now relative to chr reference
variant._prefix = "g"
return variant
def _parse_raw_variant_str(raw_description: str) -> list[str]:
"""Parse a string which may contain many HGVS strings into a list of each one.
:param raw_description: A string that may contain a list of variant descriptions or a single variant description
:return: A list of HGVS strings
"""
# some variant strings follow mavehgvs format, meaning they don't have a reference sequence id and colon preceding the c./g./n./p. prefix
# the reference sequence information has previously been parsed for score sets with multiple targets,
# so can discard the reference sequence id and colon if they are present
# TODO check assumption of no colon unless reference sequence identifier is supplied!
if ":" in raw_description:
raw_description = raw_description.split(":")[1]
if "[" in raw_description:
prefix = raw_description[0:2]
return [prefix + var for var in set(raw_description[3:-1].split(";"))]
return [raw_description]
def _map_protein_coding_pro(
row: ScoreRow,
sequence_id: str,
transcript: TxSelectResult,
protein_align_result: AlignmentResult,
) -> MappedScore:
"""Construct VRS object mapping for ``hgvs_pro`` variant column entry
These arguments are a little lazy and could be pruned down later
:param row: A row of output from a MaveDB score set
:param sequence_id: The GA4GH accession for the provided sequence
:param transcript: The transcript selection information for a target
:param protein_align_result: The protein-protein alignment result for a target against its selected protein reference
:return: VRS mapping object if mapping succeeds
"""
if (
row.hgvs_pro in {"_wt", "_sy"}
or is_missing_value(row.hgvs_pro)
or len(row.hgvs_pro) == 3
):
_logger.warning(
"Can't process variant syntax %s for %s", row.hgvs_pro, row.accession
)
return MappedScore(
accession_id=row.accession,
score=row.score,
error_message=f"Can't process variant syntax {row.hgvs_pro}",
)
if "fs" in row.hgvs_pro:
_logger.warning(
"Can't process variant syntax %s for %s because protein frameshift variants are not supported",
row.hgvs_pro,
row.accession,
)
return MappedScore(
accession_id=row.accession,
score=row.score,
error_message="Protein frameshift variants are not supported",
)
# TODO: Handle edge cases without hardcoding URNs.
# Special case for experiment set urn:mavedb:0000097
if row.hgvs_pro.startswith("NP_009225.1:p."):
vrs_variation = translate_hgvs_to_vrs(row.hgvs_pro)
return MappedScore(
accession_id=row.accession,
score=row.score,
annotation_layer=AnnotationLayer.PROTEIN,
pre_mapped=vrs_variation,
post_mapped=vrs_variation,
)
try:
pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings(
row.hgvs_pro,
AnnotationLayer.PROTEIN,
tx=transcript,
)
pre_mapped_protein = _construct_vrs_allele(
pre_mapped_hgvs_strings,
AnnotationLayer.PROTEIN,
sequence_id,
True,
)
except Exception as e:
_logger.warning(
"An error occurred while generating pre-mapped protein variant for %s, accession %s: %s",
row.hgvs_pro,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession, score=row.score, error_message=str(e)
)
try:
post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings(
row.hgvs_pro,
AnnotationLayer.PROTEIN,
tx=transcript,
protein_alignment=protein_align_result,
)
post_mapped_protein = _construct_vrs_allele(
post_mapped_hgvs_strings,
AnnotationLayer.PROTEIN,
None,
False,
)
except Exception as e:
_logger.warning(
"An error occurred while generating post-mapped protein variant for %s, accession %s: %s",
row.hgvs_pro,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession,
score=row.score,
annotation_layer=AnnotationLayer.PROTEIN,
pre_mapped=pre_mapped_protein,
error_message=str(e).strip("'"),
)
return MappedScore(
accession_id=row.accession,
score=row.score,
annotation_layer=AnnotationLayer.PROTEIN,
pre_mapped=pre_mapped_protein,
post_mapped=post_mapped_protein,
)
def _map_genomic(
row: ScoreRow,
sequence_id: str,
align_result: AlignmentResult | None,
) -> MappedScore:
"""Construct VRS object mapping for ``hgvs_nt`` variant column entry
These arguments are a little lazy and could be pruned down later
:param row: A row of output from a MaveDB score set
:param sequence_id: The GA4GH accession for the provided sequence
:param align_result: The transcript selection information for a score set
:return: VRS mapping object if mapping succeeds
"""
namespace = infer_namespace(sequence_id)
if namespace is None:
if sequence_id.startswith("SQ."):
# if the sequence id starts with SQ, it is a target sequence which is in the ga4gh namespace
namespace = "ga4gh"
else:
return MappedScore(
accession_id=row.accession,
score=row.score,
error_message=f"Namespace could not be inferred from sequence: {sequence_id}",
)
if (
row.hgvs_nt in {"_wt", "_sy", "="}
or "fs"
in row.hgvs_nt # TODO I think this line can be taken out, I don't think fs nomenclature can be used for nt anyway
or len(row.hgvs_nt) == 3
):
_logger.warning(
"Can't process variant syntax %s for %s", row.hgvs_nt, row.accession
)
return MappedScore(
accession_id=row.accession,
score=row.score,
error_message=f"Can't process variant syntax {row.hgvs_nt}",
)
if align_result is None:
# for contig accession based score sets, no mapping is performed,
# so pre- and post-mapped alleles are the same
try:
pre_mapped_hgvs_strings = (
post_mapped_hgvs_strings
) = _create_pre_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
accession_id=sequence_id,
)
# accession-based pre-mapped alleles should be constructed like post-mapped alleles (sequence id is gathered from hgvs string rather than manually provided)
pre_mapped_genomic = _construct_vrs_allele(
pre_mapped_hgvs_strings,
AnnotationLayer.GENOMIC,
None,
False,
)
post_mapped_genomic = _construct_vrs_allele(
post_mapped_hgvs_strings,
AnnotationLayer.GENOMIC,
None,
False,
)
except Exception as e:
_logger.warning(
"An error occurred while generating genomic variant for %s, accession %s: %s",
row.hgvs_nt,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession, score=row.score, error_message=str(e)
)
elif namespace.lower() in ("refseq", "ncbi", "ensembl"):
# nm/enst way
try:
pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
accession_id=sequence_id,
)
# accession-based pre-mapped alleles should be constructed like post-mapped alleles (sequence id is gathered from hgvs string rather than manually provided)
pre_mapped_genomic = _construct_vrs_allele(
pre_mapped_hgvs_strings,
AnnotationLayer.GENOMIC,
None,
False,
)
except Exception as e:
_logger.warning(
"An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s",
row.hgvs_nt,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession, score=row.score, error_message=str(e)
)
try:
post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
accession_id=sequence_id,
)
post_mapped_genomic = _construct_vrs_allele(
post_mapped_hgvs_strings,
AnnotationLayer.GENOMIC,
None,
False,
)
except Exception as e:
_logger.warning(
"An error occurred while generating post-mapped genomic variant for %s, accession %s: %s",
row.hgvs_nt,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession,
score=row.score,
annotation_layer=AnnotationLayer.GENOMIC,
pre_mapped=pre_mapped_genomic,
error_message=str(e),
)
elif namespace.lower() == "ga4gh":
# target seq way
try:
pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
alignment=align_result,
)
pre_mapped_genomic = _construct_vrs_allele(
pre_mapped_hgvs_strings,
AnnotationLayer.GENOMIC,
sequence_id,
True,
)
except Exception as e:
_logger.warning(
"An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s",
row.hgvs_nt,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession, score=row.score, error_message=str(e)
)
try:
post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
alignment=align_result,
)
post_mapped_genomic = _construct_vrs_allele(
post_mapped_hgvs_strings,
AnnotationLayer.GENOMIC,
None,
False,
)
except Exception as e:
_logger.warning(
"An error occurred while generating post-mapped genomic variant for %s, accession %s: %s",
row.hgvs_nt,
row.accession,
str(e),
)
return MappedScore(
accession_id=row.accession,
score=row.score,
annotation_layer=AnnotationLayer.GENOMIC,
pre_mapped=pre_mapped_genomic,
error_message=str(e),
)
else:
msg = f"Unsupported reference sequence namespace: {namespace}"
raise UnsupportedReferenceSequenceNameSpaceError(msg)
return MappedScore(
accession_id=row.accession,
score=row.score,
annotation_layer=AnnotationLayer.GENOMIC,
pre_mapped=pre_mapped_genomic,
post_mapped=post_mapped_genomic,
)
def _get_allele_sequence(allele: Allele) -> str:
"""Get sequence for Allele
:param allele: VRS allele
:return: sequence
:raise ValueError: if sequence is none
"""
dp = get_seqrepo()
start = allele.location.start
end = allele.location.end
sequence = dp.get_sequence(
f"ga4gh:{allele.location.sequenceReference.refgetAccession}", start, end
)
if sequence is None:
raise ValueError
return sequence
def store_sequence(sequence: str) -> str:
"""Store sequence in SeqRepo.
:param sequence: raw sequence (ie nucleotides or amino acids)
:return: sequence ID (sans prefix, which is ``"ga4gh"``)
"""
sequence_id = f"SQ.{sha512t24u(sequence.encode('ascii'))}"
alias_dict_list = [{"namespace": "ga4gh", "alias": sequence_id}]
sr = get_seqrepo()
sr.sr.store(sequence, alias_dict_list)
return sequence_id
def _hgvs_nt_is_valid(hgvs_nt: str) -> bool:
"""Check for invalid or unavailable nucleotide MAVE-HGVS variation
:param hgvs_nt: MAVE_HGVS nucleotide expression
:return: True if expression appears populated and valid
"""
return (
(not is_missing_value(hgvs_nt))
and (hgvs_nt not in {"_wt", "_sy", "="})
and (len(hgvs_nt) != 3)
)
def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
"""Check for invalid or unavailable protein MAVE-HGVS variation
:param hgvs_nt: MAVE_HGVS protein expression
:return: True if expression appears populated and valid
"""
return (
(hgvs_pro not in {"_wt", "_sy"})
and (not is_missing_value(hgvs_pro))
and (len(hgvs_pro) != 3)
and ("fs" not in hgvs_pro)
)
def _map_protein_coding(
metadata: TargetGene,
records: list[ScoreRow],
transcript: TxSelectResult | TxSelectError,
align_result: AlignmentResult,
silent: bool,
) -> list[MappedScore]:
"""Perform mapping on protein coding experiment results
:param metadata: Target gene metadata from MaveDB API
:param records: The list of MAVE variants in a given score set
:param transcript: The transcript data for a score set, or an error message describing why an expected transcript is missing
:param align_results: The alignment data for a score set
:return: A list of mappings
"""
if metadata.target_sequence_type == TargetSequenceType.DNA:
sequence = str(
Seq(metadata.target_sequence).translate(table="1", stop_symbol="")
)
psequence_id = store_sequence(sequence)
gsequence_id = store_sequence(metadata.target_sequence)
else:
sequence = metadata.target_sequence
psequence_id = gsequence_id = store_sequence(sequence)
# align protein sequence to selected reference protein sequence to get offsets and gaps
protein_align_result = None
if isinstance(transcript, TxSelectResult):
protein_align_result = align_target_to_protein(
sequence, transcript.sequence, silent
)
variations: list[MappedScore] = []
for row in records:
hgvs_nt_mappings = None
hgvs_pro_mappings = None
if _hgvs_nt_is_valid(row.hgvs_nt):
hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result)
if (
isinstance(transcript, TxSelectError) and not hgvs_nt_mappings
): # only create error message if there is not an hgvs nt mapping
# TODO create pre mapped allele
hgvs_pro_mappings = MappedScore(
accession_id=row.accession,
score=row.score,
error_message=str(transcript).strip("'"),
)
else:
if _hgvs_pro_is_valid(row.hgvs_pro) and protein_align_result is not None:
hgvs_pro_mappings = _map_protein_coding_pro(
row, psequence_id, transcript, protein_align_result
)
# This should not occur because protein align result is only None if transcript selection failed, which should be caught by the TxSelectError check above.
elif protein_align_result is None:
hgvs_pro_mappings = MappedScore(
accession_id=row.accession,
score=row.score,
error_message="Could not perform mapping for protein variant because transcript sequence is missing or could not be aligned to reference sequence",
)
elif (
not hgvs_nt_mappings
): # only create error message if there is not an hgvs nt mapping
hgvs_pro_mappings = MappedScore(
accession_id=row.accession,
score=row.score,
error_message="Invalid protein variant syntax",
)
# append both pro and nt mappings if both available
if hgvs_pro_mappings:
variations.append(hgvs_pro_mappings)
if hgvs_nt_mappings:
variations.append(hgvs_nt_mappings)
return variations
def _map_regulatory_noncoding(
metadata: TargetGene,
records: list[ScoreRow],
align_result: AlignmentResult,
) -> list[MappedScore]:
"""Perform mapping on noncoding/regulatory experiment results
:param metadata: Target gene metadata from MaveDB API
:param records: list of MAVE experiment result rows
:param align_result: An AlignmentResult object for a score set
:return: A list of VRS mappings
"""
variations: list[MappedScore] = []
sequence_id = store_sequence(metadata.target_sequence)
for row in records:
hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result)
variations.append(hgvs_nt_mappings)
return variations
def store_accession(
accession_id: str,
) -> None:
namespace = infer_namespace(accession_id)
alias_dict_list = [{"namespace": namespace, "alias": accession_id}]
cd = cdot_rest()
sequence = cd.get_seq(accession_id)
sr = get_seqrepo()
sr.sr.store(sequence, alias_dict_list)
def _map_accession(
metadata: TargetGene,
records: list[ScoreRow],
align_result: AlignmentResult
| None, # NP and NC accessions won't have alignment results
transcript: TxSelectResult | None,
) -> list[MappedScore]:
variations: list[MappedScore] = []
sequence_id = metadata.target_accession_id
if sequence_id is None:
msg = " No target_accession_id was provided by target gene metadata. Target gene metadata must have a target_accession_id to map to VRS."
raise MissingSequenceIdError(msg)
store_accession(sequence_id)
# TODO full list of protein accession id prefixes
if metadata.target_accession_id.startswith(("NP", "ENSP")):
for row in records:
hgvs_pro_mappings = _map_protein_coding_pro(
row,
sequence_id,
transcript,
)
variations.append(hgvs_pro_mappings)
# TODO full list of transcript and contig accession id prefixes
elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")):
for row in records:
hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result)
variations.append(hgvs_nt_mappings)
else:
msg = f"Unrecognized accession prefix for accession id: {metadata.target_accession_id}"
raise UnsupportedReferenceSequencePrefixError(msg)
return variations
def _rle_to_lse(
rle: ReferenceLengthExpression, location: SequenceLocation
) -> LiteralSequenceExpression:
"""Coerce ReferenceLengthExpression to LiteralSequenceExpression.
RLEs are helpful for long repeating sequences but a) unnecessary here and b)
create incompatibilities with some data extraction further down so to simplify,
we'll just turn them into equivalent LiteralSequenceExpressions.
"""
sr = get_seqrepo()
sequence_id = location.sequenceReference.refgetAccession
start: int = location.start
end = start + rle.repeatSubunitLength
subsequence = sr.get_sequence(f"ga4gh:{sequence_id}", start, end)
c = cycle(subsequence)
derived_sequence = ""
for _ in range(rle.length):
derived_sequence += next(c)
return LiteralSequenceExpression(sequence=derived_sequence)
def _construct_vrs_allele(
hgvs_strings: list[str],
layer: AnnotationLayer,
sequence_id: str | None,
pre_map: bool,
) -> Allele | Haplotype:
alleles: list[Allele] = []
for hgvs_string in hgvs_strings:
_logger.info("Processing HGVS string: %s", hgvs_string)
allele = translate_hgvs_to_vrs(hgvs_string)
if pre_map:
if sequence_id is None:
msg = "Must provide sequence id to construct pre-mapped VRS allele"
raise ValueError(msg)
allele.location.sequenceReference.refgetAccession = sequence_id
else:
allele.location.sequenceReference.label = hgvs_string.split(":")[0]
if "dup" in hgvs_string:
allele.state.sequence = SequenceString(2 * _get_allele_sequence(allele))
# TODO check assumption that c.= leads to an "N" in the sequence.root
if allele.state.sequence.root == "N" and layer == AnnotationLayer.GENOMIC:
allele.state.sequence = SequenceString(_get_allele_sequence(allele))
if "=" in hgvs_string and layer == AnnotationLayer.PROTEIN:
allele.state.sequence = SequenceString(_get_allele_sequence(allele))
allele = normalize(allele, data_proxy=get_seqrepo())
if isinstance(allele.state, ReferenceLengthExpression):
_logger.debug(
"Coercing state for %s into LSE: %s",
hgvs_string,
allele.state.model_dump_json(),
)
allele.state = _rle_to_lse(allele.state, allele.location)
# Run ga4gh_identify to assign VA digest
allele.id = ga4gh_identify(allele)
alleles.append(allele)
if not alleles:
msg = f"Input variant hgvs_string(s) could not be translated to an allele: {hgvs_strings}."
raise ValueError(msg)
if len(alleles) > 1:
return Haplotype(members=alleles)
return alleles[0]
def vrs_map(
metadata: TargetGene,
align_result: AlignmentResult | None,
records: list[ScoreRow],
transcript: TxSelectResult | TxSelectError | None = None,
silent: bool = True,
) -> list[MappedScore]:
"""Given a description of a MAVE scoreset and an aligned transcript, generate
the corresponding VRS objects.
:param metadata: target gene metadata from MaveDB API
:param align_result: output from the sequence alignment process
:param records: scoreset records
:param transcript: output of transcript selection process