Skip to content

Commit 514c533

Browse files
committed
add to variant classifier, add document for it.
1 parent cb404d5 commit 514c533

2 files changed

Lines changed: 38 additions & 4 deletions

File tree

countess/plugins/variant.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ class VariantClassifier(DuckdbSqlPlugin):
139139
name = "Protein Variant Classifier"
140140
description = "Classifies protein variants into simple types"
141141
version = VERSION
142+
link = "https://countess-project.github.io/CountESS/included-plugins/#variant-classifier"
142143

143144
variant_col = ColumnChoiceParam("Protein variant Column", "variant")
144145

@@ -155,27 +156,31 @@ def sql(self, table_name: str, columns: Iterable[str]) -> Optional[str]:
155156

156157
hgvs_aa_re = "(?:" + "|".join(v for v in AA_CODES.values() if v != 'Ter') + ")"
157158
short_aa_re = "[" + "".join(k for k in AA_CODES if k != '*') + "]"
159+
plugin_label = duckdb_escape_literal(self.name + ": ")
158160

159161
return rf"""
160162
select S.*, case
161163
when T.is_wt != '' then 'W'
164+
when T.is_hgvs_mult != '' then case
165+
when T.hgvs_mult_rhs = 'del' then 'D'
166+
else 'I' end
162167
when T.is_hgvs != '' then case
163-
when T.is_hgvs_ins != '' then 'I'
164168
when T.hgvs_rhs = 'Ter' then 'N'
165169
when T.hgvs_rhs = 'del' then 'D'
170+
when T.hgvs_rhs = 'dup' then 'I'
166171
when T.hgvs_rhs = '=' then 'S'
167172
else 'M' end
168173
when T.is_short != '' then case
169174
when T.short_rhs = '=' then 'S'
170175
when T.short_rhs = '*' or T.short_rhs = 'X' then 'N'
171176
when T.short_rhs = '-' then 'D'
172177
else 'M' end
173-
else warning(concat('unclassifiable variant: "', z, '"'), '?') end as {output_col_id}
178+
else warning(concat({plugin_label}, 'unclassifiable variant: "', z, '"'), '?') end as {output_col_id}
174179
from {table_name} S join (
175180
select {variant_col_id} as z, unnest(regexp_extract(
176181
{variant_col_id},
177-
'^(_?[Ww][Tt]|p.=)$|^(p.{hgvs_aa_re}\d+(=|{hgvs_aa_re}|del|Ter|(_{hgvs_aa_re}\d+ins{hgvs_aa_re}+)))$|^({short_aa_re}\d+({short_aa_re}|[=*X-]))$',
178-
['is_wt','is_hgvs','hgvs_rhs','is_hgvs_ins','is_short','short_rhs']
182+
'^(_?[Ww][Tt]|p.=)$|^(p.{hgvs_aa_re}\d+(=|{hgvs_aa_re}|dup|del|Ter|(_{hgvs_aa_re}\d+(del|dup|ins{hgvs_aa_re}+))))$|^({short_aa_re}\d+({short_aa_re}|[=*X-]))$',
183+
['is_wt','is_hgvs','hgvs_rhs','is_hgvs_mult', 'hgvs_mult_rhs', 'is_short','short_rhs']
179184
))
180185
from {table_name}
181186
group by z

docs/included-plugins/index.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,35 @@ The reference sequence can either be provided directly as a configuration parame
271271

272272
*See also: [countess-minimap2 plugin](https://github.com/CountESS-Project/countess-minimap2), a variant caller which uses 'minimap2' to find sequences within a genome.*
273273

274+
### Variant Classifier
275+
276+
Takes a column of protein variants and classifies them into types:
277+
278+
Short designations:
279+
280+
|---|---|---|
281+
|format | type | explanation |
282+
|---|---|---|
283+
| `WT` <br/>`_WT` | `W` | Wild type |
284+
| `A107H` | `M` | Missense |
285+
| `A107A` <br/>`A107=` | `S` | Synonymous |
286+
| `A107*` <br/>`A107X` | `N` | Nonsense |
287+
| `A107-` | `D` | Deletion |
288+
289+
HGVS Protein designations:
290+
291+
|---|---|---|
292+
| format | type | explanation |
|---|---|---|
293+
| `p.=` | `W` | Wild type |
294+
| `p.Ala107His` | `M` | Missense |
295+
| `p.Ala107=` | `S` | Synonymous |
296+
| `p.Ala107Ter` | `N` | Nonsense |
297+
| `p.Ala107del` <br/>`p.Ala107_Glu108del` | `D` | Deletion |
298+
| `p.Ala107_Glu108insHis` <br/>`p.Ala107dup` <br/>`p.Ala107_Glu108dup` | `I` | Insertion / Duplication |
299+
300+
Other variant formats or invalid amino acid codes will generate a warning and the type will be set to `?`.
301+
There is currently no support for insertions in short designations.
302+
274303
#### Parameters
275304

276305
Input Column

0 commit comments

Comments
 (0)