|
1 | 1 | --- |
2 | | -title: "Getting Started" |
3 | | -author: "<h4>Authors: <i>`r auths <- eval(parse(text = gsub('person','c',read.dcf('../DESCRIPTION', fields = 'Authors@R'))));paste(auths[names(auths)=='given'],auths[names(auths)=='family'], collapse = ', ')`</i></h4>" |
| 2 | +title: "Getting Started" |
| 3 | +author: "<h4>Authors: <i>`r auths <- eval(parse(text = gsub('person','c',read.dcf('../DESCRIPTION', fields = 'Authors@R'))));paste(auths[names(auths)=='given'],auths[names(auths)=='family'], collapse = ', ')`</i></h4>" |
4 | 4 | date: "<h4>Vignette updated: <i>`r format( Sys.Date(), '%b-%d-%Y')`</i></h4>" |
5 | 5 | output: |
6 | 6 | BiocStyle::html_document |
7 | 7 | vignette: > |
8 | | - %\VignetteIndexEntry{templateR} |
| 8 | + %\VignetteIndexEntry{echoLD} |
9 | 9 | %\usepackage[utf8]{inputenc} |
10 | 10 | %\VignetteEngine{knitr::rmarkdown} |
11 | 11 | --- |
12 | 12 |
|
13 | | -```{r setup} |
| 13 | +```{r setup, include = FALSE} |
| 14 | +knitr::opts_chunk$set( |
| 15 | + collapse = TRUE, |
| 16 | + comment = "#>" |
| 17 | +) |
| 18 | +``` |
| 19 | + |
| 20 | +# Introduction |
| 21 | + |
| 22 | +`echoLD` is an [echoverse](https://github.com/RajLabMSSM/echoverse) module |
| 23 | +for retrieving and processing linkage disequilibrium (LD) matrices. |
| 24 | +It supports multiple LD reference panels: |
| 25 | + |
| 26 | +- **1000 Genomes** Phase 1 and Phase 3 (computed on-the-fly from VCF) |
| 27 | +- **UK Biobank** (pre-computed LD matrices) |
| 28 | +- **Custom VCF** files supplied by the user |
| 29 | +- **Pre-computed LD matrices** in `.rds`, `.csv`, or `.tsv` format |
| 30 | + |
| 31 | +# Setup |
| 32 | + |
| 33 | +```{r load} |
14 | 34 | library(echoLD) |
| 35 | +``` |
| 36 | + |
| 37 | +The examples below use bundled data from the `echodata` package: |
| 38 | +a subset of summary statistics from the *BST1* locus and a |
| 39 | +pre-computed LD matrix. |
15 | 40 |
|
16 | | -query_dat <- echodata::BST1[seq(1,100),] |
| 41 | +```{r example_data} |
| 42 | +query_dat <- echodata::BST1[seq_len(50), ] |
17 | 43 | locus_dir <- file.path(tempdir(), echodata::locus_dir) |
| 44 | +LD_matrix <- echodata::BST1_LD_matrix |
18 | 45 | ``` |
19 | 46 |
|
| 47 | +# Pre-computed LD matrix |
| 48 | + |
| 49 | +If you already have an LD matrix on disk, `get_LD_matrix()` reads it in |
| 50 | +and aligns it with your summary statistics. |
20 | 51 |
|
21 | | -# 1000 Genomes: Phase 1 or 3 |
| 52 | +```{r get_LD_matrix} |
| 53 | +## Write the bundled LD matrix to a temp CSV to demonstrate the workflow |
| 54 | +LD_path <- tempfile(fileext = ".csv") |
| 55 | +utils::write.csv(LD_matrix, file = LD_path, row.names = TRUE) |
22 | 56 |
|
23 | | -```{r, eval=FALSE} |
24 | | -LD_1kgp3 <- echoLD::get_LD(locus_dir = locus_dir, |
25 | | - query_dat = query_dat, |
26 | | - LD_reference = "1KGphase3") # 1KGphase1 |
| 57 | +LD_list <- echoLD::get_LD_matrix( |
| 58 | + locus_dir = locus_dir, |
| 59 | + query_dat = query_dat, |
| 60 | + LD_reference = LD_path |
| 61 | +) |
27 | 62 | ``` |
28 | 63 |
|
29 | | -## Plot |
| 64 | +# Custom VCF |
| 65 | + |
| 66 | +You can compute LD on-the-fly from any VCF file. |
| 67 | +The `echodata` package ships a small 1000 Genomes Phase 3 VCF |
| 68 | +for the BST1 locus. |
30 | 69 |
|
31 | | -```{r, eval=FALSE} |
32 | | -echoLD::plot_LD(LD_matrix = LD_1kgp3$LD, |
33 | | - query_dat = LD_1kgp3$DT, |
34 | | - span = 20) |
| 70 | +```{r get_LD_vcf, eval = requireNamespace("VariantAnnotation", quietly = TRUE) && requireNamespace("snpStats", quietly = TRUE)} |
| 71 | +LD_reference <- system.file("extdata", "BST1.1KGphase3.vcf.bgz", |
| 72 | + package = "echodata") |
| 73 | +LD_vcf <- echoLD::get_LD_vcf( |
| 74 | + locus_dir = locus_dir, |
| 75 | + query_dat = query_dat, |
| 76 | + LD_reference = LD_reference |
| 77 | +) |
35 | 78 | ``` |
36 | 79 |
|
| 80 | +# 1000 Genomes (remote) |
| 81 | + |
| 82 | +Querying the full 1000 Genomes VCF requires downloading chromosome-level |
| 83 | +VCF files from a remote server. This requires an internet connection |
| 84 | +and can take several minutes. |
37 | 85 |
|
38 | | -# UK Biobank |
| 86 | +```{r get_LD_1kg, eval = FALSE} |
| 87 | +LD_1kgp3 <- echoLD::get_LD( |
| 88 | + locus_dir = locus_dir, |
| 89 | + query_dat = query_dat, |
| 90 | + LD_reference = "1KGphase3" |
| 91 | +) |
| 92 | +``` |
39 | 93 |
|
40 | | -*WARNING*: Takes substantially longer than 1000 Genomes methods. |
| 94 | +# UK Biobank (remote) |
41 | 95 |
|
42 | | -```{r, eval=FALSE} |
43 | | -LD_ukb <- echoLD::get_LD(locus_dir = locus_dir, |
44 | | - query_dat = query_dat, |
45 | | - LD_reference = "UKB", |
46 | | - download_method = "axel", |
47 | | - nThread = 10) |
| 96 | +Pre-computed LD from a British European-descent subset of UK Biobank. |
| 97 | +**Note:** This takes substantially longer than the 1000 Genomes methods. |
| 98 | + |
| 99 | +```{r get_LD_ukb, eval = FALSE} |
| 100 | +LD_ukb <- echoLD::get_LD( |
| 101 | + locus_dir = locus_dir, |
| 102 | + query_dat = query_dat, |
| 103 | + LD_reference = "UKB", |
| 104 | + download_method = "axel", |
| 105 | + nThread = 10 |
| 106 | +) |
48 | 107 | ``` |
49 | 108 |
|
50 | | -## Plot |
| 109 | +# Filtering LD |
| 110 | + |
| 111 | +Use `filter_LD()` to remove SNPs below a minimum r2 threshold. |
51 | 112 |
|
52 | | -```{r, eval=FALSE} |
53 | | -echoLD::plot_LD(LD_matrix = LD_ukb$LD, |
54 | | - query_dat = LD_ukb$DT, |
55 | | - span = 20) |
| 113 | +```{r filter_LD} |
| 114 | +LD_list_full <- list(LD = LD_matrix, DT = echodata::BST1) |
| 115 | +LD_list_filt <- echoLD::filter_LD(LD_list = LD_list_full, min_r2 = 0.2) |
| 116 | +dim(LD_list_filt$LD) |
56 | 117 | ``` |
57 | 118 |
|
| 119 | +# Sparse matrix utilities |
58 | 120 |
|
59 | | -# Custom VCF |
| 121 | +`echoLD` provides helpers for working with sparse LD matrices, |
| 122 | +which reduce file size substantially. |
60 | 123 |
|
61 | | -```{r, eval=FALSE} |
62 | | -LD_reference <- system.file("extdata","BST1.1KGphase3.vcf.bgz", |
63 | | - package = "echodata") |
64 | | -samples <- c("HG00097","HG00099","HG00100","HG00101","HG00102") |
65 | | -LD_custom <- echoLD::get_LD(locus_dir = locus_dir, |
66 | | - query_dat = query_dat, |
67 | | - LD_reference = LD_reference) |
| 124 | +```{r sparse} |
| 125 | +## Convert a dense matrix to sparse format |
| 126 | +sparse_mat <- echoLD::to_sparse(X = LD_matrix) |
| 127 | +class(sparse_mat) |
| 128 | +
|
| 129 | +## Save to disk |
| 130 | +sparse_path <- echoLD::saveSparse(LD_matrix = LD_matrix) |
| 131 | +file.info(sparse_path)$size |
| 132 | +
|
| 133 | +## Read it back |
| 134 | +sparse_read <- echoLD::readSparse(LD_path = sparse_path) |
| 135 | +dim(sparse_read) |
68 | 136 | ``` |
69 | 137 |
|
70 | | -## Plot |
| 138 | +# Plotting LD |
| 139 | + |
| 140 | +Plot a heatmap of pairwise LD around the lead SNP. |
71 | 141 |
|
72 | | -```{r, eval=FALSE} |
73 | | -echoLD::plot_LD(LD_matrix = LD_custom$LD, |
74 | | - query_dat = LD_custom$DT, |
75 | | - span = 20) |
| 142 | +```{r plot_LD, fig.width = 7, fig.height = 6} |
| 143 | +echoLD::plot_LD( |
| 144 | + LD_matrix = LD_matrix, |
| 145 | + query_dat = echodata::BST1, |
| 146 | + span = 10 |
| 147 | +) |
76 | 148 | ``` |
77 | 149 |
|
78 | | -# Session Info |
| 150 | +# Session Info |
79 | 151 |
|
80 | | -<details> |
| 152 | +<details> |
81 | 153 |
|
82 | 154 | ```{r Session Info} |
83 | 155 | utils::sessionInfo() |
84 | 156 | ``` |
85 | 157 |
|
86 | | -</details> |
| 158 | +</details> |
87 | 159 |
|
88 | 160 | <br> |
0 commit comments