-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenomics_project_Q1.R
More file actions
121 lines (90 loc) · 4.85 KB
/
genomics_project_Q1.R
File metadata and controls
121 lines (90 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
### Genomics project part 1
################################################################
#Or Shahar 206582017
#Daniel Brooker 315015594
################################################################
# This R script performs differential gene expression analysis on RNA-seq data
# The data was previously processed using Kallisto for transcript quantification
# Kallisto was run in Python to generate abundance estimates for each sample
# The Kallisto output files are used as input for this analysis
# Install packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("tximport")
BiocManager::install("readr")
BiocManager::install("biomaRt")
# Load libraries
library(tximport)
library(readr)
library(GenomicFeatures)
library(biomaRt)
# Load the list of Kallisto output files
files <- c("C:/Users/danie/OneDrive/מסמכים/R_programing/C1_output.tsv",
"C:/Users/danie/OneDrive/מסמכים/R_programing/C2_output.tsv",
"C:/Users/danie/OneDrive/מסמכים/R_programing/C3_output.tsv",
"C:/Users/danie/OneDrive/מסמכים/R_programing/KO1_output.tsv",
"C:/Users/danie/OneDrive/מסמכים/R_programing/KO2_output.tsv",
"C:/Users/danie/OneDrive/מסמכים/R_programing/KO3_output.tsv")
# Path to the GTF file
gtf_file <- "C:/Users/danie/OneDrive/מסמכים/R_programing/Mus_musculuss.GRCm39.112.gtf"
# Create a TxDb object from the GTF file
# This step creates a transcript database from the GTF file
txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")
# Create a mapping between transcripts and genes
# Purpose: To establish a relationship between individual transcripts and their corresponding genes
# Importance: Essential for aggregating transcript-level data to gene-level
tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME")
# Import the Kallisto data using tximport
# Purpose: To convert transcript-level abundance estimates from Kallisto to gene-level count data
# Importance:
# 1. Allows for gene-level analysis
# 2. Prepares data in a format suitable for downstream differential expression analysis in DESeq2
# 3. Accounts for uncertainty in transcript quantification when summarizing to gene-level
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene, txOut = FALSE, ignoreTxVersion = TRUE)
# Edit column names
colnames(txi$counts) <- c("C1", "C2", "C3", "KO1", "KO2", "KO3")
# Check and display the first few rows of the count data
head(txi$counts)
# Load DESeq2 library for differential expression analysis
library(DESeq2)
# Create a data frame with condition information
# This matches the experimental design: 3 control samples and 3 knockout samples
coldata <- data.frame (condition = factor(c("control", "control", "control", "knockout", "knockout", "knockout")))
# Round the count values to integers
# DESeq2 requires integer count data
txi$counts <- round(txi$counts)
# Create a DESeqDataSet object
# This combines the count data with the experimental design
dds <- DESeqDataSetFromMatrix(countData = txi$counts, colData = coldata, design = ~ condition)
# Run DESeq2 analysis
dds <- DESeq(dds)
# Extract the results of differential expression analysis
res <- results(dds)
head(res)
# Sort results by log2FoldChange
res_ordered <- res[order(res$log2FoldChange, decreasing = TRUE), ]
# Extract Ensemble IDs after sorting
ensembl_ids <- rownames(res_ordered)
# Connect to Ensemble database for mouse
# This allows us to map Ensemble IDs to gene symbols for easier interpretation
ensembl <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
# Map Ensemble IDs to gene symbols
gene_mapping <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name'), filters = 'ensembl_gene_id', values = ensembl_ids, mart = ensembl)
# Display the mapping
head(gene_mapping)
# Add gene names to the results table
res_ordered$gene_name <- gene_mapping$external_gene_name[match(rownames(res_ordered), gene_mapping$ensembl_gene_id)]
# Display updated results
head(res_ordered)
# Filter for significantly differential expressed genes (p-adj < 0.05)
significant_genes <- subset(res_ordered, padj < 0.05)
# Save the list of significant genes to a file
write.table(significant_genes, file="significant_genes_with_names.txt", sep="\t", row.names=FALSE)
# Filter genes higher expressed in control
higher_in_control <- subset(significant_genes, log2FoldChange > 0)
# Filter genes higher expressed in knockout
higher_in_knockout <- subset(significant_genes, log2FoldChange < 0)
# Save genes higher expressed in control to a file
write.table(higher_in_control, file="higher_in_control.txt", sep="\t", row.names=FALSE)
# Save genes higher expressed in knockout to a file
write.table(higher_in_knockout, file="higher_in_knockout.txt", sep="\t", row.names=FALSE)