library("GenomicFeatures")
# Load in the genome txdb
<- "/path/to/gtf_file.gtf.gz"
gtf_file <- makeTxDbFromGFF(gtf_file, format="gtf")
txdb <- exonsBy(txdb, by="gene", use.names = FALSE)
exonsByGene
# Read in sra metadata and save it to a DataFrame called sampleTable
<- DataFrame(read.csv("./doc/metadata/RunInfo.csv"))
sampleTable rownames(sampleTable) <- sampleTable$Run
# Load in alignment files
<- fls <- list.files(
fls "/path/to/02_STAR_outs",
full.names = TRUE,
pattern = "*.bam$"
)
library("Rsamtools")
<- BamFileList(fls)
bam_fls
# Extract sample IDs from file names
names(fls) <- stringr::str_match(
fls,"SRR[0-9]+"
1]
)[,
# Only keep observations that you have in your experiment
<- sampleTable[names(fls),]
sampleTable
# Quantify reads using GenomicAlignments package
library("BiocParallel")
register(MulticoreParam(workers=8))
library("GenomicAlignments")
<- summarizeOverlaps(
se features = exonsByGene,
reads = bam_fls,
mod = "Union",
singleEnd = FALSE,
ignore.strand = TRUE,
fragments = TRUE
)
# Annotate SummarizedExperiment with colData and rowData
<- rtracklayer::import(gtf_file, format="gtf")
gtf_gr <- subset(gtf_gr, type == "gene")
genes names(genes) <- genes$gene_id
colData(se) <- DataFrame(sampleTable)
rowData(se) <- genes[rownames(se)]
# Save your output
dir.create("/path/to/03_RNAseq_counts", showWarnings = FALSE)
saveRDS(se, file = "/path/to/03_RNAseq_counts/se.rds")
Introduction to Bioinformatics
Welcome! In this tutorial, we’ll walk through the common steps of an RNA‑seq analysis pipeline.
Downloading FASTQ files from SRA
First, we need to obtain the raw sequencing reads (FASTQ files) from the Sequence Read Archive (SRA). We’ll use prefetch to download data and fasterq-dump to extract FASTQ files.
#!/usr/bin/env bash
#
# Obtain fastq files from SRA using prefetch + fasterq-dump
#
# ----------------------------------------------
SAMPLES=sra-names.txt # A text file listing SRR ID's from SRA Run Selector
OUT=/path/to/put/00_fastq
mkdir -p $OUT
for SAMPLE in $(cat $SAMPLES)
do
prefetch $SAMPLE
fasterq-dump $SAMPLE
done
Downloading metadata for the FASTQ files from SRA
Next, it’s important to gather metadata (like sample descriptions) associated with our sequencing runs. We’ll query the SRA database by project ID and save the run information.
#!/usr/bin/env bash
#
# Obtain metadata for fastq files from SRA using esearch + efetch
#
# ----------------------------------------------
PRJ=PRJNA229998 # SRA project ID from SRA Run Selector
OUT=/docs/metadata
mkdir -p $OUT
esearch -db sra -query $PRJ | efetch -format runinfo > ${OUT}/RunInfo.csv
Trimming and filtering reads
We should remove adapters and low‑quality bases using fastp. This improves downstream alignment.
#!/usr/bin/env bash
#
# Run fastp on the raw fastq files
#
# ----------------------------------------------------------------------------
set -Eeou pipefail
FQ=/path/to/00_fastq # Directory containing raw fastq files
SAMPLES=sample-names.txt # A text file listing basenames of fastq files
OUT=/path/to/put/01_fastp # Where to save the fastp results
THREADS=8
mkdir -p $OUT
for SAMPLE in $(cat $SAMPLES)
do
fastp -i $FQ/${SAMPLE}_R1.fq.gz \
-I $FQ/${SAMPLE}_R2.fq.gz \
-o $OUT/${SAMPLE}.trimmed.1.fq.gz \
-O $OUT/${SAMPLE}.trimmed.2.fq.gz \
-h $OUT/${SAMPLE}.fastp.html \
-j $OUT/${SAMPLE}.fastp.json \
-w $THREADS
done
Aligning reads
We’ll use STAR to perform splice-aware alignment, which is ideal for RNA‑seq data.
#!/usr/bin/env bash
#
# Align reads with STAR
#
# ----------------------------------------------------------------------------
set -Eeou pipefail
SAMPLES=sample-names.txt # Same sample-names.txt file as above
FQ=/path/to/01_fastp # Directory containing the fastp output
OUT=/path/to/03_STAR_outs # Where to save the STAR results
IDX=/path/to/STAR-idx # Index used by STAR for alignment
THREADS=24
mkdir -p $OUT
for SAMPLE in $(cat $SAMPLES)
do
STAR --runThreadN $THREADS \
--genomeDir $IDX \
--readFilesIn ${FQ}/${SAMPLE}.trimmed.1.fq.gz ${FQ}/${SAMPLE}.trimmed.2.fq.gz \
--readFilesCommand zcat \
--outFilterType BySJout \
--outFileNamePrefix ${OUT}/${SAMPLE}_ \
--alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 \
--outFilterMismatchNmax 999 \
--outFilterMismatchNoverReadLmax 0.04 \
--alignIntronMin 20 \
--alignIntronMax 1000000 \
--alignMatesGapMax 1000000 \
--outMultimapperOrder Random \
--outSAMtype BAM SortedByCoordinate \
--quantMode - # Do not perform any quantifications
done
Counting reads
We’ll use summarizeOverlaps in R to count reads overlapping exonic features.