#!/usr/bin/env bash
#
# Run fastp on the raw fastq files
#
# ----------------------------------------------------------------------------
set -Eeou pipefail
FQ=/path/to/raw-fastq/00_fastq
SAMPLES=/path/to/sample-names.txt
OUT=/path/to/01_fastp
THREADS=12
mkdir -p $OUT
for SAMPLE in $(cat $SAMPLES)
do
fastp -i $FQ/${SAMPLE}_R1.fastq.gz \
-I $FQ/${SAMPLE}_R2.fastq.gz \
-o $OUT/${SAMPLE}.trimmed.1.fq.gz \
-O $OUT/${SAMPLE}.trimmed.2.fq.gz \
-h $OUT/${SAMPLE}.fastp.html \
-j $OUT/${SAMPLE}.fastp.json \
--detect_adapter_for_pe \
-w $THREADS
done
ATAC-seq Analysis
Work in progress
Filter raw fastq files
Use fastp to perform quality trimming on raw fastq files. Turn on adapter detection for paired end reads.
sample-names.txt
is a plain text file listing the basenames for all samples. For example, if you have “sample1_R1.fq.gz”, “sample2_R1.fq.gz”, “sample3_R1.fq.gz” then sample-names.txt
would be:
sample1
sample2
sample3
This file gets used throughout.
Perform alignment with Bowtie2
Use --very-sensitive
mode and set max insertion size to 1,000. The output is piped to samtools fixmate
for adding mate tag information and then to samtools sort -n
to create name sorted BAM files.
It’s assumed that you have a bowtie2 index generated for your species of interest. The number of jobs, threads, and memory allowed per thread will all be machine dependent.
#!/usr/bin/env bash
#
# Align trimmed reads with bowtie2, fixmate tags, and output name sorted BAMs
#
# ----------------------------------------------------------------------------
set -Eeou pipefail
FQ=/path/to/01_fastp
SAMPLES=/path/to/sample-names.txt
OUT=/path/to/02_align
IDX=/path/to/bt2_idx
INSERT=1000
JOBS=6
THREADS=4
mkdir -p $OUT
parallel --jobs $JOBS \
"bowtie2 -x $IDX \
-1 $FQ/{}.trimmed.1.fq.gz \
-2 $FQ/{}.trimmed.2.fq.gz \
--very-sensitive \
--threads $THREADS \
--maxins $INSERT | \
samtools fixmate -m -@$THREADS - - | \
samtools sort -n -@$THREADS -m4G -o $OUT/{}.bam - " :::: $SAMPLES
Call peaks with Genrich
Genrich is a fast peak caller designed to work with single samples or replicates. It has a dedicated option for peak calling ATAC-seq samples.
The options used below:
- Output pileups over called regions in a pseudo bedfile (-k)
- Output the log file (-f) which can be used to recall peaks with different -p and -a values
- Exclude regions in blacklist (-E)
- Exclude reads mapping to chrM (-e)
- Call in ATAC-seq mode (-j)
- Remove PCR duplicates (-r)
- Specify p-value threshold for peak calling (-p)
- Skip name sorting check (-S)
#!/usr/bin/env bash
#
# Call ATAC-seq peaks using Genrich on the name sorted BAMs
#
# ----------------------------------------------------------------------------
set -Eeou pipefail
SAMPLES=/path/to/sample-names.txt
BAM=/path/to/02_align
OUT=/path/to/03_callpeak
EXCLUDE=/path/to/excluderanges.bed
PVAL=0.05
JOBS=6
parallel --jobs $JOBS \
"Genrich -t $BAM/{}.bam \
-o $OUT/{}.narrowPeak \
-k $OUT/{}.pileup.txt \
-f $OUT/{}.log \
-E $EXCLUDE \
-p $PVAL -j -r -e chrM -S " :::: $SAMPLES
Exclusion lists
The “excluderanges.bed” file used above contains blacklisted regions of high-mappability that can be excluded from peak calling. This BED file can be created using the excluderanges R package. For example, to generate the excluded regions for mm39:
suppressMessages(library(GenomicRanges))
suppressMessages(library(AnnotationHub))
<- AnnotationHub()
ah <- subset(ah, preparerclass == "excluderanges")
query_data <- query_data[["AH107321"]]
mm39_exclude_gr ::export.bed(mm39_exclude_gr, "mm39-excluderanges.bed") rtracklayer
Determine consensus peaks for replicate samples
After peak calling finishes, I create consensus peak calls for replicate samples. The consensus peak calls can be used in downstream differential accessibility analysis or for visualization. Creating consensus peak calls can be done in R by requiring all or some peaks to be called across replicate samples.
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(GenomicRanges))
# Read in peak calls
<- list.files(here("data", "04_callpeak"), pattern="*.narrowPeak", full.names=TRUE)
peak_files names(peak_files) <- gsub(".narrowPeak", "", basename(peak_files))
<- lapply(peak_files, rtracklayer::import)
peak_calls
# Separate into groups
<- peak_calls[c("control1", "control2", "control3")]
control_calls <- peak_calls[c("treatment1", "treatment2", "treatment3")]
treatment_calls <- GRangesList(control_calls)
control_calls <- GRangesList(treatment_calls)
treatment_calls
# Compute coverage across ranges
<- coverage(control_calls)
control_coverage <- coverage(treatment_calls)
treatment_coverage
# Determine regions with coverage in N replicates -- here require all 3 to have the same peaks
<- GRanges(slice(control_coverage, lower=length(control_calls), rangesOnly=TRUE))
control_covered <- GRanges(slice(treatment_coverage, lower=length(treatment_calls), rangesOnly=TRUE))
treatment_covered
# Close gaps in the resulting regions -- min.gapwidth can be adjusted
<- reduce(control_covered, min.gapwidth=101)
control_consensus <- reduce(treatment_covered, min.gapwidth=101)
treatment_consensus <- union(control_consensus, treatment_consensus)
all_consensus
# Export as BED files
::export.bed(control_consensus, here("data", "04_callpeak", "control-consensus.bed"))
rtracklayer::export.bed(treatment_consensus, here("data", "04_callpeak", "treatment-consensus.bed"))
rtracklayer::export.bed(all_consensus, here("data", "04_callpeak", "consensus.bed")) rtracklayer
Create raw signal tracks
Genrich outputs a bedgraph-ish file using the -k
flag that can be used to quickly create a a bigwig file of the raw signal which can be viewed in IGV. I find it useful to look at the raw signal tracks after peak calling so I can assess the absolute magnitude of the pileup that went into calling a peak. Then, if the peaks look too permissive or too stringent, adjust the peak calls.
Creating bigwigs from the bedgraph-ish files can be accomplished using awk
and bedGraphToBigWig from UCSC.
#!/usr/bin/env bash
#
# Convert the pileups computed by Genrich from bedGraph-ish files to bigwigs
#
# ----------------------------------------------------------------------------
set -Eeou pipefail
SAMPLES=/path/to/sample-names.txt
PEAKS=/path/to/03_callpeak
OUT=/path/to/04_bg2bw
CHROM_SIZES=/path/to/chrom.sizes
JOBS=6
mkdir -p $OUT
echo "Extracting bedGraphs from experimental signal..."
parallel --jobs $JOBS \
"awk 'NR > 2 { print \$1, \$2, \$3, \$4 }' $PEAKS/{}.pileup.txt | \
LC_ALL=C sort -k1,1 -k2,2n > $OUT/{}.sorted.bedGraph" :::: $SAMPLES
echo "Creating bigwigs from pileups..."
parallel --jobs $JOBS "bedGraphToBigWig $OUT/{}.sorted.bedGraph $CHROM_SIZES $OUT/{}.bw" :::: $SAMPLES
echo "Cleaning up intermediate bedGraphs..."
find $OUT -type f -name "*.bedGraph" -delete
Chrom sizes files
You can supply a URL to bedGraphToBigWig
to get the chrom.sizes if you’re genome is supported. Otherwise, you can create a chrom.sizes file using another UCSC utility, faSize
, and your genome fasta.
faSize -detailed -tab genome.fasta > genome.chrom.sizes
ATACSeqQC
TODO: complete this section
The ATACseqQC R package can be used to calculate various QC stats on the filtered BAM files. These should be assessed closely and compared to the ENCODE data standards.
Differential abundance using csaw
and edgeR
TODO: elaborate on this section
csaw can be used to perform differential abundance over sliding windows or by counting reads in the consensus peaks defined above. This paper contains some code for performing and assessing ATAC-seq DA analysis using both methods.
I usually generate read counts over the consensus peaks using featureCounts
from the Rsubread package. An SAF formatted file can be generated easily from the “consensus.bed” file. To get the read counts to align with the Genrich called peaks, set the -read2Pos 5
flag to count alignments over the 5’ end of reads.
Create TSS and region plots with deeptools
deeptools can be used to compute normalized coverage files in bigwig format for visualization in IGV. Normalization in deeptools
can be really context dependent and what normalization strategy to use can often be unclear.
If you’ve performed differential abundance analysis and have global scaling normalization factors, they can be used at this step to generate TMM normalized bigwigs. In the absence of scaling factors, I’ve seen people use RPGC normalization or CPM for ATAC-seq.
Then you can run deeptools
using the TSS regions of coding genes and called consensus peaks.
#!/usr/bin/env bash
#
# Compute normalized coverage bigwigs for visualization
#
# 'deeptools' mamba env must be activated before running
# -----------------------------------------------------------------------------
SAMPLES=/path/to/sample-names.txt
BLACKLIST=/path/to/excluderanges.bed
CODING=/path/to/coding.bed
BAM=/path/to/02_align
PEAKS=/path/to/consensus.bed
OUT=/path/to/05_deeptools
GENOME_SIZE=2654621783 # mm39 specific value, see https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html
JOBS=6
THREADS=4
mkdir -p $OUT
echo "Position sorting BAMs..."
parallel --jobs $JOBS "samtools sort -@$THREADS -m4G -o $OUT/{}.sorted.bam $BAM/{}.bam" :::: $SAMPLES
parallel --jobs $JOBS "samtools index $OUT/{}.sorted.bam" :::: $SAMPLES
echo "Computing coverage over all BAM files..."
parallel --jobs $JOBS \
"bamCoverage --bam $BAM/{}.sorted.bam \
--outFileName $OUT/{}.bw \
--outFileFormat 'bigwig' \
--normalizeUsing 'RPGC' \
--effectiveGenomeSize $GENOME_SIZE \
--binSize 1 \
--extendReads \
--blackListFileName $BLACKLIST \
--numberOfProcessors $THREADS \
--ignoreDuplicates" :::: $SAMPLES
echo "Computing matrix over TSS regions..."
computeMatrix reference-point \
--regionsFileName \
--scoreFileName $OUT/*.bw \
--outFileName $OUT/tss.mat.gz \
--referencePoint TSS \
--beforeRegionStartLength 3000 \
--afterRegionStartLength 3000 \
--blackListFileName $BLACKLIST \
--missingDataAsZero \
--numberOfProcessors $JOBS
echo "Computing coverage over peak BEDs..."
computeMatrix reference-point \
--regionsFileName $PEAKS \
--scoreFileName $OUT/*.bw \
--outFileName $OUT/peak.mat.gz \
--referencePoint 'center' \
--beforeRegionStartLength 3000 \
--afterRegionStartLength 3000 \
--blackListFileName $BLACKLIST \
--missingDataAsZero \
--numberOfProcessors $JOBS
echo "Plotting heatmap of TSS enrichment..."
plotHeatmap -m $OUT/tss.mat.gz \
-o $OUT/tss-heatmap.pdf \
--dpi 300 \
--colorMap "viridis" \
--boxAroundHeatmaps 'no' \
--samplesLabel "Ctl 1" "Ctl 2" "Ctl 3" "Trt 1" "Trt 2" "Trt 3" \
--regionsLabel "Protein Coding Genes" \
--yAxisLabel "RPGC" \
--perGroup \
--heatmapHeight 12 \
--heatmapWidth 8 \
--plotFileFormat "pdf"
echo "Plotting heatmap of MACS peak enrichment..."
plotHeatmap -m $OUT/peak.mat.gz \
-o $OUT/peak-heatmap.pdf \
--dpi 300 \
--colorMap "viridis" \
--boxAroundHeatmaps 'no' \
--samplesLabel "Ctl 1" "Ctl 2" "Ctl 3" "Trt 1" "Trt 2" "Trt 3" \
--regionsLabel "Consensus Peak Calls" \
--refPointLabel "Center of Peak" \
--yAxisLabel "RPGC" \
--perGroup \
--heatmapHeight 12 \
--heatmapWidth 8 \
--plotFileFormat "pdf"
Getting TSS coding region ranges
For ATAC-seq, I plot the enrichment over the TSS region for each sample as well as the coverage centered over the consensus peaks. To generate a BED file for protein coding gene regions (“coding.bed” above) in R:
# Get mouse annotation GTF, for example
<- "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M37/gencode.vM37.annotation.gtf.gz"
url <- rtracklayer::import(url)
gtf
# Extract the protein coding gene ranges only
<- gtf[gtf$gene_type == "protein_coding" & gtf$type == "gene", ]
coding
# rtracklayer complains if score is NA or non-numeric
$score <- 1L
coding
# Export to a BED file for use in deeptools
::export.bed(coding, here("doc", "gencode.vM37.coding.bed")) rtracklayer
Motif analysis with MEME
The MEME suite can be used to perform motif analysis on peaks that have been determined to have differential abundance. You can use R to generate fasta files for these regions based on GRanges extracted from DA analysis. These fasta files can then be used as input to XSTREME
, which performs de novo motif discovery, enrichment analysis, and comparisons with known motif databases.
For example, you can extract fasta regions like so (this uses hg38 as an example):
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38))
suppressPackageStartupMessages(library(GenomicRanges))
# Assuming you have a file from DA analysis that contains genomic coordinates
# of peaks and annotations - for example from ChIPseeker::annotatePeak()
<- fread("annotated-peaks.tsv")
peaks <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)
peaks
# Extract promoter peaks -- this can be any filtering you're interested in
<- peaks[peaks$annotation %like% "Promoter"]
promoter_peaks
# Extract and write out the hg38 sequences as fasta records
<- getSeq(BSgenome.Hsapiens.UCSC.hg38, promoter_peaks)
pmtr_seqs writeXStringSet(pmtr_seqs, filepath = "promoter-peaks.fasta", format = "fasta")
These fasta records can then be used in XSTREME
analysis from the MEME suite. The meme formatted databases of known motifs can be downloaded from MEME’s website
#!/usr/bin/env bash
#
# Run XSTREME motif analysis on promoter peak sequences
#
# -------------------------------------------------------------------------------------------------
FA=/path/to/promoter-peaks.fasta
DB=/path/to/HUMAN/HOCOMOCOv11_core_HUMAN_mono_meme_format.meme
OUT=/path/to/xstreme
xstreme --oc $OUT --p $FA --seed 123 --m $DB --no-pgc --dna