https://taylorpaisie.github.io/VEME_2023_NGS_Variant_Calling/
$ mkdir -p ~/variant_calling/data/untrimmed_fastq
$ cd ~/variant_calling/data/untrimmed_fastq
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/007/SRR1972917/SRR1972917_1.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/007/SRR1972917/SRR1972917_2.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/008/SRR1972918/SRR1972918_1.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/008/SRR1972918/SRR1972918_2.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/009/SRR1972919/SRR1972919_1.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/009/SRR1972919/SRR1972919_2.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/000/SRR1972920/SRR1972920_1.fastq.gz
$ curl -O ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR197/000/SRR1972920/SRR1972920_2.fastq.gz
$ gunzip SRR1972917_1.fastq.gz
$ head -n 4 SRR1972917_1.fastq
@SRR1972917.1 1/1
TCCGTGGGGCTGGTACGACAGTATCGATGAGGGTGGACGCTTCAAGGTCAAGCGTATACAGGTCAACCCCAAAGCTAGCCTGAGCCTTCAGAAACACCACC
+
@CCFDDFFHGHHHGIJJIIJJJJGJJJJGIJIJJFIJJJIIJJJJHHFHHFFFFAADEFEDDDDDDDDDDDD??CCCDDDDDDCCCDDCCCDD:?CABDDB
Quality encoding: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ
| | | | |
Quality score: 01........11........21........31........41
$ gzip SRR1972917_1.fastq
$ fastqc -h
$ fastqc *.fastq*
application/gzip
Started analysis of SRR1972917_1.fastq.gz
Approx 5% complete for SRR1972917_1.fastq.gz
Approx 10% complete for SRR1972917_1.fastq.gz
Approx 15% complete for SRR1972917_1.fastq.gz
Approx 20% complete for SRR1972917_1.fastq.gz
Approx 25% complete for SRR1972917_1.fastq.gz
Approx 30% complete for SRR1972917_1.fastq.gz
Approx 35% complete for SRR1972917_1.fastq.gz
Approx 40% complete for SRR1972917_1.fastq.gz
Approx 45% complete for SRR1972917_1.fastq.gz
Approx 50% complete for SRR1972917_1.fastq.gz
Approx 55% complete for SRR1972917_1.fastq.gz
Approx 60% complete for SRR1972917_1.fastq.gz
Approx 65% complete for SRR1972917_1.fastq.gz
Approx 70% complete for SRR1972917_1.fastq.gz
Approx 75% complete for SRR1972917_1.fastq.gz
Approx 80% complete for SRR1972917_1.fastq.gz
Approx 85% complete for SRR1972917_1.fastq.gz
Approx 90% complete for SRR1972917_1.fastq.gz
Approx 95% complete for SRR1972917_1.fastq.gz
Analysis complete for SRR1972917_1.fastq.gz
Lets now look at the files created by FastQC:
$ ls -al
$ cd ~/variant_calling
$ mkdir -p results/fastqc_untrimmed_reads
$ mv ~/variant_calling/data/untrimmed_fastq/*.zip ~/variant_calling/results/fastqc_untrimmed_reads
$ mv ~/variant_calling/data/untrimmed_fastq/*.html ~/variant_calling/results/fastqc_untrimmed_reads
$ cd ~/variant_calling/results/fastqc_untrimmed_reads
$ trimmomatic`
Usage:
PE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] [-validatePairs] [-basein <inputBase> | <inputFile1> <inputFile2>] [-baseout <outputBase> | <outputFile1P> <outputFile1U> <outputFile2P> <outputFile2U>] <trimmer1>...
or:
SE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] <inputFile> <outputFile> <trimmer1>...
or:
-version
While, in single end mode, Trimmomatic will expect 1 file as input, after which you can enter the optional settings and lastly the name of the output file
Running Trimmomatic:
Move to the correct directory with untrimmed fastq files we downloaded:
$ cd ~/variant_calling/data/untrimmed_fastq
Copy Illumina adapters from Trimmomatic into working directory:
$ cp /usr/local/share/Trimmomatic-main/adapters/NexteraPE-PE.fa .
Run Trimmomatic:
$ java -jar /usr/local/share/Trimmomatic-main/dist/jar/trimmomatic-0.40-rc1.jar PE SRR1972917_1.fastq.gz SRR1972917_2.fastq.gz SRR1972917_1.trim.fastq.gz SRR1972917_1un.trim.fastq.gz SRR1972917_2.trim.fastq.gz SRR1972917_2un.trim.fastq.gz SLIDINGWINDOW:4:20 MINLEN:25 ILLUMINACLIP:NexteraPE-PE.fa:2:40:15
TrimmomaticPE: Started with arguments:
SRR1972917_1.fastq.gz SRR1972917_2.fastq.gz SRR1972917_1.trim.fastq.gz SRR1972917_1un.trim.fastq.gz SRR1972917_2.trim.fastq.gz SRR1972917_2un.trim.fastq.gz SLIDINGWINDOW:4:20 MINLEN:25 ILLUMINACLIP:NexteraPE-PE.fa:2:40:15
Multiple cores found: Using 4 threads
Using PrefixPair: 'AGATGTGTATAAGAGACAG' and 'AGATGTGTATAAGAGACAG'
Using Long Clipping Sequence: 'GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG'
Using Long Clipping Sequence: 'TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG'
Using Long Clipping Sequence: 'CTGTCTCTTATACACATCTCCGAGCCCACGAGAC'
Using Long Clipping Sequence: 'CTGTCTCTTATACACATCTGACGCTGCCGACGA'
ILLUMINACLIP: Using 1 prefix pairs, 4 forward/reverse sequences, 0 forward only sequences, 0 reverse only sequences
Quality encoding detected as phred33
Input Read Pairs: 4377867 Both Surviving: 1241328 (28.35%) Forward Only Surviving: 2133670 (48.74%) Reverse Only Surviving: 18680 (0.43%) Dropped: 984189 (22.48%)
TrimmomaticPE: Completed successfully
List out files created by Trimmomatic:
$ ls SRR1972917*
Trimmed files should be smaller in size than our untrimmed fastq files
Running a for loop on all fastq files
$ for infile in *_1.fastq.gz
do
base=$(basename ${infile} _1.fastq.gz)
java -jar /usr/local/share/Trimmomatic-main/dist/jar/trimmomatic-0.40-rc1.jar PE ${infile} ${base}_2.fastq.gz
${base}_1.trim.fastq.gz ${base}_1un.trim.fastq.gz
${base}_2.trim.fastq.gz ${base}_2un.trim.fastq.gz
SLIDINGWINDOW:4:20 MINLEN:25 ILLUMINACLIP:NexteraPE-PE.fa:2:40:15
done
$ cd ~/variant_calling/data/untrimmed_fastq
$ mkdir -p ~/variant_calling/data/trimmed_fastq
$ mv *trim* ~/variant_calling/data/trimmed_fastq
$ cd ~/variant_calling/data/trimmed_fastq
$ ls -al
$ fastqc *trim.fastq.gz
KJ660346.2
and download fasta file$ mkdir -p ~/variant_calling/data/ref_genome
$ mv ~/Downloads/sequence.fasta ~/variant_calling/data/ref_genome/KJ660346.2.fasta
$ cd ~/variant_calling/data/ref_genome
$ cd ~/variant_calling/
$ mkdir -p results/sam results/bam results/bcf results/vcf
$ bwa index data/ref_genome/KJ660346.2.fasta
We will use the BWA-MEM algorithm, which is the latest and is generally recommended for high-quality queries as it is faster and more accurate
$ bwa mem data/ref_genome/KJ660346.2.fasta data/trimmed_fastq/SRR1972917_1.trim.fastq.gz data/trimmed_fastq/SRR1972917_2.trim.fastq.gz > results/sam/SRR1972917.aligned.sam
Convert SAM file to BAM format
$ samtools view -S -b results/sam/SRR1972917.aligned.sam > results/bam/SRR1972917.aligned.bam
Sort BAM file by coordinates
$ samtools sort -o results/bam/SRR1972917.aligned.sorted.bam results/bam/SRR1972917.aligned.bam
$ samtools flagstat results/bam/SRR1972917.aligned.sorted.bam
Calculate the read coverage of positions in the genome
$ bcftools mpileup -O b -o results/bcf/SRR1972917_raw.bcf -f data/ref_genome/KJ660346.2.fasta results/bam/SRR1972917.aligned.sorted.bam
--ploidy
, which is one for our Ebola genome. -m
allows for multiallelic and rare-variant calling, -v
tells the program to output variant sites only (not every site in the genome), and -o
specifies where to write the output file:$ bcftools call --ploidy 1 -m -v -o results/vcf/SRR1972917_variants.vcf results/bcf/SRR1972917_raw.bcf
$ vcfutils.pl varFilter results/vcf/SRR1972917_variants.vcf > results/vcf/SRR1972917_final_variants.vcf
Explore the VCF format:
$ less -S results/vcf/SRR1972917_final_variants.vcf
$ samtools index results/bam/SRR1972917.aligned.sorted.bam
tview
In order to visualize our mapped reads, we use tview, giving it the sorted bam file and the reference file:
$ samtools tview results/bam/SRR1972917.aligned.sorted.bam data/ref_genome/KJ660346.2.fasta
.
indicates a match to the reference sequence, so we can see that the consensus from our sample matches the reference in most locations?
for a help menug
Ctrl^C
or q
to exit tviewViewing with IGV
#### IGV is a stand-alone browser, which has the advantage of being installed locally and providing fast access. Web-based genome browsers, like Ensembl or the UCSC browser, are slower, but provide more functionality
#### They not only allow for more polished and flexible visualization, but also provide easy access to a wealth of annotations and external data sources
#### This makes it straightforward to relate your data with information about repeat regions, known genes, epigenetic features or areas of cross-species conservation, to name just a few
$ for infile in *_1.fastq.gz
do
base=$(basename ${infile} _1.fastq.gz)
trimmomatic PE ${infile} ${base}_2.fastq.gz
${base}_1.trim.fastq.gz ${base}_1un.trim.fastq.gz
${base}_2.trim.fastq.gz ${base}_2un.trim.fastq.gz
SLIDINGWINDOW:4:20 MINLEN:25 ILLUMINACLIP:NexteraPE-PE.fa:2:40:15
done
for
loop lesson)=
$variable_name
infile
, which was defined in the for statement, and base
, which was created from the filename during each iteration of the loop$ mkdir scripts
$ cd scripts/
$ touch run_variant_calling.sh
#! bin/bash/
set -e
cd ~/variant_calling/results
genome=~/variant_calling/data/ref_genome/KJ660346.2.fasta
bwa index $genome
# makes directories (should already be made)
# mkdir -p sam bam bcf vcf
for fq1 in ~/variant_calling/data/trimmed_fastq/*_1.trim.fastq.gz
do
echo "working with file $fq1"
base=$(basename $fq1 _1.trim.fastq.gz)
echo "base name is $base"
fq1=~/variant_calling/data/trimmed_fastq/${base}_1.trim.fastq.gz
fq2=~/variant_calling/data/trimmed_fastq/${base}_2.trim.fastq.gz
sam=~/variant_calling/results/sam/${base}.aligned.sam
bam=~/variant_calling/results/bam/${base}.aligned.bam
sorted_bam=~/variant_calling/results/bam/${base}.aligned.sorted.bam
raw_bcf=~/variant_calling/results/bcf/${base}_raw.bcf
variants=~/variant_calling/results/bcf/${base}_variants.vcf
final_variants=~/variant_calling/results/vcf/${base}_final_variants.vcf
bwa mem $genome $fq1 $fq2 > $sam
samtools view -S -b $sam > $bam
samtools sort -o $sorted_bam $bam
samtools index $sorted_bam
bcftools mpileup -O b -o $raw_bcf -f $genome $sorted_bam
bcftools call --ploidy 1 -m -v -o $variants $raw_bcf
vcfutils.pl varFilter $variants > $final_variants
done
$ cd ~/variant_calling/results
$ genome=~/variant_calling/data/ref_genome/KJ660346.2.fasta
$ bwa index $genome
results
that are neededfor fq1 in ~/variant_calling/data/trimmed_fastq/*_1.trim.fastq.gz
do
echo "working with file $fq1"
base=$(basename $fq1 _1.trim.fastq.gz)
echo "base name is $base"
.# input fastq files
fq1=~/variant_calling/data/trimmed_fastq/${base}_1.trim.fastq.gz
fq2=~/variant_calling/data/trimmed_fastq/${base}_2.trim.fastq.gz
# output files
sam=~/variant_calling/results/sam/${base}.aligned.sam
bam=~/variant_calling/results/bam/${base}.aligned.bam
sorted_bam=~/variant_calling/results/bam/${base}.aligned.sorted.bam
raw_bcf=~/variant_calling/results/bcf/${base}_raw.bcf
variants=~/variant_calling/results/vcf/${base}_variants.vcf
final_variants=~/variant_calling/results/vcf/${base}_final_variants.vcf
$ bwa mem $genome $fq1 $fq2 > $sam
$ samtools view -S -b $sam > $bam
$ samtools sort -o $sorted_bam $bam
$ samtools index $sorted_bam
$ bcftools mpileup -O b -o $raw_bcf -f $genome $sorted_bam
$ bcftools call --ploidy 1 -m -v -o $variants $raw_bcf
$ vcfutils.pl varFilter $variants > $final_variants
$ bash run_variant_calling.sh