data(vcfR_example)
vcf

## ***** Object of Class vcfR *****
## 18 samples
## 1 CHROMs
## 2,533 variants
## Object size: 3.2 Mb
## 8.497 percent missing data
## *****        *****         *****

strwrap(vcf@meta[1:7])

## [1] "##fileformat=VCFv4.1"                                                   
## [2] "##source=\"GATK haplotype Caller, phased with beagle4\""                
## [3] "##FILTER=<ID=LowQual,Description=\"Low quality\">"                      
## [4] "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for" 
## [5] "the ref and alt alleles in the order listed\">"                         
## [6] "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read"   
## [7] "depth (reads with MQ=255 or with bad mates are filtered)\">"            
## [8] "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"
## [9] "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"

queryMETA(vcf)

##  [1] "FILTER=ID=LowQual"                   
##  [2] "FORMAT=ID=AD"                        
##  [3] "FORMAT=ID=DP"                        
##  [4] "FORMAT=ID=GQ"                        
##  [5] "FORMAT=ID=GT"                        
##  [6] "FORMAT=ID=PL"                        
##  [7] "GATKCommandLine=ID=HaplotypeCaller"  
##  [8] "INFO=ID=AC"                          
##  [9] "INFO=ID=AF"                          
## [10] "INFO=ID=AN"                          
## [11] "INFO=ID=BaseQRankSum"                
## [12] "INFO=ID=ClippingRankSum"             
## [13] "INFO=ID=DP"                          
## [14] "INFO=ID=DS"                          
## [15] "INFO=ID=FS"                          
## [16] "INFO=ID=HaplotypeScore"              
## [17] "INFO=ID=InbreedingCoeff"             
## [18] "INFO=ID=MLEAC"                       
## [19] "INFO=ID=MLEAF"                       
## [20] "INFO=ID=MQ"                          
## [21] "INFO=ID=MQ0"                         
## [22] "INFO=ID=MQRankSum"                   
## [23] "INFO=ID=QD"                          
## [24] "INFO=ID=ReadPosRankSum"              
## [25] "INFO=ID=SOR"                         
## [26] "1 contig=<IDs omitted from queryMETA"

queryMETA(vcf, element = 'DP')

## [[1]]
## [1] "FORMAT=ID=DP"                                                                         
## [2] "Number=1"                                                                             
## [3] "Type=Integer"                                                                         
## [4] "Description=Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
## 
## [[2]]
## [1] "INFO=ID=DP"                                                           
## [2] "Number=1"                                                             
## [3] "Type=Integer"                                                         
## [4] "Description=Approximate read depth; some reads may have been filtered"

head(getFIX(vcf))

##      CHROM              POS   ID REF ALT QUAL     FILTER
## [1,] "Supercontig_1.50" "2"   NA "T" "A" "44.44"  NA    
## [2,] "Supercontig_1.50" "246" NA "C" "G" "144.21" NA    
## [3,] "Supercontig_1.50" "549" NA "A" "C" "68.49"  NA    
## [4,] "Supercontig_1.50" "668" NA "G" "C" "108.07" NA    
## [5,] "Supercontig_1.50" "765" NA "A" "C" "92.78"  NA    
## [6,] "Supercontig_1.50" "780" NA "G" "T" "58.38"  NA

vcf@gt[1:6, 1:4]

##      FORMAT           BL2009P4_us23               DDR7602                  
## [1,] "GT:AD:DP:GQ:PL" "0|0:62,0:62:99:0,190,2835" "0|0:12,0:12:39:0,39,585"
## [2,] "GT:AD:DP:GQ:PL" "1|0:5,5:10:99:111,0,114"   NA                       
## [3,] "GT:AD:DP:GQ:PL" NA                          NA                       
## [4,] "GT:AD:DP:GQ:PL" "0|0:1,0:1:3:0,3,44"        NA                       
## [5,] "GT:AD:DP:GQ:PL" "0|0:2,0:2:6:0,6,49"        "0|0:1,0:1:3:0,3,34"     
## [6,] "GT:AD:DP:GQ:PL" "0|0:2,0:2:6:0,6,49"        "0|0:1,0:1:3:0,3,34"     
##      IN2009T1_us22              
## [1,] "0|0:37,0:37:99:0,114,1709"
## [2,] "0|1:2,1:3:16:16,0,48"     
## [3,] "0|0:2,0:2:6:0,6,51"       
## [4,] "1|1:0,1:1:3:25,3,0"       
## [5,] "0|0:1,0:1:3:0,3,31"       
## [6,] "0|0:3,0:3:9:0,9,85"

Exercise 1

How would we find more information about read.vcfR()? ?read.vcfR
How would we learn what the acronym “AD” stands for?

queryMETA(vcf, element = 'AD')

## [[1]]
## [1] "FORMAT=ID=AD"                                                              
## [2] "Number=."                                                                  
## [3] "Type=Integer"                                                              
## [4] "Description=Allelic depths for the ref and alt alleles in the order listed"

We used the head() function to view the first few lines of fix data. How would we view the last few lines of fix data?

tail(getFIX(vcf))

##         CHROM              POS     ID REF ALT QUAL      FILTER
## [2528,] "Supercontig_1.50" "99946" NA "T" "A" "133.41"  NA    
## [2529,] "Supercontig_1.50" "99961" NA "T" "C" "90.77"   NA    
## [2530,] "Supercontig_1.50" "99969" NA "G" "T" "6547.20" NA    
## [2531,] "Supercontig_1.50" "99975" NA "T" "A" "48.24"   NA    
## [2532,] "Supercontig_1.50" "99983" NA "T" "G" "7172.77" NA    
## [2533,] "Supercontig_1.50" "99989" NA "C" "T" "7314.05" NA

There is a column in the fix portion of the data called QUAL. It is not defined in the meta portion of the data because it is defined in the VCF specification. It stands for ‘quality’. Does QUAL appear useful to us? Why or why not?

Yes it is useful. Vcf software tend to aggresively call variants. QuAL will be useful to filter our low quality calls.

How would we query the sample names?

colnames(vcf@gt)

##  [1] "FORMAT"        "BL2009P4_us23" "DDR7602"       "IN2009T1_us22"
##  [5] "LBUS5"         "NL07434"       "P10127"        "P10650"       
##  [9] "P11633"        "P12204"        "P13527"        "P1362"        
## [13] "P13626"        "P17777us22"    "P6096"         "P7722"        
## [17] "RS2009P1_us8"  "blue13"        "t30-4"

Part II - Analysis of Genome Data

vcf <- read.vcfR("data/pinfsc50_filtered.vcf.gz")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 29
##   header_line: 30
##   variant count: 2190
##   column count: 27
## 
Meta line 29 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2190
##   Character matrix gt cols: 27
##   skip: 0
##   nrows: 2190
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2190
## All variants processed

vcf

## ***** Object of Class vcfR *****
## 18 samples
## 1 CHROMs
## 2,190 variants
## Object size: 2.9 Mb
## 0 percent missing data
## *****        *****         *****

x <- vcfR2genlight(vcf) # Convert to genlight object

##  /// GENLIGHT OBJECT /////////
## 
##  // 18 genotypes,  2,146 binary SNPs, size: 240.4 Kb
##  0 (0 %) missing data
## 
##  // Basic content
##    @gen: list of 18 SNPbin
## 
##  // Optional content
##    @ind.names:  18 individual labels
##    @loc.names:  2146 locus labels
##    @chromosome: factor storing chromosomes of the SNPs
##    @position: integer storing positions of the SNPs
##    @other: a list containing: elements without names

# vcfR
gt <- extract.gt(vcf, element = "GT")
gt[c(2,6,18), 1:3]

##                        BL2009P4_us23 DDR7602 IN2009T1_us22
## Supercontig_1.50_80063 "1|0"         "1|0"   "0|1"        
## Supercontig_1.50_80089 "0|0"         "1|0"   "0|1"        
## Supercontig_1.50_94108 "0|1"         "0|1"   "1|1"

# genlight
t(as.matrix(x))[c(1,5,17), 1:3]

##                        BL2009P4_us23 DDR7602 IN2009T1_us22
## Supercontig_1.50_80063             1       1             1
## Supercontig_1.50_80089             0       1             1
## Supercontig_1.50_94108             1       1             2

Set up population data

pop(x) <- as.factor(c("us", "eu", "us", "af", "eu", "us", "mx", "eu", "eu", "sa", "mx", "sa", "us", "sa", "Pmir", "us", "eu", "eu"))
popNames(x)

## [1] "af"   "eu"   "mx"   "Pmir" "sa"   "us"

ploidy(x) <- 2

Distance matrices

x.dist <- dist(x)

x.dist <- poppr::bitwise.dist(x)

Creating chromR object

# Find the files.
vcf_file <- system.file("extdata", "pinf_sc50.vcf.gz", package = "pinfsc50")
dna_file <- system.file("extdata", "pinf_sc50.fasta", package = "pinfsc50")
gff_file <- system.file("extdata", "pinf_sc50.gff", package = "pinfsc50")

# Input the files.
vcf <- read.vcfR(vcf_file, verbose = FALSE)
dna <- ape::read.dna(dna_file, format = "fasta")
gff <- read.table(gff_file, sep="\t", quote="")

# Create a chromR object.
chrom <- create.chromR(name="Supercontig", vcf=vcf, seq=dna, ann=gff, verbose=TRUE)

chrom

## *****   Class chromR, method Show   *****
## Name: Supercontig 
## Chromosome length: 1,042,442 bp
##   Chromosome labels: Supercontig_1.50 of Phytophthora infestans T30-4
## Annotation (@ann) count: 223 
##   Annotation chromosome names: Supercontig_1.50
## Variant (@vcf) count: 22,031 
##   Variant (@vcf) chromosome names: Supercontig_1.50
## Object size: 24 Mb
## Use head(object) for more details.
## *****      End Show (chromR)        *****

plot(chrom)

chromoqc(chrom, dp.alpha = 66)

chrom <- proc.chromR(chrom, verbose = TRUE)

plot(chrom)

chromoqc(chrom, dp.alpha = 66)

#vcf <- read.vcfR("data/pinfsc50_qc.vcf.gz", verbose = FALSE)
vcf <- read.vcfR("data/pinfsc50_filtered.vcf.gz", verbose = FALSE)
chrom <- create.chromR(name="Supercontig", vcf=vcf, seq=dna, ann=gff, verbose=FALSE)
chrom <- proc.chromR(chrom, verbose = FALSE)
chromoqc(chrom, dp.alpha = 66)

head(chrom@var.info)

##              CHROM   POS    MQ  DP mask  n Allele_counts         He       Ne
## 1 Supercontig_1.50 80058 58.96 508 TRUE 18       25,10,1 0.64364712 2.806207
## 2 Supercontig_1.50 80063 58.95 514 TRUE 18         25,11 0.42438272 1.737265
## 3 Supercontig_1.50 80067 58.88 499 TRUE 18         23,13 0.46141975 1.856734
## 4 Supercontig_1.50 80073 58.77 490 TRUE 18          35,1 0.05401235 1.057096
## 5 Supercontig_1.50 80074 58.75 482 TRUE 18         26,10 0.40123457 1.670103
## 6 Supercontig_1.50 80089 58.80 481 TRUE 18         25,11 0.42438272 1.737265

head(chrom@win.info)

##              CHROM window start  end length   A   C   G   T    N other genic
## 1 Supercontig_1.50      1     1 1000   1000 267 213 293 227    0     0     0
## 2 Supercontig_1.50      2  1001 2000   1000 283 206 309 202    0     0     0
## 3 Supercontig_1.50      3  2001 3000   1000 229 213 235 177  146     0     0
## 4 Supercontig_1.50      4  3001 4000   1000   0   0   0   0 1000     0     0
## 5 Supercontig_1.50      5  4001 5000   1000   0   0   0   0 1000     0     0
## 6 Supercontig_1.50      6  5001 6000   1000   0   0   0   0 1000     0     0
##   variants
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0

Genetic differentiation

data(vcfR_example)
pop <- as.factor(c("us", "eu", "us", "af", "eu", "us", "mx", "eu", "eu", "sa", "mx", "sa", "us", "sa", "Pmir", "us", "eu", "eu"))
myDiff <- genetic_diff(vcf, pops = pop, method = 'nei')
knitr::kable(head(myDiff[,1:15]))

CHROM	POS	Hs_af	Hs_eu	Hs_mx	Hs_Pmir	Hs_sa	Hs_us	Ht	n_af	n_eu	n_mx	n_Pmir	n_sa	n_us
Supercontig_1.50	2	0	0.0	0.000	0.5	0.000	0.00	0.0798611	2	4	4	2	4	8
Supercontig_1.50	246	NaN	0.0	0.375	NaN	0.000	0.50	0.3512397	0	4	4	0	6	8
Supercontig_1.50	549	NaN	0.0	NaN	NaN	NaN	0.50	0.4444444	0	2	0	0	0	4
Supercontig_1.50	668	NaN	0.5	0.000	NaN	0.000	0.50	0.5000000	0	4	2	0	2	8
Supercontig_1.50	765	0	0.0	0.000	0.0	0.000	0.00	0.1107266	2	12	4	2	4	10
Supercontig_1.50	780	0	0.0	0.000	0.0	0.375	0.18	0.1244444	2	8	4	2	4	10

knitr::kable(head(myDiff[,16:19]))

Gst	Htmax	Gstmax	Gprimest
0.4782609	0.7951389	0.9475983	0.5047085
NaN	0.8057851	NaN	NaN
NaN	0.6666667	NaN	NaN
NaN	0.8125000	NaN	NaN
1.0000000	0.7543253	1.0000000	1.0000000
0.1160714	0.8000000	0.8625000	0.1345756

knitr::kable(round(colMeans(myDiff[,c(3:9,16,19)], na.rm = TRUE), digits = 3))

	x
Hs_af	0.176
Hs_eu	0.188
Hs_mx	0.168
Hs_Pmir	0.052
Hs_sa	0.198
Hs_us	0.155
Ht	0.247
Gst	0.595
Gprimest	0.632

dpf <- melt(myDiff[,c(3:8,19)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE)
p <- ggplot(dpf, aes(x=variable, y=Depth)) + geom_violin(fill="#2ca25f", adjust = 1.2)
p <- p + xlab("")
p <- p + ylab("")
p <- p + theme_bw()
p

Exercises Part II

You actually have everything you need to make a Manhattan plot. Can you figure out how to plot G′ST (y-axis) by genomic position (POS)?

plot(getPOS(vcf), myDiff$Gprimest,  pch = 20, col = "#1E90FF44", xlab = "", ylab = "", ylim = c(0, 1), xaxt = "n")
axis(side = 1, at = seq(0, 1e5, by = 1e4), labels = seq(0, 100, by = 10))
title(xlab='Genomic position (Kbp)')
title(ylab = expression(italic("G'"["ST"])))

This Manhatttan plot shouldlook a bit unusual. Can you think of anything that may be wrong with this analysis?

Small sample size

Can you figure out how to zoom in on a particular region of a chromosome in chromoqc()?

chromoqc(chrom, dp.alpha = 66, xlim = c(2e05, 4e05))

Can you use the function queryMETA() to look for other data in your file that may be of interest?

queryMETA(vcf)

##  [1] "FILTER=ID=LowQual"                   
##  [2] "FORMAT=ID=AD"                        
##  [3] "FORMAT=ID=DP"                        
##  [4] "FORMAT=ID=GQ"                        
##  [5] "FORMAT=ID=GT"                        
##  [6] "FORMAT=ID=PL"                        
##  [7] "GATKCommandLine=ID=HaplotypeCaller"  
##  [8] "INFO=ID=AC"                          
##  [9] "INFO=ID=AF"                          
## [10] "INFO=ID=AN"                          
## [11] "INFO=ID=BaseQRankSum"                
## [12] "INFO=ID=ClippingRankSum"             
## [13] "INFO=ID=DP"                          
## [14] "INFO=ID=DS"                          
## [15] "INFO=ID=FS"                          
## [16] "INFO=ID=HaplotypeScore"              
## [17] "INFO=ID=InbreedingCoeff"             
## [18] "INFO=ID=MLEAC"                       
## [19] "INFO=ID=MLEAF"                       
## [20] "INFO=ID=MQ"                          
## [21] "INFO=ID=MQ0"                         
## [22] "INFO=ID=MQRankSum"                   
## [23] "INFO=ID=QD"                          
## [24] "INFO=ID=ReadPosRankSum"              
## [25] "INFO=ID=SOR"                         
## [26] "1 contig=<IDs omitted from queryMETA"

Lab10 - Population genomics based on high throughput sequencing (HTS)

Berent Aldikacti"

12/04/2020

Exercise 1

Part II - Analysis of Genome Data

Set up population data

Distance matrices

Creating chromR object

Genetic differentiation

Exercises Part II