#title: "R script for computing local scores" ### **Description**: ## Detecting genomic outliers in a sequence of correlated scores, from Fariello et al 2017. ### Import libraries. library(ggplot2) library(data.table) library(RColorBrewer) library(tidyverse) #Import functions. source('scorelocalfunctions.R') ### Import toy data. #Data should be organized as one locus per line and three columns #chromosome number | position | score #The first line of the table should include the names: chr pos pval #The p-values of the used statistic should follow a uniform distribution. #If this is not the case, then new coefficients of the Gumbel Law should be computed. mydata = fread("quailfreqs.txt", head=T, select=c('chr', 'pos','pval')) setkey(mydata, chr) mydata$pval[mydata$pval==0]=1e-16 Nchr=length(mydata[,unique(chr)]) ### Computation of absoulte position in the genome. #This is useful for doing genomewide plots. chrInfo=mydata[,.(L=.N,cor=autocor(pval)),chr] setkey(chrInfo,chr) data.table(chr=mydata[,unique(chr),], S=cumsum(c(0,chrInfo$L[-Nchr]))) %>% setkey(.,chr) %>% mydata[.,posT:=pos+S] ### Choice of $\xi$ # To choose the apropiate threshold ($\xi = 1, \dots, 4$) we look at the distribution of −log10(p − value). #Then the score function will be $−log10(p − value) − \xi$. qplot(pval, data=mydata, geom='histogram', binwidth=0.1, main='P-values histogram') qplot(-log10(pval), data=mydata, geom='histogram', binwidth=0.1, main='-log10(p-values) histogram') mean(-log10(mydata$pval)) # In this case we choose $\xi$ = 1, as there are not values above 2. # Remember that we have to choose some value between mean(-log10(mydata$pval)) and max(-log10(mydata$pval)). ### This plot can take some time, if you are sure what you are doing, you can skip this plot ggplot(data=mydata[chr==1], aes(posT,-log10(pval), colour=as.factor(chr))) + geom_point() + geom_abline(intercept=c(1,2) , slope=0) ## Computation of the score and the Lindley Process # We should verify that the mean of the score is negative. #It should be according to the chosen value of xi. #Set the chosen xi!! xi=1 mydata[,score:= -log10(pval)-xi] # The score mean must be negative mean(mydata$score) mydata[,lindley:=lindley(score),chr] # Compute significance threshold for each chromosome ## Uniform distribution of p-values #If the distribution of the p-values is uniform and if $\xi=$ is 1, 2, 3 or 4, #it is possible to compute the significancy thresholds for each chromosome directely, #given the length and the autocorrelation of the chromosome . chrInfo[,th:=thresUnif(L, cor, 1),chr] mydata=mydata[chrInfo] sigZones=mydata[chrInfo,sig_sl(lindley, pos, unique(th)),chr] pdf('ScoreLocalAll.pdf') par(mfrow=c(4,1), mar=c(5,2,1,1)) for (g in chrInfo$chr){ plot(mydata[chr==g,pos],mydata[chr==g,lindley],xlab=paste("position (chr ",g,")",sep=''),ylab="Score Local", type='l', ylim=c(0,max(mydata[chr==g,max(lindley)],chrInfo[chr==g, th]))) abline(h=chrInfo[chr==g, th], col='grey') abline(v=sigZones[chr==g, beg], col='red', lty=3) abline(v=sigZones[chr==g, end], col='red', lty=3) } dev.off() ### In this example there are too many significant regions because the distribution #of the p-values is not uniform due to experimental designs and a re-sampling strategy #as follows would be suited ############################################################################# ######## Run only if the distribution of your p-values is not uniform ###### ############################################################################# ## If the distribution of the p-values is not uniform, then a re-sampling strategy should be used. coefsG=coefsGumb(mydata, Ls=seq(30000,60000,10000), nSeq=5000) #chrInfo[,thG05:=threshold(L, cor, coefsG$aCoef, coefsG$bCoef,0.05),] #chrInfo[,thG01:=threshold(L, cor, coefsG$aCoef, coefsG$bCoef,0.01),] chrInfo[,':=' (thG05=threshold(L, cor, coefsG$aCoef, coefsG$bCoef,0.05), thG01=threshold(L, cor, coefsG$aCoef, coefsG$bCoef,0.01)),] mydata=mydata[chrInfo] sigZones05=mydata[,sig_sl(lindley, pos, unique(thG05)),chr] sigZones01=mydata[,sig_sl(lindley, pos, unique(thG01)),chr] pdf('ScoreLocalAll.pdf') par(mfrow=c(4,1), mar=c(5,2,1,1)) for (g in chrInfo$chr)){ plot(mydata[chr==g,pos],mydata[chr==g,lindley],xlab=paste("position chr",g,sep=' '),ylab="Score Local", type='l', ylim=c(0,max(mydata[chr==g,max(lindley)],mydata[chr==g, unique(thG01)]))) abline(h=mydata[chr==g, unique(thG05)], col='grey') abline(h=mydata[chr==g, unique(thG01)], lty=2, col='grey') abline(v=sigZones01[chr==g, beg], col='grey', lty=3) abline(v=sigZones01[chr==g, end], col='grey', lty=3) } dev.off()