scaling_chi2_hapflk.py

Python script to calculate hapFLK p-values - Bertrand Servin, 02/12/2015 09:53
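
A hypothetical invocation (the file names and the values of K and N are
examples; they should match the hapFLK run that produced the input files):

    python scaling_chi2_hapflk.py chr1.hapflk chr2.hapflk 15 4 --optimize

Each input file must contain at least the columns rs, chr, pos and hapflk. The
script writes the fitted regression parameters to scaling_hapflk.log and, for
each input file, a <file>_sc companion file with columns rs, chr, pos, hapflk,
hapflk_scaled and pvalue.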

#!/usr/bin/env python2.7

import argparse
import numpy as np
from scipy.stats import chi2
import statsmodels.api as sm
from scipy.optimize import minimize as optim

def set_options():
    myparser=argparse.ArgumentParser(description='Estimate hapflk distribution and/or calculate hapFLK p-values')
    myparser.add_argument('infiles',metavar='file',type=str,help='An input file',nargs='+')
    myparser.add_argument('K',metavar='K',type=float,help='Number of clusters')
    myparser.add_argument('N',metavar='N',type=float,help='Number of populations')
    myparser.add_argument('--pars',nargs=2,metavar=('mu','beta'),type=float,help='regression parameters',default=None)
    myparser.add_argument('--optimize',help='Optimize chi-squared df',action="store_true",default=False)
    return myparser

def estimate_hapflk_qreg(sample,K,Npop,probs=np.linspace(0.05,0.95,num=19)):
    ## observed quantiles
    oq=np.percentile(sample,q=list(100*probs))
    ## theoretical quantiles
    mydf=(K-1)*(Npop-1)
    tq=chi2.ppf(probs,df=mydf)
    ## robust regression
    reg=sm.RLM(tq,np.vstack([np.ones(len(oq)),oq]).T)
    res=reg.fit()
    mu,beta=res.params
    return {'mu':mu,'beta':beta,'df':mydf}
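
## Note on the approach above: genome-wide, hapflk is treated as roughly an
## affine transform of a chi2 variable with (K-1)*(Npop-1) degrees of freedom.
## Robustly regressing theoretical quantiles on observed quantiles (sm.RLM)
## yields mu and beta such that mu + beta*hapflk can be referred to that chi2
## distribution.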

    
def qfunc(ndf,oq,probs):
    ## objective for the df search: sum of squared RLM residuals when
    ## chi2(ndf) quantiles are regressed on the observed quantiles
    tq=chi2.ppf(probs,df=ndf)
    reg=sm.RLM(tq,np.vstack([np.ones(len(oq)),oq]).T)
    res=reg.fit()
    return np.sum(np.power(res.resid,2))

def optimize_hapflk_qreg(sample,K,Npop,probs=np.linspace(0.05,0.95,num=19)):
    oq=np.percentile(sample,q=list(100*probs))
    ## fit the chi-squared df numerically by minimizing qfunc
    res=optim(qfunc,K,args=(oq,probs))
    mydf=res.x[0]
    tq=chi2.ppf(probs,df=mydf)
    ## robust regression
    reg=sm.RLM(tq,np.vstack([np.ones(len(oq)),oq]).T)
    res=reg.fit()
    mu,beta=res.params
    return {'mu':mu,'beta':beta,'df':mydf}
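
## Note: the df search above starts from K and is unconstrained, so the fitted
## df can differ from the nominal (K-1)*(Npop-1).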
    
def scale_hapflk_sample(sample,params):
    ## affine rescaling, truncated at 0
    scaled_sample=params['mu']+params['beta']*sample
    if scaled_sample < 0:
        scaled_sample=0
    ## p-value from the chi-squared survival function
    pvalue=chi2.sf(scaled_sample,df=params['df'])
    return scaled_sample,pvalue
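
## Example (numbers hypothetical): with params={'mu':0.2,'beta':0.9,'df':28.0},
## scale_hapflk_sample(40.0,params) returns (36.2, chi2.sf(36.2,df=28.0)).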

    
def main():
    parser=set_options()
    args=parser.parse_args()
    flog=open('scaling_hapflk.log','w')
    datasets=[]
    filenames=[]
    for fic in args.infiles:
        print fic
        dset=np.genfromtxt(fic,dtype=None,names=True)
        datasets.append(dset)
        filenames.append(fic)
    if args.pars is None:
        ## estimate scaling parameters from the pooled hapflk values
        hflk_sample=np.concatenate([x['hapflk'] for x in datasets])
        if args.optimize:
            reg_par=optimize_hapflk_qreg(hflk_sample,args.K,args.N)
        else:
            reg_par=estimate_hapflk_qreg(hflk_sample,args.K,args.N)
        print reg_par
    else:
        ## use user-supplied mu and beta with the default df
        reg_par={'mu':args.pars[0],'beta':args.pars[1],'df':(args.K-1)*(args.N-1)}

    print >>flog,"regression parameters"
    for k,v in reg_par.items():
        print >>flog,k,v

    ## write one <infile>_sc file per input, with scaled statistics and p-values
    for i,d in enumerate(datasets):
        with open(filenames[i]+'_sc','w') as fout:
            print >>fout,'rs','chr','pos','hapflk','hapflk_scaled','pvalue'
            for x in d:
                xs=scale_hapflk_sample(x['hapflk'],reg_par)
                print >>fout,x['rs'],x['chr'],x['pos'],x['hapflk'],xs[0],xs[1]

if __name__=='__main__':
    main()