local_reynolds.py

Python (2.7) script to compute local reynolds distances - Bertrand Servin, 10/02/2014 17:51

Télécharger (5,673 ko)

 
1
#!/usr/bin/env python
2
import sys
3
import re
4
import bz2
5
from os import listdir
6
import numpy as np
7
import argparse
8
np.seterr(all='ignore')
9

    
10
def reynolds_multi_allelic(Mf,nloc,decompose=False):
11
    '''
12
    Compute Reynolds distances from multiallelic loci
13

14
    Parameters:
15
    -----------
16
    
17
    Mf : numpy array of allele frequencies in populations
18
         rows are populations
19
         columns are allele frequencies, by loci
20
    nloc : number of loci
21

22
    Returns:
23
    --------
24

25
    dist : numpy array (n x n) of reynolds genetic distances
26
    numerator    : numerator of the Reynolds Distance
27
    denominator  : denominator of the Reynolds Distance
28
    '''
29
    npop=Mf.shape[0]
30
    A=np.dot(Mf,Mf.T)
31
    dist=np.zeros((npop,npop))
32
    if decompose:
33
        numerator=np.zeros((npop,npop))
34
        denominator=np.zeros((npop,npop))
35
    else:
36
        numerator=None
37
        denominator=None
38
    for i in range(npop-1):
39
        for j in range(i+1,npop):
40
            dist[i,j]=0.5*(A[i,i]+A[j,j]-2*A[i,j])/(nloc-A[i,j])
41
            if decompose:
42
                numerator[i,j]=A[i,i]+A[j,j]-2*A[i,j]
43
                denominator[i,j]=2*(nloc-A[i,j])
44
    dist=dist+dist.T
45
    numerator=numerator+numerator.T
46
    denominator=denominator+denominator.T
47
    return dist,numerator,denominator
48
   
49
def reynolds(Mf):
50
    '''
51
    Compute Reynolds distances from SNP allele frequencies
52

53
    Parameters:
54
    ----------------
55

56
    Mf : numpy array of allele frequencies in populations
57
          rows are populations (n), columns are markers (p).
58

59
    Returns:
60
    -----------
61

62
    dist : numpy array (n x n) of reynolds genetic distances
63
    '''
64
    npop,nloc=Mf.shape
65
    dist=np.zeros((npop,npop))
66
    A=np.dot(Mf,Mf.T)+np.dot((1-Mf),(1-Mf).T)
67
    for i in range(npop-1):
68
        for j in range(i+1,npop):
69
            dist[i,j]=0.5*(A[i,i]+A[j,j]-2*A[i,j])/(nloc-A[i,j])
70
    dist=dist+dist.T
71
    return dist
72

    
73
def get_pop_names(fname):
74
    thepop=None
75
    popnames=[]
76
    with bz2.BZ2File(fname,'r') as f:
77
        for i,ligne  in enumerate(f):
78
            if i==0:
79
                continue
80
            buf=ligne.split()
81
            if thepop==None:
82
                thepop=buf[0]
83
                popnames.append(buf[0])
84
                continue
85
            if buf[0]==popnames[-1]:
86
                continue
87
            else:
88
                popnames.append(buf[0])
89
    return popnames
90

    
91
def get_file_list(prefix=None):
92
    ## find all files with cluster frequencies in current directory
93
    if prefix is None:
94
        mysearch=re.compile("\w*\.kfrq\.fit_[0-9]*\.bz2$")
95
    else:
96
        mysearch=re.compile(prefix+"\.kfrq\.fit_[0-9]*\.bz2$")
97
    myfic=[x for x in listdir('.') if mysearch.match(x)]
98
    return myfic
99

    
100
def get_opt():
101
    parser=argparse.ArgumentParser()
102
    parser.add_argument('-p',dest='prefix',help='Work on files with prefix PREFIX',metavar='PREFIX',default=None)
103
    parser.add_argument('-l',dest='left',help='Leftmost coordinate to consider',default=0,type=float)
104
    parser.add_argument('-r',dest='right',help='rightmost coordinate to consider',default=np.inf,type=float)
105
    parser.add_argument('-o',dest='oprefix',help='prefix for output files',default='')
106
    return parser
107
def main():
108
    myparser=get_opt()
109
    myopts=myparser.parse_args()
110
    myfic=get_file_list(myopts.prefix)
111
    if myopts.prefix is None:
112
        myopts.prefix=myfic[0].split('.')[0]
113
    if myopts.oprefix!='':
114
        myopts.oprefix=myopts.oprefix+'_'
115
    ## SNP Reynolds
116
    print "Calculating Reynolds from SNP frequencies"
117
    with open(myopts.prefix+'.frq') as f_frq:
118
        head=f_frq.readline()
119
        my_pops=head.split()[5:]
120
    a=np.genfromtxt(myopts.prefix+'.frq',skip_header=1,usecols=tuple([2]+range(5,5+len(my_pops))))
121
    condl=a[:,0]>myopts.left
122
    condr=a[:,0]<myopts.right
123
    a=a[condl&condr,1:]
124
    sDR=reynolds(a.T)
125
    fout=open(myopts.oprefix+'hapflk_snp_reynolds.txt','w')
126
    print>>fout,' '.join(my_pops)
127
    for i in range(len(my_pops)):
128
        print>>fout,my_pops[i],
129
        print>>fout,' '.join([str(x) for x in sDR[i,]])
130
    ### Haplotype Reynolds
131
    my_pops=get_pop_names(myfic[0])
132
    DR_em={}
133
    get_EM=re.compile('_\d+.b')
134
    print 'Reading haplotype cluster frequencies from',len(myfic),'files'
135
    ## get names of loci
136
    locnames=np.genfromtxt(myfic[0],skip_header=1,usecols=(1),dtype=str)
137
    for fname in myfic:
138
        sys.stdout.write(fname+'\r')
139
        sys.stdout.flush()
140
        myEM=int(get_EM.findall(fname)[0][1:-2])
141
        a=np.genfromtxt(fname,skip_header=1,usecols=(2,3,4))
142
        ## subset a by position
143
        condl=a[:,0]>myopts.left
144
        condr=a[:,0]<myopts.right
145
        a=a[condl&condr,]
146
        nloc=len(set(locnames[condl&condr,]))
147
        nclus=len(set(a[:,1]))
148
        npop=a.shape[0]/(nclus*nloc)
149
        start_pop=range(0,a.shape[0],nclus*nloc)
150
        print a.shape[0],nloc,nclus,npop,start_pop
151
        mf=np.vstack([a[x:(x+(nclus*nloc)),2] for x in start_pop])
152
        DR,num,den=reynolds_multi_allelic(mf,nloc,decompose=True)
153
        try:
154
            myres=DR_em[myEM]
155
        except KeyError:
156
            myres={'dist':[],'num':[],'den':[]}
157
            DR_em[myEM]=myres
158
        myres['dist'].append(DR)
159
        myres['num'].append(num)
160
        myres['den'].append(den)
161
    ##D1=[]
162
    D2=[]
163
    for em,val in DR_em.items():
164
        ##D1.append(np.mean(val['dist'],axis=0))
165
        D2.append(np.sum(val['num'],axis=0)/np.sum(val['den'],axis=0))
166
    ## by chromosome averaged over EMs
167
    DRm=np.mean(D2,axis=0)
168
    np.fill_diagonal(DRm,0)
169
    DRs=np.std(D2,axis=0)
170
    np.fill_diagonal(DRs,0)
171
    fout=open(myopts.oprefix+'hapflk_hap_reynolds.txt','w')
172
    print>>fout,' '.join(my_pops)
173
    for i in range(len(my_pops)):
174
        print>>fout,my_pops[i],
175
        print>>fout,' '.join([str(x) for x in DRm[i,]])
176

    
177
if __name__=='__main__':
178
    main()