1
|
|
2
|
import sys
|
3
|
import re
|
4
|
import bz2
|
5
|
from os import listdir
|
6
|
import numpy as np
|
7
|
import argparse
|
8
|
np.seterr(all='ignore')
|
9
|
|
10
|
def reynolds_multi_allelic(Mf,nloc,decompose=False):
|
11
|
'''
|
12
|
Compute Reynolds distances from multiallelic loci
|
13
|
|
14
|
Parameters:
|
15
|
-----------
|
16
|
|
17
|
Mf : numpy array of allele frequencies in populations
|
18
|
rows are populations
|
19
|
columns are allele frequencies, by loci
|
20
|
nloc : number of loci
|
21
|
|
22
|
Returns:
|
23
|
--------
|
24
|
|
25
|
dist : numpy array (n x n) of reynolds genetic distances
|
26
|
numerator : numerator of the Reynolds Distance
|
27
|
denominator : denominator of the Reynolds Distance
|
28
|
'''
|
29
|
npop=Mf.shape[0]
|
30
|
A=np.dot(Mf,Mf.T)
|
31
|
dist=np.zeros((npop,npop))
|
32
|
if decompose:
|
33
|
numerator=np.zeros((npop,npop))
|
34
|
denominator=np.zeros((npop,npop))
|
35
|
else:
|
36
|
numerator=None
|
37
|
denominator=None
|
38
|
for i in range(npop-1):
|
39
|
for j in range(i+1,npop):
|
40
|
dist[i,j]=0.5*(A[i,i]+A[j,j]-2*A[i,j])/(nloc-A[i,j])
|
41
|
if decompose:
|
42
|
numerator[i,j]=A[i,i]+A[j,j]-2*A[i,j]
|
43
|
denominator[i,j]=2*(nloc-A[i,j])
|
44
|
dist=dist+dist.T
|
45
|
numerator=numerator+numerator.T
|
46
|
denominator=denominator+denominator.T
|
47
|
return dist,numerator,denominator
|
48
|
|
49
|
def reynolds(Mf):
|
50
|
'''
|
51
|
Compute Reynolds distances from SNP allele frequencies
|
52
|
|
53
|
Parameters:
|
54
|
----------------
|
55
|
|
56
|
Mf : numpy array of allele frequencies in populations
|
57
|
rows are populations (n), columns are markers (p).
|
58
|
|
59
|
Returns:
|
60
|
-----------
|
61
|
|
62
|
dist : numpy array (n x n) of reynolds genetic distances
|
63
|
'''
|
64
|
npop,nloc=Mf.shape
|
65
|
dist=np.zeros((npop,npop))
|
66
|
A=np.dot(Mf,Mf.T)+np.dot((1-Mf),(1-Mf).T)
|
67
|
for i in range(npop-1):
|
68
|
for j in range(i+1,npop):
|
69
|
dist[i,j]=0.5*(A[i,i]+A[j,j]-2*A[i,j])/(nloc-A[i,j])
|
70
|
dist=dist+dist.T
|
71
|
return dist
|
72
|
|
73
|
def get_pop_names(fname):
|
74
|
thepop=None
|
75
|
popnames=[]
|
76
|
with bz2.BZ2File(fname,'r') as f:
|
77
|
for i,ligne in enumerate(f):
|
78
|
if i==0:
|
79
|
continue
|
80
|
buf=ligne.split()
|
81
|
if thepop==None:
|
82
|
thepop=buf[0]
|
83
|
popnames.append(buf[0])
|
84
|
continue
|
85
|
if buf[0]==popnames[-1]:
|
86
|
continue
|
87
|
else:
|
88
|
popnames.append(buf[0])
|
89
|
return popnames
|
90
|
|
91
|
def get_file_list(prefix=None):
|
92
|
|
93
|
if prefix is None:
|
94
|
mysearch=re.compile("\w*\.kfrq\.fit_[0-9]*\.bz2$")
|
95
|
else:
|
96
|
mysearch=re.compile(prefix+"\.kfrq\.fit_[0-9]*\.bz2$")
|
97
|
myfic=[x for x in listdir('.') if mysearch.match(x)]
|
98
|
return myfic
|
99
|
|
100
|
def get_opt():
|
101
|
parser=argparse.ArgumentParser()
|
102
|
parser.add_argument('-p',dest='prefix',help='Work on files with prefix PREFIX',metavar='PREFIX',default=None)
|
103
|
parser.add_argument('-l',dest='left',help='Leftmost coordinate to consider',default=0,type=float)
|
104
|
parser.add_argument('-r',dest='right',help='rightmost coordinate to consider',default=np.inf,type=float)
|
105
|
parser.add_argument('-o',dest='oprefix',help='prefix for output files',default='')
|
106
|
return parser
|
107
|
def main():
|
108
|
myparser=get_opt()
|
109
|
myopts=myparser.parse_args()
|
110
|
myfic=get_file_list(myopts.prefix)
|
111
|
if myopts.prefix is None:
|
112
|
myopts.prefix=myfic[0].split('.')[0]
|
113
|
if myopts.oprefix!='':
|
114
|
myopts.oprefix=myopts.oprefix+'_'
|
115
|
|
116
|
print "Calculating Reynolds from SNP frequencies"
|
117
|
with open(myopts.prefix+'.frq') as f_frq:
|
118
|
head=f_frq.readline()
|
119
|
my_pops=head.split()[5:]
|
120
|
a=np.genfromtxt(myopts.prefix+'.frq',skip_header=1,usecols=tuple([2]+range(5,5+len(my_pops))))
|
121
|
condl=a[:,0]>myopts.left
|
122
|
condr=a[:,0]<myopts.right
|
123
|
a=a[condl&condr,1:]
|
124
|
sDR=reynolds(a.T)
|
125
|
fout=open(myopts.oprefix+'hapflk_snp_reynolds.txt','w')
|
126
|
print>>fout,' '.join(my_pops)
|
127
|
for i in range(len(my_pops)):
|
128
|
print>>fout,my_pops[i],
|
129
|
print>>fout,' '.join([str(x) for x in sDR[i,]])
|
130
|
|
131
|
my_pops=get_pop_names(myfic[0])
|
132
|
DR_em={}
|
133
|
get_EM=re.compile('_\d+.b')
|
134
|
print 'Reading haplotype cluster frequencies from',len(myfic),'files'
|
135
|
|
136
|
locnames=np.genfromtxt(myfic[0],skip_header=1,usecols=(1),dtype=str)
|
137
|
for fname in myfic:
|
138
|
sys.stdout.write(fname+'\r')
|
139
|
sys.stdout.flush()
|
140
|
myEM=int(get_EM.findall(fname)[0][1:-2])
|
141
|
a=np.genfromtxt(fname,skip_header=1,usecols=(2,3,4))
|
142
|
|
143
|
condl=a[:,0]>myopts.left
|
144
|
condr=a[:,0]<myopts.right
|
145
|
a=a[condl&condr,]
|
146
|
nloc=len(set(locnames[condl&condr,]))
|
147
|
nclus=len(set(a[:,1]))
|
148
|
npop=a.shape[0]/(nclus*nloc)
|
149
|
start_pop=range(0,a.shape[0],nclus*nloc)
|
150
|
print a.shape[0],nloc,nclus,npop,start_pop
|
151
|
mf=np.vstack([a[x:(x+(nclus*nloc)),2] for x in start_pop])
|
152
|
DR,num,den=reynolds_multi_allelic(mf,nloc,decompose=True)
|
153
|
try:
|
154
|
myres=DR_em[myEM]
|
155
|
except KeyError:
|
156
|
myres={'dist':[],'num':[],'den':[]}
|
157
|
DR_em[myEM]=myres
|
158
|
myres['dist'].append(DR)
|
159
|
myres['num'].append(num)
|
160
|
myres['den'].append(den)
|
161
|
|
162
|
D2=[]
|
163
|
for em,val in DR_em.items():
|
164
|
|
165
|
D2.append(np.sum(val['num'],axis=0)/np.sum(val['den'],axis=0))
|
166
|
|
167
|
DRm=np.mean(D2,axis=0)
|
168
|
np.fill_diagonal(DRm,0)
|
169
|
DRs=np.std(D2,axis=0)
|
170
|
np.fill_diagonal(DRs,0)
|
171
|
fout=open(myopts.oprefix+'hapflk_hap_reynolds.txt','w')
|
172
|
print>>fout,' '.join(my_pops)
|
173
|
for i in range(len(my_pops)):
|
174
|
print>>fout,my_pops[i],
|
175
|
print>>fout,' '.join([str(x) for x in DRm[i,]])
|
176
|
|
177
|
if __name__=='__main__':
|
178
|
main()
|