forked from rajanil/fastStructure
-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse_str.pyx
44 lines (35 loc) · 1.2 KB
/
parse_str.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import numpy as np
cimport numpy as np
def load(file):
    """Read a STRUCTURE-format genotype file into a fastStructure matrix.

    The file ``file + '.str'`` is parsed as whitespace-separated text. The
    first six columns of every row are discarded; the remaining columns are
    allele codes, one column per locus. Every individual occupies two
    consecutive rows (one row per allele copy).

    Parameters
    ----------
    file : str
        Path of the input file *without* its ``.str`` extension.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, L)`` and dtype ``uint8``, where ``N`` is the
        number of row pairs and ``L`` the number of loci. Entries are
        ``0`` (two copies of the "major" allele), ``1`` (one copy of each),
        ``2`` (two copies of the "minor" allele) or ``3`` (missing).
        An input without any data rows yields an empty ``(0, 0)`` array.

    Notes
    -----
    * Missing alleles are assumed to be coded as ``-9``.
    * "Major" and "minor" are the first and second allele codes in the
      sorted order returned by ``np.unique``; they are NOT chosen by
      allele frequency.
    * A genotype containing a missing allele, or any allele beyond the
      first two observed at the locus, is coded as ``3``.
    * If the number of data rows is odd, the unpaired last row is ignored
      when genotypes are formed.
    """
    # Read in data from file; `with` guarantees the handle is closed even if
    # parsing fails.  Blank lines are skipped so that a trailing newline does
    # not produce a ragged (and therefore unusable) allele table.
    with open(file + '.str', 'r') as handle:
        rows = [line.split()[6:] for line in handle if line.strip()]

    if not rows:
        # Nothing to convert; avoid indexing the shape of an empty array.
        return np.zeros((0, 0), dtype=np.uint8)

    loci = np.array(rows)
    # Floor division: two rows per individual.  Plain `/` yields a float
    # under Python 3 semantics, which breaks np.zeros() and range().
    N = loci.shape[0] // 2
    L = loci.shape[1]
    genotype = np.zeros((N, L), dtype=np.uint8)

    for l, locus in enumerate(loci.T):
        # assumes that missing alleles are coded as -9
        alleles = [allele for allele in np.unique(locus) if allele != '-9']

        if not alleles:
            # Every entry at this locus is missing; missing values are
            # coded by `3`.
            genotype[:, l] = 3
            continue

        # selecting major and minor alleles (by sorted order of the codes)
        major = alleles[0]

        # Maps the STRUCTURE representation of genotypes to an unsigned
        # integer.  Keys are (allele, allele) tuples rather than concatenated
        # strings so that multi-character allele codes cannot collide.
        genomap = {(major, major): 0}
        if len(alleles) > 1:
            minor = alleles[1]
            genomap[(major, minor)] = 1
            genomap[(minor, major)] = 1
            genomap[(minor, minor)] = 2

        # convert genotypes to fastStructure representation;
        # anything not in the map (missing / extra alleles) is coded by `3`.
        for n in range(N):
            pair = (locus[2 * n], locus[2 * n + 1])
            genotype[n, l] = genomap.get(pair, 3)

    return genotype