Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# input: the work folder which contains a PED subfolder where all family PED files were copied
# output: only for trio families
# FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt
#
# Author: MH
# last modified: SEPT 25, 2019
import sys
import os
def go(work_dir):
out_fam_file = work_dir + '/FAM_IDs.txt'
out_pro_file = work_dir + '/PRO_IDs.txt'
out_f_p_file = work_dir + '/FAM_PRO.txt'
out_fam_han = open(out_fam_file,'w')
out_pro_han = open(out_pro_file,'w')
out_f_p_han = open(out_f_p_file,'w')
cntr_fam = 0
cntr_pro = 0
cntr_f_p = 0
# go over the PED folder in the working dir and process each PED file
ped_dir = work_dir + '/PED'
print ""
print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)
for file in os.listdir(ped_dir): # per each PED file
if file.endswith(".ped"):
# print(os.path.join(ped_dir, file))
print " %s" % (file)
in_file = os.path.join(ped_dir, file)
CHILD_ID = 0
FAM_ID = 0
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID
y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split
# print "data[0]=%s,data[1]=%s,x_plate=%s,x_fam=%s,y_indi=%s,y_fam=%s" % (data[0],data[1],x_plate,x_fam,y_indi,y_fam)
if FAM_ID == 0:
FAM_ID = x_fam
elif FAM_ID != x_fam:
print "ERROR: more than one FAMILY_ID in %s" % (file)
raise SystemExit
if data[2] != '0' and data[3] != '0': # this is the child in the trio
if CHILD_ID == 0:
CHILD_ID = y_indi
else: # seen another child
print "WARNING: already have seen a child (possibly a quad) in %s" % (file)
CHILD_ID = 0
break
CHILD_SEX = int(data[4])
if (CHILD_SEX == 1) or (CHILD_SEX == 2):
pass
else:
print "ERROR: proband sex unknown"
print line
raise SystemExit
if int(data[5]) != 2:
print "ERROR: child in a trio not affected"
print line
raise SystemExit
if FAM_ID == 0:
print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
raise SystemExit
if CHILD_ID == 0:
print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file)
else:
out_fam_han.write('%s\n' % (FAM_ID))
cntr_fam += 1
out_pro_han.write('%s\n' % (CHILD_ID))
cntr_pro += 1
out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID))
cntr_f_p += 1
out_fam_han.close()
out_pro_han.close()
out_f_p_han.close()
print ""
print "Trio Families:"
print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
print ""
if __name__ == '__main__':
if len(sys.argv) == 2:
go(sys.argv[1])
else:
print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_extract_trio_FAM_PRO_ID.py /scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}"