Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# input: the work folder which contains a PED subfolder where all family PED files were copied
# output: only for singletons
# solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt
#
# Author: MH
# last modified: MARCH 04, 2022
import sys
import os
def go(work_dir):
out_fam_file = work_dir + '/solo_FAM_IDs.txt'
out_pro_file = work_dir + '/solo_PRO_IDs.txt'
out_f_p_file = work_dir + '/solo_FAM_PRO.txt'
out_fam_han = open(out_fam_file,'w')
out_pro_han = open(out_pro_file,'w')
out_f_p_han = open(out_f_p_file,'w')
cntr_fam = 0
cntr_pro = 0
cntr_f_p = 0
# go over the PED folder in the working dir and process each PED file
ped_dir = work_dir + '/PED'
print ""
print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)
for file in os.listdir(ped_dir): # per each PED file
if file.endswith(".ped"):
print " %s" % (file)
in_file = os.path.join(ped_dir, file)
# check how many lines in the PED file - if more than one cannot be a singleton, ignore
num_lines = 0
in_han = open(in_file,'r')
for line in in_han:
num_lines += 1
in_han.close()
if num_lines > 1:
continue
if num_lines == 0:
print "ERROR: empty PED file %s" % (file)
raise SystemExit
# if here, the PED file contains exactly one line
# check if all fine: parents IDs = 0, kid with known sex and is affected
CHILD_ID = 0
FAM_ID = 0
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID
y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split
if x_fam != y_fam:
print "ERROR: FAMILY_ID mismatch in %s" % (file)
print line
raise SystemExit
FAM_ID = x_fam
CHILD_ID = y_indi
# check both parent IDs == 0
if (data[2] != '0') or (data[3] != '0'):
print "ERROR: found parent ID for a singleton child in %s" % (file)
print line
raise SystemExit
# make sure the sex of the child is known
CHILD_SEX = int(data[4])
if (CHILD_SEX == 1) or (CHILD_SEX == 2):
pass
else:
print "ERROR: proband sex unknown in %s" % (file)
print line
raise SystemExit
# check kid is affected
if int(data[5]) != 2:
print "ERROR: singleton child not affected"
print line
raise SystemExit
if FAM_ID == 0:
print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
raise SystemExit
if CHILD_ID == 0:
print "ERROR: Cannot find CHILD_ID in %s" % (file)
raise SystemExit
else:
out_fam_han.write('%s\n' % (FAM_ID))
cntr_fam += 1
out_pro_han.write('%s\n' % (CHILD_ID))
cntr_pro += 1
out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID))
cntr_f_p += 1
out_fam_han.close()
out_pro_han.close()
out_f_p_han.close()
print ""
print "Singleton Families:"
print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
print ""
if __name__ == '__main__':
if len(sys.argv) == 2:
go(sys.argv[1])
else:
print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>"
raise SystemExit