Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#
# Author: MH
# last modified: SEPT 25, 2020
import sys
import os
import fnmatch
from collections import defaultdict
V1_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the V1 decipher upload file in the form internal_id:chr:pos:ref:alt
V2_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the V2 decipher upload file in the form internal_id:chr:pos:ref:alt


def _find_dec_flt_files(top_dir):
    """Return the paths of every *_DEC_FLT.csv file found anywhere below top_dir."""
    found = []
    for root, _dirnames, filenames in os.walk(top_dir):
        for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
            found.append(os.path.join(root, filename))
    return found


def _read_variants(csv_path):
    """Return the variants of one file as 'internal_id:chr:pos:ref:alt' strings, in file order."""
    variants = []
    # 'with' guarantees the handle is closed, even if a malformed line raises
    with open(csv_path, 'r') as in_han:
        for line in in_han:
            if line.startswith("Internal reference number or ID"):
                continue    # header line
            if not line.strip():
                continue    # blank line (e.g. trailing newline at the end of the file)
            data = [z.strip() for z in line.strip().split(',')]
            # columns 0, 1, 2, 4, 5 = internal id, chromosome, position, ref, alt
            # (column 3 is not used for the comparison)
            variants.append('%s:%s:%s:%s:%s' % (data[0], data[1], data[2], data[4], data[5]))
    return variants


def _load_variants(top_dir, store):
    """Fill store with key -> variant list for every *_DEC_FLT.csv file below top_dir."""
    files = _find_dec_flt_files(top_dir)
    print("Found %s *_DEC_FLT.csv files in %s" % (len(files), top_dir))
    for path in files:
        # take the file name from the path itself rather than from a fixed
        # position in it, so the script works at any directory depth
        fname = os.path.basename(path)
        fname_parsed = [y.strip() for y in fname.strip().split('_')]
        indi_id = fname_parsed[0]
        fam_id = fname_parsed[1]
        key = "%s_%s" % (indi_id, fam_id)
        store[key] = _read_variants(path)


def go(v1_dir, v2_dir):
    """Compare, per individual, the variants found in two result folders.

    Every *_DEC_FLT.csv file below v1_dir and v2_dir is read; the file name is
    expected to start with <indi_id>_<fam_id>_, which forms the individual key.
    The variants are stored in the module-level V1_VARIANTS and V2_VARIANTS,
    and a report is printed to stdout listing, for each individual, the
    variants found in both versions, only in V1 and only in V2.

    Parameters:
        v1_dir: top folder of the first version of the results
        v2_dir: top folder of the second version of the results

    Raises:
        SystemExit: with exit status 1, after printing the offending key, if
            an individual is present in one folder but not in the other.
    """
    _load_variants(v1_dir, V1_VARIANTS)

    # now process V2
    _load_variants(v2_dir, V2_VARIANTS)

    # check that in both folders we have the same individuals (i.e. keys)
    for v1_key in V1_VARIANTS:
        if v1_key not in V2_VARIANTS:
            print("v1_key = %s not found in v2_keys" % (v1_key))
            raise SystemExit(1)

    for v2_key in V2_VARIANTS:
        if v2_key not in V1_VARIANTS:
            print("v2_key = %s not found in v1_keys" % (v2_key))
            raise SystemExit(1)

    print("Match individuals between the two vesrions: OK")
    sys.stdout.flush()

    # now compare the variants per individual
    for key in V1_VARIANTS:
        print("")
        print("Processing individual = %s" % (key))

        # work on a copy so that V2_VARIANTS keeps the full list read from the file
        v2_unmatched = list(V2_VARIANTS[key])

        for v1_var in V1_VARIANTS[key]:
            if v1_var in v2_unmatched:
                print("%s: %s found in both versions" % (key, v1_var))
                # remove one occurrence only, so duplicated variants are matched one to one
                v2_unmatched.remove(v1_var)
            else:
                print(" <<<< %s: %s found only in V1" % (key, v1_var))

        # in v2_unmatched, remaining are only those for which no match in V1 was found
        for v2_var in v2_unmatched:
            print(" >>>> %s: %s found only in V2" % (key, v2_var))

        print("-----------------------------")
# Script entry point: expects exactly two arguments, the V1 and V2 result folders.
if __name__ == '__main__':
    if len(sys.argv) == 3:
        go(sys.argv[1], sys.argv[2])
    else:
        print("Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2")
        # wrong usage is an error: exit with a non-zero status so calling scripts can detect it
        raise SystemExit(1)