compare_indi_vars_by_version.py 4.69 KiB
#
# Author: MH
# last modified: SEPT 25, 2020
import sys
import os
import fnmatch
from collections import defaultdict
V1_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
V2_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
def go(v1_dir,v2_dir):
v1_files = []
for root, dirnames, filenames in os.walk(v1_dir):
for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
v1_files.append(os.path.join(root, filename))
print "Found %s *_DEC_FLT.csv files in %s" % (len(v1_files),v1_dir)
for v1_file in v1_files:
parsed = [x.strip() for x in v1_file.strip().split('/')]
dname = parsed[8]
fname = parsed[9]
fname_parsed = [y.strip() for y in fname.strip().split('_')]
indi_id = fname_parsed[0]
fam_id = fname_parsed[1]
key = "%s_%s" % (indi_id,fam_id)
V1_VARIANTS[key] = []
# print "Processing V1: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
# now open the file and read the variants
in_han = open(v1_file,'r')
for line in in_han:
if line.startswith("Internal reference number or ID"):
continue
data = [z.strip() for z in line.strip().split(',')]
internal_id = data[0]
chr = data[1]
pos = data[2]
ref = data[4]
alt = data[5]
value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
# print value
V1_VARIANTS[key].append(value)
# print " Found %s variants for %s" % (len(V1_VARIANTS[key]),key)
# now process V2
v2_files = []
for root, dirnames, filenames in os.walk(v2_dir):
for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
v2_files.append(os.path.join(root, filename))
print "Found %s *_DEC_FLT.csv files in %s" % (len(v2_files),v2_dir)
for v2_file in v2_files:
parsed = [x.strip() for x in v2_file.strip().split('/')]
dname = parsed[8]
fname = parsed[9]
fname_parsed = [y.strip() for y in fname.strip().split('_')]
indi_id = fname_parsed[0]
fam_id = fname_parsed[1]
key = "%s_%s" % (indi_id,fam_id)
V2_VARIANTS[key] = []
# print "Processing V2: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
# now open the file and read the variants
in_han = open(v2_file,'r')
for line in in_han:
if line.startswith("Internal reference number or ID"):
continue
data = [z.strip() for z in line.strip().split(',')]
internal_id = data[0]
chr = data[1]
pos = data[2]
ref = data[4]
alt = data[5]
value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
# print value
V2_VARIANTS[key].append(value)
# print " Found %s variants for %s" % (len(V2_VARIANTS[key]),key)
# check that in both folders we have the same individuals (i.e. keys)
v1_keys = V1_VARIANTS.keys()
v2_keys = V2_VARIANTS.keys()
for v1_key in v1_keys:
if v1_key not in v2_keys:
print "v1_key = %s not found in v2_keys" % (v1_key)
raise SystemExit
for v2_key in v2_keys:
if v2_key not in v1_keys:
print "v2_key = %s not found in v1_keys" % (v2_key)
raise SystemExit
print "Match individuals between the two vesrions: OK"
sys.stdout.flush()
# now compare the variants per individual
for key in v1_keys:
print ""
print "Processing individual = %s" % (key)
v1_variants = V1_VARIANTS[key]
v2_variants = V2_VARIANTS[key]
for v1_var in v1_variants:
if v1_var in v2_variants:
print "%s: %s found in both versions" % (key,v1_var)
v2_variants.remove(v1_var)
else:
print " <<<< %s: %s found only in V1" % (key,v1_var)
# in v2_variants, remaining ar only those for which no match in V1 was found
for v2_var in v2_variants:
print " >>>> %s: %s found only in V2" % (key,v2_var)
print "-----------------------------"
if __name__ == '__main__':
if len(sys.argv) == 3:
go(sys.argv[1],sys.argv[2])
else:
print ("Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2")
raise SystemExit