# # Author: MH # last modified: SEPT 25, 2020 import sys import os import fnmatch from collections import defaultdict V1_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt V2_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt def go(v1_dir,v2_dir): v1_files = [] for root, dirnames, filenames in os.walk(v1_dir): for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'): v1_files.append(os.path.join(root, filename)) print "Found %s *_DEC_FLT.csv files in %s" % (len(v1_files),v1_dir) for v1_file in v1_files: parsed = [x.strip() for x in v1_file.strip().split('/')] dname = parsed[8] fname = parsed[9] fname_parsed = [y.strip() for y in fname.strip().split('_')] indi_id = fname_parsed[0] fam_id = fname_parsed[1] key = "%s_%s" % (indi_id,fam_id) V1_VARIANTS[key] = [] # print "Processing V1: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key) # now open the file and read the variants in_han = open(v1_file,'r') for line in in_han: if line.startswith("Internal reference number or ID"): continue data = [z.strip() for z in line.strip().split(',')] internal_id = data[0] chr = data[1] pos = data[2] ref = data[4] alt = data[5] value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt) # print value V1_VARIANTS[key].append(value) # print " Found %s variants for %s" % (len(V1_VARIANTS[key]),key) # now process V2 v2_files = [] for root, dirnames, filenames in os.walk(v2_dir): for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'): v2_files.append(os.path.join(root, filename)) print "Found %s *_DEC_FLT.csv files in %s" % (len(v2_files),v2_dir) for v2_file in v2_files: parsed = [x.strip() for x in v2_file.strip().split('/')] dname = parsed[8] fname = parsed[9] fname_parsed = [y.strip() for y in fname.strip().split('_')] indi_id = fname_parsed[0] fam_id = fname_parsed[1] key = "%s_%s" % (indi_id,fam_id) V2_VARIANTS[key] = [] # print "Processing V2: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key) # now open the file and read the variants in_han = open(v2_file,'r') for line in in_han: if line.startswith("Internal reference number or ID"): continue data = [z.strip() for z in line.strip().split(',')] internal_id = data[0] chr = data[1] pos = data[2] ref = data[4] alt = data[5] value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt) # print value V2_VARIANTS[key].append(value) # print " Found %s variants for %s" % (len(V2_VARIANTS[key]),key) # check that in both folders we have the same individuals (i.e. keys) v1_keys = V1_VARIANTS.keys() v2_keys = V2_VARIANTS.keys() for v1_key in v1_keys: if v1_key not in v2_keys: print "v1_key = %s not found in v2_keys" % (v1_key) raise SystemExit for v2_key in v2_keys: if v2_key not in v1_keys: print "v2_key = %s not found in v1_keys" % (v2_key) raise SystemExit print "Match individuals between the two vesrions: OK" sys.stdout.flush() # now compare the variants per individual for key in v1_keys: print "" print "Processing individual = %s" % (key) v1_variants = V1_VARIANTS[key] v2_variants = V2_VARIANTS[key] for v1_var in v1_variants: if v1_var in v2_variants: print "%s: %s found in both versions" % (key,v1_var) v2_variants.remove(v1_var) else: print " <<<< %s: %s found only in V1" % (key,v1_var) # in v2_variants, remaining ar only those for which no match in V1 was found for v2_var in v2_variants: print " >>>> %s: %s found only in V2" % (key,v2_var) print "-----------------------------" if __name__ == '__main__': if len(sys.argv) == 3: go(sys.argv[1],sys.argv[2]) else: print ("Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2") raise SystemExit