Skip to content
Snippets Groups Projects
compare_indi_vars_by_version.py 4.69 KiB
#
#       Author: MH
#       last modified: SEPT 25, 2020



import sys
import os
import fnmatch
from collections import defaultdict



V1_VARIANTS = defaultdict(list)		# key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
V2_VARIANTS = defaultdict(list)		# key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt




def go(v1_dir,v2_dir):

    v1_files = []
    for root, dirnames, filenames in os.walk(v1_dir):
        for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
            v1_files.append(os.path.join(root, filename))
    print "Found %s *_DEC_FLT.csv files in %s" % (len(v1_files),v1_dir)

    for v1_file in v1_files:
        parsed = [x.strip() for x in v1_file.strip().split('/')]
        dname = parsed[8]
        fname = parsed[9]
        fname_parsed = [y.strip() for y in fname.strip().split('_')]
        indi_id = fname_parsed[0]
        fam_id = fname_parsed[1]
        key = "%s_%s" % (indi_id,fam_id)
        V1_VARIANTS[key] = []  
#        print "Processing V1: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)   
        
        # now open the file and read the variants
        in_han = open(v1_file,'r')
        for line in in_han:
            if line.startswith("Internal reference number or ID"):
                continue
            data = [z.strip() for z in line.strip().split(',')]
            internal_id = data[0]
            chr = data[1]
            pos = data[2]
            ref = data[4]
            alt = data[5]
            value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
#            print value
            V1_VARIANTS[key].append(value)

#        print "    Found %s variants for %s" % (len(V1_VARIANTS[key]),key)              

        





    # now process V2
    v2_files = []
    for root, dirnames, filenames in os.walk(v2_dir):
        for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
            v2_files.append(os.path.join(root, filename))
    print "Found %s *_DEC_FLT.csv files in %s" % (len(v2_files),v2_dir)

    for v2_file in v2_files:
        parsed = [x.strip() for x in v2_file.strip().split('/')]
        dname = parsed[8]
        fname = parsed[9]
        fname_parsed = [y.strip() for y in fname.strip().split('_')]
        indi_id = fname_parsed[0]
        fam_id = fname_parsed[1]
        key = "%s_%s" % (indi_id,fam_id)
        V2_VARIANTS[key] = []
#        print "Processing V2: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
    
 
        # now open the file and read the variants
        in_han = open(v2_file,'r')
        for line in in_han:
            if line.startswith("Internal reference number or ID"):
                continue
            data = [z.strip() for z in line.strip().split(',')]
            internal_id = data[0]
            chr = data[1]
            pos = data[2]
            ref = data[4]
            alt = data[5]
            value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
#            print value
            V2_VARIANTS[key].append(value)
#        print "    Found %s variants for %s" % (len(V2_VARIANTS[key]),key)




    # check that in both folders we have the same individuals (i.e. keys)
    v1_keys = V1_VARIANTS.keys()
    v2_keys = V2_VARIANTS.keys()
    for v1_key in v1_keys:
        if v1_key not in v2_keys:  
            print "v1_key = %s not found in v2_keys" % (v1_key)
            raise SystemExit
    for v2_key in v2_keys:
        if v2_key not in v1_keys:
            print "v2_key = %s not found in v1_keys" % (v2_key)
            raise SystemExit

    print "Match individuals between the two vesrions: OK"
    sys.stdout.flush()





    # now compare the variants per individual
    for key in v1_keys:
        print ""
        print "Processing individual = %s" % (key)
        v1_variants = V1_VARIANTS[key]
        v2_variants = V2_VARIANTS[key]

        for v1_var in v1_variants:
            if v1_var in v2_variants:
                print "%s: %s found in both versions" % (key,v1_var)
                v2_variants.remove(v1_var)
            else:
                print "  <<<< %s: %s found only in V1" % (key,v1_var)    

        # in v2_variants, remaining ar only those for which no match in V1 was found
        for v2_var in v2_variants:
            print "  >>>> %s: %s found only in V2" % (key,v2_var)
        print "-----------------------------"






if __name__ == '__main__':
    if len(sys.argv) == 3:
        go(sys.argv[1],sys.argv[2])
    else:
        print ("Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2")
        raise SystemExit