NHS_WES_extract_shared_vars.py

#	given a VCF file and a list of IDs (as in the #CHROM line)
#	extract a subset VCF containing only variants which are not HOM REF in *all* the listed IDs 
#
#
#       Author: MH
#       last modified: AUG 18, 2020


import sys
import os
import gzip


ID_DICT = {}	#	key: ID, value: field index (0 = chr, 1 = pos, ....)


def go(in_file,list_ids,out_file):

    ids = [x.strip() for x in list_ids.strip().split(',')]
    print "Found %s IDs for which the shared variants are to be extracted" % (len(ids))
    print ids

    in_han = gzip.open(in_file,'r')
    out_han = open(out_file,'w')
    in_cntr = 0
    out_cntr = 0

    for line in in_han:
        if line.startswith('#CHROM'):
            print line
            # find the index for each ID
            # potential problems - ID more than once in the VCF (will return only the first match) - should not occur in VCFs
            #                    - ID not in the list  
            header_fields = [x.strip() for x in line.strip().split('\t')]
            for i in ids:
                try:
                    index = header_fields.index(i)
                    if i not in ID_DICT:
                        ID_DICT[i] = index
                    else:
                        print "The supplied list of IDs contain duplicates for %s" % (i)
                        raise SystemExit  
                except ValueError:
                    print "%s not found in %s" % (i,line)
                    raise SystemExit  

            if len(ID_DICT) != len(ids):
                print "Could not find the indexes for all IDs"
                print ID_DICT
                raise SystemExit 
            
            # write the new line in out_file 
            new_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
            for z in xrange(0,len(ids)):
                new_header = new_header + '%s\t' % (ids[z])
            new_header = new_header[:-1] + '\n'
            out_han.write(new_header)
            continue

        if line.startswith('#'):
            out_han.write(line)
            continue 

        # startng to read the variants
        in_cntr += 1
        data = [x.strip() for x in line.strip().split('\t')]  
        desired = {}	# key: id; value: the variant info
        for i in ids:
           ind = ID_DICT[i]
           var = data[ind]
           desired[i] = var 
        print desired  

        out_cntr += 1 

    in_han.close()
    out_han.close()
    print "Read %s vars from the family file, wrote %s shared variants" % (in_cntr,out_cntr)
    sys.stdout.flush()


def temp():

    AFF_PROBANDS = []

    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        pro_fam_id = data[1]
        par_1 = int(data[2])
        par_2 = int(data[3])
        aff = int(data[5])

        if (par_1 != 0) or (par_2 != 0):
            print "ERROR: Found a proband with a parent"
            print line
            raise SystemExit(1) 

        if aff != 2:
            print "ERROR: Found unaffected proband"
            print line
            raise SystemExit(1)

        if pro_fam_id not in AFF_PROBANDS:
            AFF_PROBANDS.append(pro_fam_id)
        else:
            print "ERROR: Found duplicate proband"
            print line
            raise SystemExit(1)

    in_han.close()
    print "PED file checks: success"
    print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file)
    sys.stdout.flush()   


if __name__ == '__main__':
    if len(sys.argv) == 4:
        go(sys.argv[1],sys.argv[2],sys.argv[3])
    else:
        print "Suggested use: time python /home/u035/project/scripts/NHS_WES_extract_shared_vars.py in_vcf comma,sep,list,of,ids out_vcf"
        raise SystemExit