# given a VCF file and a list of IDs (as in the #CHROM line) # extract a subset VCF containing only variants which are not HOM REF in *all* the listed IDs # # # Author: MH # last modified: AUG 18, 2020 import sys import os import gzip ID_DICT = {} # key: ID, value: field index (0 = chr, 1 = pos, ....) def go(in_file,list_ids,out_file): ids = [x.strip() for x in list_ids.strip().split(',')] print "Found %s IDs for which the shared variants are to be extracted" % (len(ids)) print ids in_han = gzip.open(in_file,'r') out_han = open(out_file,'w') in_cntr = 0 out_cntr = 0 for line in in_han: if line.startswith('#CHROM'): print line # find the index for each ID # potential problems - ID more than once in the VCF (will return only the first match) - should not occur in VCFs # - ID not in the list header_fields = [x.strip() for x in line.strip().split('\t')] for i in ids: try: index = header_fields.index(i) if i not in ID_DICT: ID_DICT[i] = index else: print "The supplied list of IDs contain duplicates for %s" % (i) raise SystemExit except ValueError: print "%s not found in %s" % (i,line) raise SystemExit if len(ID_DICT) != len(ids): print "Could not find the indexes for all IDs" print ID_DICT raise SystemExit # write the new line in out_file new_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' for z in xrange(0,len(ids)): new_header = new_header + '%s\t' % (ids[z]) new_header = new_header[:-1] + '\n' out_han.write(new_header) continue if line.startswith('#'): out_han.write(line) continue # startng to read the variants in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] desired = {} # key: id; value: the variant info for i in ids: ind = ID_DICT[i] var = data[ind] desired[i] = var print desired out_cntr += 1 in_han.close() out_han.close() print "Read %s vars from the family file, wrote %s shared variants" % (in_cntr,out_cntr) sys.stdout.flush() def temp(): AFF_PROBANDS = [] in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] pro_fam_id = data[1] par_1 = int(data[2]) par_2 = int(data[3]) aff = int(data[5]) if (par_1 != 0) or (par_2 != 0): print "ERROR: Found a proband with a parent" print line raise SystemExit(1) if aff != 2: print "ERROR: Found unaffected proband" print line raise SystemExit(1) if pro_fam_id not in AFF_PROBANDS: AFF_PROBANDS.append(pro_fam_id) else: print "ERROR: Found duplicate proband" print line raise SystemExit(1) in_han.close() print "PED file checks: success" print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file) sys.stdout.flush() if __name__ == '__main__': if len(sys.argv) == 4: go(sys.argv[1],sys.argv[2],sys.argv[3]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_extract_shared_vars.py in_vcf comma,sep,list,of,ids out_vcf" raise SystemExit