# given # the BED file for all genes (/home/u035/u035/shared/resources/exome_targets/CCDS.20180614.plus15bp.merged.bed) # and the file for the genes in the DDG2P (unique gene names, inluding synonyms, i.e., /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.30082018.txt) # extract # the BED file for the DDG2P genes # # Author: MH # last modified: MAY 21, 2019 import sys import os import csv import gzip DDG2P_GENE_NAME = {} # key: gene name; value: irrelevant FOUND_GENES = {} # key: gene name for which an interval is found in the all genes BED file; value: irrelavnt def go(in_BED_file,DDG2P_gene_file,out_BED_file): # read in the DDG2P file in_han = open(DDG2P_gene_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] if len(data) != 1: print "Problem in %s" % (DDG2P_gene_file) print line raise SystemExit gene = data[0] if gene not in DDG2P_GENE_NAME: DDG2P_GENE_NAME[gene] = 0 else: print "Duplicate gene name = %s in %s" % (gene,DDG2P_gene_file) raise SystemExit in_han.close() print "Found %s unique gene names in %s" % (len(DDG2P_GENE_NAME),DDG2P_gene_file) sys.stdout.flush() in_han = open(in_BED_file,'r') out_han = open(out_BED_file,'w') in_cntr = 0 out_cntr = 0 for line in in_han: in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] if len(data) != 4: print "Problem in %s" % (in_BED_file) print line raise SystemExit uniq_gene_names = [] names = data[3] # PLEKHN1-CCDS4.1.1,PLEKHN1-CCDS53256.1.1 name_list = [y.strip() for y in names.strip().split(',')] # [PLEKHN1-CCDS4.1.1,PLEKHN1-CCDS53256.1.1] for z in name_list: info = z.strip().split('-CCDS') # otherwise problem for read-through genes, e.g. Problem: 'CENPS-CORT-CCDS114.1.1' if len(info) != 2: print "Problem: %s" % (z) raise SystemExit gene_name = info[0] if gene_name not in uniq_gene_names: uniq_gene_names.append(gene_name) for u in uniq_gene_names: if u in DDG2P_GENE_NAME: FOUND_GENES[u] = 1 out_han.write(line) out_cntr += 1 break in_han.close() out_han.close() print "Read %s records from the input BED file = %s" % (in_cntr,in_BED_file) print "Wrote %s record for the DDG2P genes in the output BED file = %s" % (out_cntr,out_BED_file) print "Found intervals for %s uniq DDG2P genes" % (len(FOUND_GENES)) if __name__ == '__main__': if len(sys.argv) == 4: go(sys.argv[1],sys.argv[2],sys.argv[3]) else: print "Suggested use: time python /exports/igmm/eddie/IGMM-VariantAnalysis/mike/scripts/extract_BED_CCDS_DDG2P.py \ /exports/igmm/eddie/IGMM-VariantAnalysis/alison/exome_kit_compare/targets/CCDS.20180614.plus15bp.merged.bed \ /exports/igmm/eddie/IGMM-VariantAnalysis/resources/G2P/genes_in_DDG2P.30082018.txt \ /exports/igmm/eddie/IGMM-VariantAnalysis/alison/exome_kit_compare/targets/DDG2P.CCDS.20180614.plus15bp.merged.bed" raise SystemExit