extract_BED_CCDS_DDG2P.py

#	given 
#		the BED file for all genes (/home/u035/u035/shared/resources/exome_targets/CCDS.20180614.plus15bp.merged.bed)
#		and the file for the genes in the DDG2P (unique gene names, inluding synonyms, i.e., /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.30082018.txt)
#	extract 
#		the BED file for the DDG2P genes
#
#       Author: MH
#       last modified: MAY 21, 2019


import sys
import os
import csv
import gzip


DDG2P_GENE_NAME = {}		# key: gene name; value: irrelevant
FOUND_GENES = {}		# key: gene name for which an interval is found in the all genes BED file; value: irrelavnt


def go(in_BED_file,DDG2P_gene_file,out_BED_file):

    # read in the DDG2P file
    in_han = open(DDG2P_gene_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        if len(data) != 1:
            print "Problem in %s" % (DDG2P_gene_file)
            print line
            raise SystemExit
        gene = data[0]
        if gene not in DDG2P_GENE_NAME:
            DDG2P_GENE_NAME[gene] = 0
        else:
            print "Duplicate gene name = %s in %s" % (gene,DDG2P_gene_file)  
            raise SystemExit
    in_han.close()
    print "Found %s unique gene names in %s" % (len(DDG2P_GENE_NAME),DDG2P_gene_file)
    sys.stdout.flush()


    in_han = open(in_BED_file,'r')
    out_han = open(out_BED_file,'w')
    in_cntr = 0
    out_cntr = 0

    for line in in_han:
        in_cntr += 1
        data = [x.strip() for x in line.strip().split('\t')]
        if len(data) != 4:
            print "Problem in %s" % (in_BED_file)
            print line
            raise SystemExit

        uniq_gene_names = []
        names = data[3]								# PLEKHN1-CCDS4.1.1,PLEKHN1-CCDS53256.1.1
        name_list = [y.strip() for y in names.strip().split(',')]		# [PLEKHN1-CCDS4.1.1,PLEKHN1-CCDS53256.1.1]
        for z in name_list:
            info = z.strip().split('-CCDS')					# otherwise problem for read-through genes, e.g. Problem: 'CENPS-CORT-CCDS114.1.1'
            if len(info) != 2:
                print "Problem: %s" % (z)
                raise SystemExit
            gene_name = info[0]
            if gene_name not in uniq_gene_names:
                uniq_gene_names.append(gene_name)

        for u in uniq_gene_names:
            if u in DDG2P_GENE_NAME:
                FOUND_GENES[u] = 1
                out_han.write(line)
                out_cntr += 1
                break 
          
    in_han.close()     
    out_han.close()
    print "Read %s records from the input BED file = %s" % (in_cntr,in_BED_file)
    print "Wrote %s record for the DDG2P genes in the output BED file = %s" % (out_cntr,out_BED_file)
    print "Found intervals for %s uniq DDG2P genes" % (len(FOUND_GENES))


if __name__ == '__main__':
    if len(sys.argv) == 4:
        go(sys.argv[1],sys.argv[2],sys.argv[3])
    else:
        print "Suggested use: time python /exports/igmm/eddie/IGMM-VariantAnalysis/mike/scripts/extract_BED_CCDS_DDG2P.py \
                              /exports/igmm/eddie/IGMM-VariantAnalysis/alison/exome_kit_compare/targets/CCDS.20180614.plus15bp.merged.bed \
                              /exports/igmm/eddie/IGMM-VariantAnalysis/resources/G2P/genes_in_DDG2P.30082018.txt \
                              /exports/igmm/eddie/IGMM-VariantAnalysis/alison/exome_kit_compare/targets/DDG2P.CCDS.20180614.plus15bp.merged.bed"
        raise SystemExit