Newer
Older
# the BED file for all genes (/home/u035/u035/shared/resources/exome_targets/CCDS.20180614.plus15bp.merged.bed)
# and the file for the genes in the DDG2P (unique gene names, inluding synonyms, i.e., /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.30082018.txt)
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# extract
# the BED file for the DDG2P genes
#
# Author: MH
# last modified: MAY 21, 2019
import sys
import os
import csv
import gzip
DDG2P_GENE_NAME = {} # key: gene name; value: irrelevant
FOUND_GENES = {} # key: gene name for which an interval is found in the all genes BED file; value: irrelavnt
def go(in_BED_file,DDG2P_gene_file,out_BED_file):
# read in the DDG2P file
in_han = open(DDG2P_gene_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
if len(data) != 1:
print "Problem in %s" % (DDG2P_gene_file)
print line
raise SystemExit
gene = data[0]
if gene not in DDG2P_GENE_NAME:
DDG2P_GENE_NAME[gene] = 0
else:
print "Duplicate gene name = %s in %s" % (gene,DDG2P_gene_file)
raise SystemExit
in_han.close()
print "Found %s unique gene names in %s" % (len(DDG2P_GENE_NAME),DDG2P_gene_file)
sys.stdout.flush()
in_han = open(in_BED_file,'r')
out_han = open(out_BED_file,'w')
in_cntr = 0
out_cntr = 0
for line in in_han:
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
if len(data) != 4:
print "Problem in %s" % (in_BED_file)
print line
raise SystemExit
uniq_gene_names = []
names = data[3] # PLEKHN1-CCDS4.1.1,PLEKHN1-CCDS53256.1.1
name_list = [y.strip() for y in names.strip().split(',')] # [PLEKHN1-CCDS4.1.1,PLEKHN1-CCDS53256.1.1]
for z in name_list:
info = z.strip().split('-CCDS') # otherwise problem for read-through genes, e.g. Problem: 'CENPS-CORT-CCDS114.1.1'
if len(info) != 2:
print "Problem: %s" % (z)
raise SystemExit
gene_name = info[0]
if gene_name not in uniq_gene_names:
uniq_gene_names.append(gene_name)
for u in uniq_gene_names:
if u in DDG2P_GENE_NAME:
FOUND_GENES[u] = 1
out_han.write(line)
out_cntr += 1
break
in_han.close()
out_han.close()
print "Read %s records from the input BED file = %s" % (in_cntr,in_BED_file)
print "Wrote %s record for the DDG2P genes in the output BED file = %s" % (out_cntr,out_BED_file)
print "Found intervals for %s uniq DDG2P genes" % (len(FOUND_GENES))
if __name__ == '__main__':
if len(sys.argv) == 4:
go(sys.argv[1],sys.argv[2],sys.argv[3])
else:
print "Suggested use: time python /exports/igmm/eddie/IGMM-VariantAnalysis/mike/scripts/extract_BED_CCDS_DDG2P.py \
/exports/igmm/eddie/IGMM-VariantAnalysis/alison/exome_kit_compare/targets/CCDS.20180614.plus15bp.merged.bed \
/exports/igmm/eddie/IGMM-VariantAnalysis/resources/G2P/genes_in_DDG2P.30082018.txt \
/exports/igmm/eddie/IGMM-VariantAnalysis/alison/exome_kit_compare/targets/DDG2P.CCDS.20180614.plus15bp.merged.bed"
raise SystemExit