diff --git a/convert_DEC_to_v10.py b/convert_DEC_to_v10.py index f01d316337f5a2748840aa6ca4b35fc539078b34..67f26a78bb6cd0b061a1c298a7704d8b268c6217 100644 --- a/convert_DEC_to_v10.py +++ b/convert_DEC_to_v10.py @@ -2,7 +2,7 @@ # convert it to v10 # # Author: MH -# last modified: AUG 05, 2020 +# last modified: APR 25, 2022 @@ -25,12 +25,10 @@ def go(inout_dir,id): # the folder where the bulk upload files are strored; id # create the worksheet worksheet = workbook.add_worksheet('Sequence Variants') - # write the header row header = ('Patient internal reference number or ID','Shared','Assembly','HGVS code','Chromosome','Genomic start','Ref sequence','Alt sequence','Gene name','Transcript','Is intergenic','Genotype','Inheritance','Pathogenicity','Pathogenicity evidence','Contribution','Genotype groups') worksheet.write_row(0,0,header) - # now, open and read the old file, for each variant collecting the information required for v10 and writing it in the v10 file cntr = 0 diff --git a/extract_solo_FAM_PRO_ID.py b/extract_solo_FAM_PRO_ID.py index 903ab671b0166cb623ed023aeedb6b5410f5015b..3dcfa453973cf6c680b46b34751f426bbe4f4c28 100755 --- a/extract_solo_FAM_PRO_ID.py +++ b/extract_solo_FAM_PRO_ID.py @@ -3,7 +3,7 @@ # solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt # # Author: MH -# last modified: MARCH 04, 2022 +# last modified: APR 25, 2022 diff --git a/extract_trio_FAM_PRO_ID.py b/extract_trio_FAM_PRO_ID.py index 5e2ae5348704133dadd32d236a75fe6262f61d9e..4dac3b03ba55b6003b79dab6ff4e99582b408035 100755 --- a/extract_trio_FAM_PRO_ID.py +++ b/extract_trio_FAM_PRO_ID.py @@ -3,7 +3,7 @@ # FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt # # Author: MH -# last modified: NOV 02, 2021 +# last modified: APR 25, 2022 @@ -34,8 +34,6 @@ def go(work_dir): for file in os.listdir(ped_dir): # per each PED file if file.endswith(".ped"): -# print(os.path.join(ped_dir, file)) - print " %s" % (file) in_file = os.path.join(ped_dir, file) @@ -49,9 +47,6 @@ def go(work_dir): x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split - -# print "data[0]=%s,data[1]=%s,x_plate=%s,x_fam=%s,y_indi=%s,y_fam=%s" % (data[0],data[1],x_plate,x_fam,y_indi,y_fam) - if FAM_ID == 0: FAM_ID = x_fam elif FAM_ID != x_fam: diff --git a/filter_LQ_GT.py b/filter_LQ_GT.py index 16169e07e2f96287f7e75a633cb5f89dc73b378f..fff3d37e2a61d9f71be84d10513a9f1350bf959f 100644 --- a/filter_LQ_GT.py +++ b/filter_LQ_GT.py @@ -3,7 +3,7 @@ # # # Author: MH -# last modified: JUNE 06, 2019 +# last modified: APR 25, 2022 @@ -27,7 +27,6 @@ def go(black_file,in_file,out_file): print " Reading the blacklist file: Found %s blacklisted variants in %s" % (len(BLACKLIST),black_file) print "" - cntr_sites = 0 cntr_vars = 0 cntr_reset = 0 diff --git a/gather_trio_results.sh b/gather_trio_results.sh index 7325bffbcb4d76771d147504145f2a6fdf30adbe..c9faf5bc6f1252bd1aad803df0bdf03d7288efdb 100755 --- a/gather_trio_results.sh +++ b/gather_trio_results.sh @@ -7,9 +7,12 @@ #SBATCH --error=gather_results.%A_%a.err +# Author: MH +# last modified: APR 25, 2022 + -### folder structure for the downstream analysis - created by trio_setup.sh ### +### folder structure for the downstream analysis - created by trio_setup.sh ### BASE=/home/u035/u035/shared/analysis/work WORK_DIR=${BASE}/${PROJECT_ID} NHS_DIR=${WORK_DIR}/${BATCH_NUM}_${VERSION_N}_results @@ -19,6 +22,8 @@ NHS_DIR=${WORK_DIR}/${BATCH_NUM}_${VERSION_N}_results FAMILY_IDS=${WORK_DIR}/FAM_IDs.txt # created by trio_setup.sh CHILD_IDS=${WORK_DIR}/PRO_IDs.txt # created by trio_setup.sh + +# print out the input echo "BATCH_NUM = ${BATCH_NUM}" # the numerical part of the BATCH_ID echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862 echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done @@ -32,19 +37,6 @@ if [ ! -d "${NHS_DIR}" ]; then fi -#~## enable running singletons -#~#if [ -z $PBS_ARRAY_INDEX ] -#~#then -#~# if [ -z $INDEX ] -#~# then -#~# export PBS_ARRAY_INDEX=1 -#~# else -#~# export PBS_ARRAY_INDEX=$INDEX -#~# fi -#~#fi - - - FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1` # contains only the family IDs (e.g.385295) PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1` # contains only the proband IDs (e.g. 107060) @@ -84,5 +76,7 @@ cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR} cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR} cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR} +echo "" +echo "" echo "OK: Results for ${FAMILY_ID} are stored in ${FAM_DIR}" diff --git a/generate_DEC_IGV_scripts.py b/generate_DEC_IGV_scripts.py index 81892871a58e221e5c1928e1d8d73028bff0393b..8debbaf7f3aafede5e64e71fa150de44d375049e 100755 --- a/generate_DEC_IGV_scripts.py +++ b/generate_DEC_IGV_scripts.py @@ -15,7 +15,7 @@ # all VASE denovo variants found in the individual VCF # # Author: MH -# last modified: JAN 18, 2022 +# last modified: APR 25, 2022 @@ -650,7 +650,6 @@ def read_G2P(in_file): global NUM_UNIQ_G2P_VARS -#.# known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # first, read the G2P variants on canonical transcripts for each of the family members @@ -1189,54 +1188,6 @@ def read_G2P(in_file): pass - -#.# ######################################################################## -#.# ### process x-linked over-dominance genes - no filtering to be done ### -#.# ######################################################################## - -#.# for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - -#.# CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] -#.# CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] -#.# CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] - -#.# # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) -#.# chr,start,end,ref,alt = key.split(":") -#.# if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized -#.# if len(ref) < len(alt): # an INS -#.# orig_start = start -#.# orig_ref = ref -#.# orig_alt = alt -#.# start = orig_start -#.# ref = '-' -#.# alt = orig_alt[len(orig_ref):] -#.# print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) -#.# else: # a DEL -#.# print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" -#.# print line -#.# raise SystemExit - -#.# new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - -#.# # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) -#.# if new_key not in G2P_DICT: -#.# G2P_DICT[new_key] = 0 -#.# else: -#.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) -#.# # raise SystemExit -#.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req -#.# pass - -#.# # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA -#.# if new_key not in G2P_DATA: -#.# G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) -#.# else: -#.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) -#.# # raise SystemExit -#.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req -#.# pass - - NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() @@ -1251,15 +1202,6 @@ def read_G2P(in_file): - - - - - - - - - def read_ped(in_file): global CHILD_ID diff --git a/get_cov_output.py b/get_cov_output.py index cc81c649c0248d3e5c1116804ebbdd80e8929a7c..49add7fa11c8c8979d2485da701c639b05e380e9 100644 --- a/get_cov_output.py +++ b/get_cov_output.py @@ -1,4 +1,4 @@ -# given +# given # <gene_set>.<ID>.sample_interval_summary - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt # <gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt # @@ -7,7 +7,7 @@ # all the columns from the ClinVar file + another column with the percentage of bases covered at least 20x from the coverage file # # Author: MH -# last modified: SEPT 26, 2019 +# last modified: APR 25, 2022 @@ -22,7 +22,6 @@ COV_DICT = {} # key: 'chr:start-1:end'; value: coverage column (data[8]) - def go(in_gatk_file,in_clin_file,out_cov_file): # read in the coverage file diff --git a/process_trio.sh b/process_trio.sh index b0ae363af34b001a7da780fdbfda5ef9ba586093..4600a36ed4071a265a80f426cfb1eb49acc4cc3f 100755 --- a/process_trio.sh +++ b/process_trio.sh @@ -7,6 +7,9 @@ #SBATCH --error=process_trio.%A_%a.err +# Author: MH +# last modified: APR 25, 2022 + # setup PATH export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin @@ -56,7 +59,7 @@ VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/sha REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa - +# print out the input echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM files (VCF and PED already copied) i.e. /home/u035/u035/shared/results echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 19650_Ansari_Morad echo "BATCH_NUM = ${BATCH_NUM}" # the numerical part of the BATCH_ID e.g. 19650 @@ -78,7 +81,6 @@ cd ${LOG_DIR} ##### for each family #### ################################ -#~#FAMILY_ID=`head -n ${PBS_ARRAY_INDEX} ${FAMILY_IDS} | tail -n 1` FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1` @@ -287,14 +289,12 @@ echo "" ##### for each proband #### ################################# -#~#PROBAND_ID=`head -n ${PBS_ARRAY_INDEX} ${CHILD_IDS} | tail -n 1` # contains only the proband IDs (e.g. 107060) PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1` # contains only the proband IDs (e.g. 107060) echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..." # make sure we are reading the data from the exact batch & plate ID -#~#BAM_FILE=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15 @@ -411,12 +411,10 @@ DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID} -#~#FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID} FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID} ## call the python scrpit -##time ${PYTHON2} /home/u035/u035/shared/temp/generate_DEC_IGV_scripts.py \ time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_scripts.py \ ${DEC_MAP} \ ${TRANS_MAP} \ @@ -491,9 +489,6 @@ echo "...kid_id = ${kid_id}, par_1_id = ${par_1_id}, par_2_id = ${par_2_id} " # gather the trio BAM files -#~#kid_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam -#~#par_1_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam -#~#par_2_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam kid_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam par_1_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam par_2_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam @@ -522,7 +517,6 @@ while IFS= read -r line do echo "$line" IFS=, read -ra ary <<<"$line" -# for key in "${!ary[@]}"; do echo "$key ${ary[$key]}"; done chr=${ary[1]} pos=${ary[2]} ref=${ary[4]} diff --git a/trio_setup.sh b/trio_setup.sh index 57bbba9fa6b98e4848c85d9be3d7cc7d129d4dcc..7f664060c1e5e2974ad173a49ef0b95d1624b332 100755 --- a/trio_setup.sh +++ b/trio_setup.sh @@ -6,7 +6,8 @@ #SBATCH --output=trio_setup.%A_%a.out #SBATCH --error=trio_setup.%A_%a.err - +# Author: MH +# last modified: APR 25, 2022 ### Setup the folder structure for the downstream analysis### @@ -29,16 +30,7 @@ SCRIPTS_DIR=/home/u035/u035/shared/scripts PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7 - -#~### check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting -#~#if [ -d "${WORK_DIR}" ]; then -#~# echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!" -#~# exit -#~#fi - - - - +# print out the input echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source VCF, BAM and PED files i.e. /home/u035/u035/shared/results echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 19650_Ansari_Morad echo "BATCH_NUM = ${BATCH_NUM}" # the numerical part of the BATCH_ID e.g. 19650 @@ -47,7 +39,6 @@ echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) echo "VERSION_N = ${VERSION_N}" # the version of the alignment and genotyping analysis -#~#S_PED_DIR=${SOURCE_DIR}/../../params # requires that the family PED files are in this folder