diff --git a/delete_BAM.sh b/delete_BAM.sh new file mode 100755 index 0000000000000000000000000000000000000000..c765ff4a338c9a576e5afca7397f4faf8b7d8800 --- /dev/null +++ b/delete_BAM.sh @@ -0,0 +1,95 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=2GB +#SBATCH --time=01:00:00 +#SBATCH --job-name=delete_bam +#SBATCH --output=delete_bam.%A_%a.out +#SBATCH --error=delete_bam.%A_%a.err + + +### Setup the folder structure for the downstream analysis### +BASE=/home/u035/u035/shared/analysis/work +WORK_DIR=$BASE/${PROJECT_ID} +VCF_DIR=${WORK_DIR}/VCF +PED_DIR=${WORK_DIR}/PED +LOG_DIR=${WORK_DIR}/LOG +G2P_DIR=${WORK_DIR}/G2P +VASE_DIR=${WORK_DIR}/VASE +COV_DIR=${WORK_DIR}/COV +DEC_DIR=${WORK_DIR}/DECIPHER +IGV_DIR=${DEC_DIR}/IGV +CNV_DIR=${WORK_DIR}/CNV +BAMOUT_DIR=${WORK_DIR}/BAMOUT +SCRIPTS_DIR=/home/u035/u035/shared/scripts + + + +echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source VCF, BAM and PED files i.e. /home/u035/u035/shared/results +echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 19650_Ansari_Morad +echo "BATCH_NUM = ${BATCH_NUM}" # the numerical part of the BATCH_ID e.g. 19650 +echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 19285 +echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done +echo "VERSION_N = ${VERSION_N}" # the version of the alignment and genotyping analysis + + + + + + + +############################################################## +### Delete indivdual BAMs (and indexes) iff CRAM found ### +############################################################## + +# make sure we are reading the data from the exact version, batch & plate ID +SOURCE_VCF_DIRS=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_* + + +for S_VCF_DIR in ${SOURCE_VCF_DIRS} +do + VCF_DIR_NAME="${S_VCF_DIR##*/}" + IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}" + FAM_ID=${my_arr[-1]} + echo " FAM_ID = ${FAM_ID}" + + # identify all folders (one for each individual) for this family containing cram/bam files (format: <INDI_ID>_<FAM_ID>) + cd ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAM_ID} + for ITEM in `ls -l` + do + if test -d $ITEM && [[ "$ITEM" == *"_"* ]] + then + echo " $ITEM is a CRAM/BAM folder..." + BAM=${ITEM}/${ITEM}-ready.bam + CRAM=${ITEM}/${ITEM}-ready.cram + + # check if the CRAM file exists, iff yes, delete the BAM file and its index + if [[ -f "$CRAM" ]] + then + echo " Found ${CRAM}" + echo " Removing ${BAM}" + rm ${BAM} + echo " Removing ${BAM}.bai" + rm ${BAM}.bai + else + echo " ERROR: CRAM file ${CRAM} not found - have not deleted BAM ${BAM}!" + fi + fi + done +done + + + + +echo "" +echo "" +echo "OK: Deletion of BAM files and their indexes for PROJECT_ID = $PROJECT_ID successful" + + + + + + + + + + diff --git a/generate_DEC_IGV_aff_sib_scripts_from_quad.py b/generate_DEC_IGV_aff_sib_scripts_from_quad.py index 728882d76752cc7f30f61cf176ad316918804206..7fbf4bff32fd6943a669ee861b8ee7a03d280aa1 100755 --- a/generate_DEC_IGV_aff_sib_scripts_from_quad.py +++ b/generate_DEC_IGV_aff_sib_scripts_from_quad.py @@ -12,7 +12,7 @@ # all G2P variants found in the individual VCF # # Author: MH -# last modified: SEPT 16, 2020 +# last modified: DEC 07, 2021 @@ -33,11 +33,13 @@ ACCESS = 'No' TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER KIDS_SEX_DICT = {} # key: <indi_fam_id>; value: sex (in the format 46XX/46XY) KIDS_G2P_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) -KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: irrelevant +KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR) SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans) NUM_SHARED_G2P_VARS = 0 SNAP_FLANK = 25 +FS_THRESH = float(60) +SOR_THRESH = float(3) @@ -118,9 +120,11 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,f # SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans) pro_vcf_vars = KIDS_VCF_DICT[pro_id] - for pro_vcf_var,rub in pro_vcf_vars.iteritems(): + for pro_vcf_var,fs_sor in pro_vcf_vars.iteritems(): chr,pos,ref,alt = pro_vcf_var.split(':') pos = int(pos) + FS = fs_sor[0] + SOR = fs_sor[1] # adjust pro_vcf_var for indels to match G2P style of recording if len(ref) == len(alt): # SNP @@ -210,7 +214,16 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,f # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_shared_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') @@ -249,6 +262,18 @@ def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id): ref = data[3] alt = data[4] + # extract FS and SOR + FS = '' + SOR = '' + infos = [y.strip() for y in data[7].strip().split(';')] + for info in infos: + if info.startswith('FS='): + tag,FS = info.split('=') + FS = float(FS) + elif info.startswith('SOR='): + tag,SOR = info.split('=') + SOR = float(SOR) + # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" @@ -258,9 +283,9 @@ def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id): key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if pro_id not in THIS_DICT: - THIS_DICT[pro_id][key] = 1 + THIS_DICT[pro_id][key] = (FS,SOR) elif key not in THIS_DICT[pro_id]: - THIS_DICT[pro_id][key] = 1 + THIS_DICT[pro_id][key] = (FS,SOR) else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit diff --git a/generate_DEC_IGV_scripts.py b/generate_DEC_IGV_scripts.py index b42a82f99595566df1ebb15a256c6ea76fac8ef8..3c3ced5a886a19895491486d6a2f2085b055d588 100755 --- a/generate_DEC_IGV_scripts.py +++ b/generate_DEC_IGV_scripts.py @@ -15,7 +15,7 @@ # all VASE denovo variants found in the individual VCF # # Author: MH -# last modified: DEC 05, 2019 +# last modified: DEC 03, 2021 @@ -65,6 +65,8 @@ MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) I TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER +FS_THRESH = float(60) +SOR_THRESH = float(3) @@ -194,6 +196,18 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir ref = data[3] alt = data[4] + # extract FS and SOR + FS = '' + SOR = '' + infos = [y.strip() for y in data[7].strip().split(';')] + for info in infos: + if info.startswith('FS='): + tag,FS = info.split('=') + FS = float(FS) + elif info.startswith('SOR='): + tag,SOR = info.split('=') + SOR = float(SOR) + VCF_VAR = data[9] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) @@ -283,7 +297,16 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') @@ -370,7 +393,16 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') @@ -458,7 +490,16 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') diff --git a/generate_DEC_IGV_shared_scripts.py b/generate_DEC_IGV_shared_scripts.py index eb2b1ece3e52e0b33293e964b67ee271cbe69cc4..96c9f5e4b4204e85a16e32c8d0f59290a517cf4b 100755 --- a/generate_DEC_IGV_shared_scripts.py +++ b/generate_DEC_IGV_shared_scripts.py @@ -12,7 +12,7 @@ # all G2P variants found in the individual VCF # # Author: MH -# last modified: MAR 23, 2020 +# last modified: DEC 07, 2021 @@ -33,12 +33,15 @@ ACCESS = 'No' TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER KIDS_SEX_DICT = {} # key: <indi_fam_id>; value: sex (in the format 46XX/46XY) KIDS_G2P_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) -KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: irrelevant +KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR) SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans) NUM_SHARED_G2P_VARS = 0 SNAP_FLANK = 25 +FS_THRESH = float(60) +SOR_THRESH = float(3) + ### call the python scrpit @@ -113,9 +116,11 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,f # SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans) pro_vcf_vars = KIDS_VCF_DICT[pro_id] - for pro_vcf_var,rub in pro_vcf_vars.iteritems(): + for pro_vcf_var,fs_sor in pro_vcf_vars.iteritems(): chr,pos,ref,alt = pro_vcf_var.split(':') pos = int(pos) + FS = fs_sor[0] + SOR = fs_sor[1] # adjust pro_vcf_var for indels to match G2P style of recording if len(ref) == len(alt): # SNP @@ -205,7 +210,16 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,f # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') @@ -244,6 +258,18 @@ def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id): ref = data[3] alt = data[4] + # extract FS and SOR + FS = '' + SOR = '' + infos = [y.strip() for y in data[7].strip().split(';')] + for info in infos: + if info.startswith('FS='): + tag,FS = info.split('=') + FS = float(FS) + elif info.startswith('SOR='): + tag,SOR = info.split('=') + SOR = float(SOR) + # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" @@ -253,9 +279,9 @@ def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id): key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if pro_id not in THIS_DICT: - THIS_DICT[pro_id][key] = 1 + THIS_DICT[pro_id][key] = (FS,SOR) elif key not in THIS_DICT[pro_id]: - THIS_DICT[pro_id][key] = 1 + THIS_DICT[pro_id][key] = (FS,SOR) else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit diff --git a/generate_DEC_IGV_trio_scripts_from_quad.py b/generate_DEC_IGV_trio_scripts_from_quad.py index 2537002b64c6c451c5ff4f692216cdbc7794934d..3afdd794f3e4a95a9abc32baa77d6a22f83133c0 100755 --- a/generate_DEC_IGV_trio_scripts_from_quad.py +++ b/generate_DEC_IGV_trio_scripts_from_quad.py @@ -15,7 +15,7 @@ # all VASE denovo variants found in the individual VCF # # Author: MH -# last modified: DEC 05, 2019 +# last modified: DEC 07, 2021 @@ -65,6 +65,8 @@ MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) I TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER +FS_THRESH = float(60) +SOR_THRESH = float(3) @@ -217,6 +219,18 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_d ref = data[3] alt = data[4] + # extract FS and SOR + FS = '' + SOR = '' + infos = [y.strip() for y in data[7].strip().split(';')] + for info in infos: + if info.startswith('FS='): + tag,FS = info.split('=') + FS = float(FS) + elif info.startswith('SOR='): + tag,SOR = info.split('=') + SOR = float(SOR) + VCF_VAR = data[9] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) @@ -306,7 +320,16 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_d # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') @@ -393,7 +416,16 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_d # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') @@ -481,7 +513,16 @@ def go(dec_id,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_d # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK - i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) + + # check if above FS/SOR_THRESH to include in the snapshot name + if (FS == '') or (SOR == ''): + flag = 'NA' + elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): + flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) + else: + flag = 'OK' + i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) + out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') diff --git a/process_trio.sh b/process_trio.sh index cb18c41d6c2c0e625bc8658cadb5e31fd73dbdda..5cbe9b40d3f86bc2cdfbc07e8bcba1f9333797cf 100755 --- a/process_trio.sh +++ b/process_trio.sh @@ -367,6 +367,7 @@ FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_ ## call the python scrpit +#~~#time ${PYTHON2} /home/u035/u035/shared/temp/generate_DEC_IGV_scripts.py \ time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_scripts.py \ ${DEC_MAP} \ ${TRANS_MAP} \ diff --git a/trio_cram_setup.sh b/trio_cram_setup.sh new file mode 100755 index 0000000000000000000000000000000000000000..0cad9bd38dc8e55e77950c31a7dbccc7a36a89da --- /dev/null +++ b/trio_cram_setup.sh @@ -0,0 +1,137 @@ +#!/bin/bash +#SBATCH --cpus-per-task=16 +#SBATCH --mem=2GB +#SBATCH --time=24:00:00 +#SBATCH --job-name=trio_cram_setup +#SBATCH --output=trio_cram_setup.%A_%a.out +#SBATCH --error=trio_cram_setup.%A_%a.err + + +### Setup the folder structure for the downstream analysis### +BASE=/home/u035/u035/shared/analysis/work +WORK_DIR=$BASE/${PROJECT_ID} +VCF_DIR=${WORK_DIR}/VCF +PED_DIR=${WORK_DIR}/PED +LOG_DIR=${WORK_DIR}/LOG +G2P_DIR=${WORK_DIR}/G2P +VASE_DIR=${WORK_DIR}/VASE +COV_DIR=${WORK_DIR}/COV +DEC_DIR=${WORK_DIR}/DECIPHER +IGV_DIR=${DEC_DIR}/IGV +CNV_DIR=${WORK_DIR}/CNV +BAMOUT_DIR=${WORK_DIR}/BAMOUT +SCRIPTS_DIR=/home/u035/u035/shared/scripts + + + +### Tools +PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7 +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools +PICARD=/home/u035/u035/shared/software/bcbio/anaconda/bin/picard +REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa + + + +#~## check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting +#~#if [ -d "${WORK_DIR}" ]; then +#~# echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!" +#~# exit +#~#fi + + + + +echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source VCF, BAM and PED files i.e. /home/u035/u035/shared/results +echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 19650_Ansari_Morad +echo "BATCH_NUM = ${BATCH_NUM}" # the numerical part of the BATCH_ID e.g. 19650 +echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 19285 +echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done +echo "VERSION_N = ${VERSION_N}" # the version of the alignment and genotyping analysis + +#~#S_PED_DIR=${SOURCE_DIR}/../../params # requires that the family PED files are in this folder + + + + +# create the working dir and the required subfolders +mkdir ${WORK_DIR} +mkdir ${VCF_DIR} +mkdir ${PED_DIR} +mkdir ${LOG_DIR} +mkdir ${G2P_DIR} +mkdir ${VASE_DIR} +mkdir ${COV_DIR} +mkdir ${DEC_DIR} +mkdir ${IGV_DIR} +mkdir ${CNV_DIR} +mkdir ${BAMOUT_DIR} +echo "Created ${WORK_DIR} for this batch and all the required subfolders" + + + +###################################################### +### Copy the VCF and PED file per each family ### +###################################################### + +SOURCE_VCF_DIRS=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_* +SOURCE_PED_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/params + +for S_VCF_DIR in ${SOURCE_VCF_DIRS} +do +# echo " ${S_VCF_DIR}" + VCF_DIR_NAME="${S_VCF_DIR##*/}" +# echo " ${VCF_DIR_NAME}" + IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}" + FAM_ID=${my_arr[-1]} +# echo " BATCH = ${BATCH_ID}, PLATE = ${PLATE_ID}, FAM_ID = ${FAM_ID}" + echo " FAM_ID = ${FAM_ID}" + + # construct the VCF and PED file names for this family + S_VCF_FILE=${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz + S_PED_FILE=${SOURCE_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped + + + # copy the trio VCF and PED files + cp ${S_VCF_FILE} ${VCF_DIR} + cp ${S_PED_FILE} ${PED_DIR} + echo " copied ${S_VCF_FILE} --> ${VCF_DIR}" + echo " copied ${S_PED_FILE} --> ${PED_DIR}" + + + # identify all folders (one for each individual) for this family containing cram/bam files (format: <INDI_ID>_<FAM_ID>) + cd ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAM_ID} + for ITEM in `ls -l` + do + if test -d $ITEM && [[ "$ITEM" == *"_"* ]] + then + echo " $ITEM is a CRAM/BAM folder..." + BAM=${ITEM}/${ITEM}-ready.bam + CRAM=${ITEM}/${ITEM}-ready.cram + ${SAMTOOLS} view -@ 16 -T ${REFERENCE_GENOME} -hb -o ${BAM} ${CRAM} + ${PICARD} BuildBamIndex -I ${BAM} -O ${BAM}.bai -USE_JDK_DEFLATER true -USE_JDK_INFLATER true -VERBOSITY ERROR -QUIET true + echo " Generated ${BAM} and ${BAM}.bai from ${CRAM}" + fi + done + +done + + + +###################################################################################### +### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ### +###################################################################################### + +time ${PYTHON2} ${SCRIPTS_DIR}/extract_trio_FAM_PRO_ID.py ${WORK_DIR} + +echo "" +echo "" +echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful" + + + + + + + + +