diff --git a/bin/compute_DDG2P_coverage_illumina_exome_bam.sh b/bin/compute_DDG2P_coverage_illumina_exome_bam.sh index d344ee2fc8ca8682c4e0cf515771b009c5b5ba26..9cd6fc4c1984660774db2b17fe549facfe870f3b 100755 --- a/bin/compute_DDG2P_coverage_illumina_exome_bam.sh +++ b/bin/compute_DDG2P_coverage_illumina_exome_bam.sh @@ -8,7 +8,7 @@ # Author: MH -# last modified: FEB 27, 2023 +# last modified: SEPT 11, 2023 # setup PATH @@ -17,7 +17,8 @@ export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/sit # The current DDG2P gene panel BED file -TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20221207.plus15bp.merged.bed +#TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20221207.plus15bp.merged.bed +TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20230817.plus15bp.merged.bed GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools @@ -37,7 +38,7 @@ BAM_FILE=`head -n ${SLURM_ARRAY_TASK_ID} ${BAM_LIST} | tail -n 1` # get the ID (Illumina Exome) to generate the out file name IFS='/' read -r -a array <<< "${BAM_FILE}" -ID="${array[8]}" +ID="${array[9]}" OUT_FILE=${ID}.DDG2P.COV diff --git a/bin/compute_DDG2P_default_mosdepth_coverage.sh b/bin/compute_DDG2P_default_mosdepth_coverage.sh new file mode 100755 index 0000000000000000000000000000000000000000..f444bde4c5f1f5a34af50017f236d60bf6f85767 --- /dev/null +++ b/bin/compute_DDG2P_default_mosdepth_coverage.sh @@ -0,0 +1,67 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=16GB +#SBATCH --time=24:00:00 +#SBATCH --job-name=DDG2P_default_mosdepth_cov +#SBATCH --output=DDG2P_default_mosdepth_cov.%A_%a.out +#SBATCH --error=DDG2P_default_mosdepth_cov.%A_%a.err + + +# Author: MH +# last modified: SEPT 15, 2023 + + +# setup PATH +export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin +export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2 + + + +export MOSDEPTH_Q0=NO_COVERAGE +export MOSDEPTH_Q1=LOW_COVERAGE +export MOSDEPTH_Q2=CALLABLE + +MOSDEPTH=/mnt/e1000/home/u035/u035/shared/software/bcbio/galaxy/../anaconda/bin/mosdepth + + + +## The current TWIST gene panel BED file +#TARGETS=/home/u035/u035/shared/resources/exome_targets/hg38_Twist_ILMN_Exome_2.0_Plus_Panel_annotated_June2022.plus15bp.bed + +# The current DDG2P gene panel BED file +TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20230817.plus15bp.merged.bed + + + + + + +################################# +###### for each BAM file #### +################################# +#BAM_FILE=`head -n ${SLURM_ARRAY_TASK_ID} ${BAM_LIST} | tail -n 1` +# +## get the ID (Illumina Exome) to generate the out file name +#IFS='/' read -r -a array <<< "${BAM_FILE}" +#ID="${array[9]}" + +#time ${MOSDEPTH} -t 16 -F 1804 -Q 1 --no-per-base --by ${TARGETS} --quantize 0:1:4: --thresholds 20,50 ${ID}.DDG2P ${BAM_FILE} + + +################################# +##### for each CRAM file #### +################################# +CRAM_FILE=`head -n ${SLURM_ARRAY_TASK_ID} ${CRAM_LIST} | tail -n 1` + +# get the ID (Illumina Exome) to generate the out file name +IFS='/' read -r -a array <<< "${CRAM_FILE}" +ID="${array[9]}" + +REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa + +time ${MOSDEPTH} -t 16 -F 1804 -Q 1 --no-per-base --by ${TARGETS} --quantize 0:1:4: --thresholds 20,50 --fasta ${REFERENCE_GENOME} ${ID}.DDG2P ${CRAM_FILE} + + + + +
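Reviewer note on the new mosdepth script: the three MOSDEPTH_Q* exports are what give the --quantize 0:1:4: bins their labels, and -F 1804 filters out unmapped, mate-unmapped, secondary, QC-fail and duplicate reads. A minimal sketch of how the outputs could be inspected, assuming the ${ID}.DDG2P prefix used above (the awk one-liner is an illustrative summary, not part of the pipeline):

```bash
# quantize 0:1:4: labels depth bins [0,1) NO_COVERAGE, [1,4) LOW_COVERAGE, [4,inf) CALLABLE
zcat SAMPLE.DDG2P.quantized.bed.gz | head          # chrom, start, end, label
# --thresholds 20,50 reports bases per DDG2P target covered at >=20x and >=50x
zcat SAMPLE.DDG2P.thresholds.bed.gz | head         # chrom, start, end, region, 20X, 50X
# illustrative only: overall fraction of target bases at >=20x
zcat SAMPLE.DDG2P.thresholds.bed.gz | awk 'NR > 1 { covered += $5; total += $3 - $2 } END { printf "%.4f\n", covered / total }'
```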
diff --git a/bin/compute_full_coverage_illumina_exome_bam.sh b/bin/compute_full_coverage_illumina_exome_bam.sh new file mode 100755 index 0000000000000000000000000000000000000000..14f97717567397461135e5b6a1968ee59bca9d5e --- /dev/null +++ b/bin/compute_full_coverage_illumina_exome_bam.sh @@ -0,0 +1,66 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=16GB +#SBATCH --time=24:00:00 +#SBATCH --job-name=compute_full_cov +#SBATCH --output=compute_full_cov.%A_%a.out +#SBATCH --error=compute_full_cov.%A_%a.err + + +# Author: MH +# last modified: SEPT 12, 2023 + + +# setup PATH +export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin +export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2 + + +# The current DDG2P gene panel BED file +# TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20230817.plus15bp.merged.bed + +# The current TWIST gene panel BED file +TARGETS=/home/u035/u035/shared/resources/exome_targets/hg38_Twist_ILMN_Exome_2.0_Plus_Panel_annotated_June2022.plus15bp.bed + + + +GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools +PICARD=/home/u035/u035/shared/software/bcbio/anaconda/bin/picard +REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa + + + + +# echo "BAM_LIST = ${BAM_LIST}" + + +################################ +##### for each BAM file #### +################################ +BAM_FILE=`head -n ${SLURM_ARRAY_TASK_ID} ${BAM_LIST} | tail -n 1` + +# get the ID (Illumina Exome) to generate the out file name +IFS='/' read -r -a array <<< "${BAM_FILE}" +ID="${array[9]}" +#OUT_FILE=${ID}.DDG2P.COV +OUT_FILE=${ID}.full.COV + + +time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \ + --omitDepthOutputAtEachBase \ + --minBaseQuality 20 \ + --minMappingQuality 20 \ + -ct 1 \ + -ct 5 \ + -ct 10 \ + -ct 15 \ + -ct 20 \ + -ct 25 \ + -ct 30 \ + -ct 50 \ + -jdk_deflater \ + -jdk_inflater \ + --allow_potentially_misencoded_quality_scores + +
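For the GATK3 script above, DepthOfCoverage writes its per-sample table to ${OUT_FILE}.sample_summary, with one %_bases_above_N column per -ct flag. A quick sanity check might look like this (SAMPLE.full.COV is an invented prefix standing in for ${OUT_FILE}):

```bash
# header plus one data row per sample; columns include total, mean and the -ct percentages
head -n 2 SAMPLE.full.COV.sample_summary
# mean coverage is the third column of the data row
awk 'NR == 2 { print "mean coverage:", $3 }' SAMPLE.full.COV.sample_summary
```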
diff --git a/bin/compute_mosdepth_with_overlap_coverage_bam.sh b/bin/compute_mosdepth_with_overlap_coverage_bam.sh new file mode 100755 index 0000000000000000000000000000000000000000..4b49bf002491dbbda987afd088587ef8bcf6c096 --- /dev/null +++ b/bin/compute_mosdepth_with_overlap_coverage_bam.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=16GB +#SBATCH --time=24:00:00 +#SBATCH --job-name=compute_mosdepth_with_overlap_cov +#SBATCH --output=compute_mosdepth_with_overlap_cov.%A_%a.out +#SBATCH --error=compute_mosdepth_with_overlap_cov.%A_%a.err + + +# Author: MH +# last modified: SEPT 13, 2023 + + +# setup PATH +export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin +export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2 + + + +export MOSDEPTH_Q0=NO_COVERAGE +export MOSDEPTH_Q1=LOW_COVERAGE +export MOSDEPTH_Q2=CALLABLE + +MOSDEPTH=/mnt/e1000/home/u035/u035/shared/software/bcbio/galaxy/../anaconda/bin/mosdepth + + +# The current TWIST gene panel BED file +TARGETS=/home/u035/u035/shared/resources/exome_targets/hg38_Twist_ILMN_Exome_2.0_Plus_Panel_annotated_June2022.plus15bp.bed + + +################################ +##### for each BAM file #### +################################ +BAM_FILE=`head -n ${SLURM_ARRAY_TASK_ID} ${BAM_LIST} | tail -n 1` + +# get the ID (Illumina Exome) to generate the out file name +IFS='/' read -r -a array <<< "${BAM_FILE}" +ID="${array[9]}" + + +time ${MOSDEPTH} --fast-mode -t 16 -F 1804 -Q 1 --no-per-base --by ${TARGETS} --quantize 0:1:4: ${ID} ${BAM_FILE} + + + diff --git a/bin/convert_DEC_to_v10.py b/bin/convert_DEC_to_v10.py old mode 100755 new mode 100644 index f4b23df27aaaaff3fc01bb4d1a83d224d4ff01ab..67f26a78bb6cd0b061a1c298a7704d8b268c6217 --- a/bin/convert_DEC_to_v10.py +++ b/bin/convert_DEC_to_v10.py @@ -1,4 +1,3 @@ -#!/bin/env python # given a DECIPHER bulk upload file v9 # convert it to v10 # @@ -15,7 +14,10 @@ import xlsxwriter -def go(id, in_file, out_file): # the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id> +def go(inout_dir,id): # the folder where the bulk upload files are stored; id is in the format <indiv_id>_<family_id> + + in_file = '%s/%s_DEC_FLT.csv' % (inout_dir,id) + out_file = '%s/%s_DECIPHER_v10.xlsx' % (inout_dir,id) # create the workbook workbook = xlsxwriter.Workbook(out_file) @@ -70,9 +72,9 @@ def go(id, in_file, out_file): # the folder where the bulk upload files are str if __name__ == '__main__': - if len(sys.argv) == 4: - go(*sys.argv[1:]) + if len(sys.argv) == 3: + go(sys.argv[1],sys.argv[2]) else: - print("Suggested use: convert_DEC_to_v10.py <indiv_id>_<family_id> <in_file> <out_file>") + print ("Suggested use: time python convert_DEC_to_v10.py decipher_dir <indiv_id>_<family_id>") raise SystemExit
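The convert_DEC_to_v10.py rewrite moves the file-name logic into the script, so the caller now passes the DECIPHER folder and the <indiv_id>_<family_id> stem instead of explicit in/out paths. A usage sketch (directory and stem invented for illustration):

```bash
time python convert_DEC_to_v10.py /path/to/decipher_dir 12345_FAM001
# reads:  /path/to/decipher_dir/12345_FAM001_DEC_FLT.csv
# writes: /path/to/decipher_dir/12345_FAM001_DECIPHER_v10.xlsx
```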
diff --git a/bin/count_decipher_entries.py b/bin/count_decipher_entries.py new file mode 100644 index 0000000000000000000000000000000000000000..82916ef065a7eb144c4c788c28c7febfbf707741 --- /dev/null +++ b/bin/count_decipher_entries.py @@ -0,0 +1,36 @@ +# Counts the number of variants in a DECIPHER Excel upload file (v10 or v11). +# +# Author: AM +# last modified: SEP 01, 2023 + + + + + + +import sys +import os +import shutil +import csv +import xlsxwriter +import xlrd + + +def go(DEC_file): + + if not os.path.exists(DEC_file): + print ('ERROR: cannot find ', DEC_file) + raise SystemExit + + wb = xlrd.open_workbook(DEC_file) + ws = wb.sheet_by_index(0) + print (DEC_file, ' ', ws.nrows) + +if __name__ == '__main__': + if len(sys.argv) == 2: + go(sys.argv[1]) + else: + print ('Suggested use: time python3 count_decipher_entries.py DECIPHER_v11.xlsx') + + + raise SystemExit
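As its suggested-use line says, count_decipher_entries.py takes a single Excel upload file. Note that it prints ws.nrows, which includes the header row, so the variant count is one less; it also assumes an xlrd build that can still open .xlsx (xlrd 2.x dropped xlsx support, so this presumably relies on xlrd 1.x in the python3 environment). Example with an invented file name:

```bash
time python3 count_decipher_entries.py 12345_FAM001_DECIPHER_v11.xlsx
# prints: 12345_FAM001_DECIPHER_v11.xlsx   <nrows>   (variants = nrows - 1)
```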
diff --git a/bin/extract_solo_FAM_PRO_ID.py b/bin/extract_solo_FAM_PRO_ID.py index 6306fa987fa369962c8b8fd67ecd50a03e2f613b..648dd3d6f254126da138109217a3dcc6e4c7b02a 100755 --- a/bin/extract_solo_FAM_PRO_ID.py +++ b/bin/extract_solo_FAM_PRO_ID.py @@ -1,17 +1,22 @@ -#!/usr/bin/env python3 # input: the work folder which contains a PED subfolder where all family PED files were copied # output: only for singletons # solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt # # Author: MH -# last modified: JUL 6, 2022 +# last modified: SEPT 05, 2023 -import argparse -def go(args): - out_fam_file = 'solo_FAM_IDs.txt' - out_pro_file = 'solo_PRO_IDs.txt' - out_f_p_file = 'solo_FAM_PRO.txt' + +import sys +import os + + + +def go(work_dir): + + out_fam_file = work_dir + '/solo_FAM_IDs.txt' + out_pro_file = work_dir + '/solo_PRO_IDs.txt' + out_f_p_file = work_dir + '/solo_FAM_PRO.txt' out_fam_han = open(out_fam_file,'w') out_pro_han = open(out_pro_file,'w') @@ -21,34 +26,58 @@ def go(args): cntr_pro = 0 cntr_f_p = 0 + # go over the PED folder in the working dir and process each PED file - print("") - print("Processing %i PED files to extract singleton FAM_ID, PRO_ID and FAM_PRO files" % (len(args.ped_files))) + ped_dir = work_dir + '/PED' + print "" + print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID and FAM_PRO files" % (ped_dir) + + - for ped_file in args.ped_files: # per each PED file - if ped_file.endswith(".ped"): + for file in os.listdir(ped_dir): # per each PED file + if file.endswith(".ped"): - print(" %s" % (ped_file)) + print " %s" % (file) + in_file = os.path.join(ped_dir, file) - # check how many lines in the PED file - if more than one cannot be a singleton, ignore - num_lines = 0 - in_han = open(ped_file,'r') + # check if this PED has a single record - most likely a singleton + first_cntr_check = 0 + in_han = open(in_file,'r') for line in in_han: - num_lines += 1 + first_cntr_check += 1 in_han.close() - if num_lines > 1: + if first_cntr_check != 1: + # print "WARNING: There are %s records in PED file = %s" % (first_cntr_check, file) + # print " This family will not be treated as a singleton family" continue - if num_lines == 0: - print("ERROR: empty PED file %s" % (ped_file)) - raise SystemExit - # if here, the PED file contains exactly one line - # check if all fine: parents IDs = 0, kid with known sex and is affected +# # check how many lines in the PED file - if more than one cannot be a singleton, ignore +# num_lines = 0 +# in_han = open(in_file,'r') +# for line in in_han: +# num_lines += 1 +# in_han.close() +# +# if num_lines > 1: +# #print "ERROR: found more than 1 individual in a singleton PED file = %s" % (in_file) +# #raise SystemExit +# print "WARNING: found more than 1 individual in PED file = %s, not a singleton, skipping...." % (in_file) +# continue +# if num_lines == 0: +# print "ERROR: empty PED file %s" % (file) +# raise SystemExit +# +# # if here, the PED file contains exactly one line +# # check if all fine: parents IDs = 0, kid with known sex and is affected + + + + # if here, there is exactly one record in the family PED file - keep checking and collect information CHILD_ID = 0 FAM_ID = 0 - in_han = open(ped_file,'r') + in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] @@ -56,8 +85,8 @@ y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split if x_fam != y_fam: - print("ERROR: FAMILY_ID mismatch in %s" % (ped_file)) - print(line) + print "ERROR: FAMILY_ID mismatch in %s" % (file) + print line raise SystemExit FAM_ID = x_fam @@ -65,8 +94,8 @@ # check both parent IDs == 0 if (data[2] != '0') or (data[3] != '0'): - print("ERROR: found parent ID for a singleton child in %s" % (ped_file)) - print(line) + print "ERROR: found parent ID for a singleton child in %s" % (file) + print line raise SystemExit # make sure the sex of the child is known @@ -74,23 +103,23 @@ if (CHILD_SEX == 1) or (CHILD_SEX == 2): pass else: - print("ERROR: proband sex unknown in %s" % (ped_file)) - print(line) + print "ERROR: proband sex unknown in %s" % (file) + print line raise SystemExit # check kid is affected if int(data[5]) != 2: - print("ERROR: singleton child not affected") - print(line) + print "ERROR: singleton child not affected" + print line raise SystemExit if FAM_ID == 0: - print("ERROR: Cannot find the FAMILY_ID in %s" % (ped_file)) + print "ERROR: Cannot find the FAMILY_ID in %s" % (file) raise SystemExit if CHILD_ID == 0: - print("ERROR: Cannot find CHILD_ID in %s" % (ped_file)) + print "ERROR: Cannot find CHILD_ID in %s" % (file) raise SystemExit - elif FAM_ID not in args.exclude_families: + else: out_fam_han.write('%s\n' % (FAM_ID)) cntr_fam += 1 out_pro_han.write('%s\n' % (CHILD_ID)) @@ -102,23 +131,23 @@ out_pro_han.close() out_f_p_han.close() - print("") - print("Singleton Families:") - print(" %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)) - print(" %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)) - print(" %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)) - print("") + + + print "" + print "Singleton Families:" + print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) + print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) + print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) + print "" + + + if __name__ == '__main__': - a = argparse.ArgumentParser( - description=''' - Extract singleton FAM_ID, PRO_ID and FAM_PRO files. Suggested use: - time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py - /home/u035/u035/shared/analysis/work/<PROJECT_ID> - ''' - ) - a.add_argument('ped_files', nargs='+') - args = a.parse_args() - a.add_argument('--exclude-families', nargs='*') - go(args) + if len(sys.argv) == 2: + go(sys.argv[1]) + else: + print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" + raise SystemExit + diff --git a/bin/extract_trio_FAM_PRO_ID.py b/bin/extract_trio_FAM_PRO_ID.py index 3bec2e218b0ac6ea5ce18fa9ee2c4d49580eaddc..bc10665a0881ce1f2b906b5cdc28706b6f91c1c9 100755 --- a/bin/extract_trio_FAM_PRO_ID.py +++ b/bin/extract_trio_FAM_PRO_ID.py @@ -1,18 +1,22 @@ -#!/usr/bin/env python3 # input: the work folder which contains a PED subfolder where all family PED files were copied # output: only for trio families # FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt # # Author: MH -# last modified: JUL 6, 2022 +# last modified: SEPT 05, 2023 -import argparse -def go(args): - out_fam_file = 'FAM_IDs.txt' - out_pro_file = 'PRO_IDs.txt' - out_f_p_file = 'FAM_PRO.txt' +import sys +import os + + + +def go(work_dir): + + out_fam_file = work_dir + '/FAM_IDs.txt' + out_pro_file = work_dir + '/PRO_IDs.txt' + out_f_p_file = work_dir + '/FAM_PRO.txt' out_fam_han = open(out_fam_file,'w') out_pro_han = open(out_pro_file,'w') @@ -22,18 +26,34 @@ def go(args): cntr_pro = 0 cntr_f_p = 0 + # go over the PED folder in the working dir and process each PED file - print("") - print("Processing %i PED files to extract FAM_ID, PRO_ID and FAM_PRO files" % (len(args.ped_files))) + ped_dir = work_dir + '/PED' + print "" + print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID and FAM_PRO files" % (ped_dir) + + for file in os.listdir(ped_dir): # per each PED file + if file.endswith(".ped"): + print " %s" % (file) + in_file = os.path.join(ped_dir, file) + + # check if this PED has three records - most likely a trio + first_cntr_check = 0 + in_han = open(in_file,'r') + for line in in_han: + first_cntr_check += 1 + in_han.close() - for ped_file in args.ped_files: # per each PED file - if ped_file.endswith(".ped"): - print(" %s" % (ped_file)) + if first_cntr_check != 3: + # print "WARNING: There are %s records in PED file = %s" % (first_cntr_check, file) + # print " This family will not be treated as a trio family" + continue + # if here, there are exactly 3 records in the family PED file - keep checking and collect information CHILD_ID = 0 FAM_ID = 0 - in_han = open(ped_file,'r') + in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] @@ -43,14 +63,14 @@ if FAM_ID == 0: FAM_ID = x_fam elif FAM_ID != x_fam: - print("ERROR: more than one FAMILY_ID in %s" % (ped_file)) + print "ERROR: more than one FAMILY_ID in %s" % (file) raise SystemExit if data[2] != '0' and data[3] != '0': # this is the child in the trio if CHILD_ID == 0: CHILD_ID = y_indi else: # seen another child - print("WARNING: already have seen a child (possibly a quad) in %s" % (ped_file)) + print "WARNING: already have seen a child (possibly a quad) in %s" % (file) CHILD_ID = 0 break @@ -58,21 +78,27 @@ if (CHILD_SEX == 1) or (CHILD_SEX == 2): pass else: - print("ERROR: proband sex unknown") - print(line) + print "ERROR: proband sex unknown" + print line raise SystemExit if int(data[5]) != 2: - print("ERROR: child in a trio not affected") - print(line) + print "ERROR: child in a trio not affected" + 
print line raise SystemExit + elif data[2] == '0' and data[3] == '0': # this is a parent in the trio + if int(data[5]) == 2: + print "WARNING: affected parent %s in a trio - still processed as a trio, but no de novo analysis" % (y_indi) + print line + + if FAM_ID == 0: - print("ERROR: Cannot find the FAMILY_ID in %s" % (ped_file)) + print "ERROR: Cannot find the FAMILY_ID in %s" % (file) raise SystemExit if CHILD_ID == 0: - print("WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (ped_file)) - elif FAM_ID not in args.exclude_families: + print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file) + else: out_fam_han.write('%s\n' % (FAM_ID)) cntr_fam += 1 out_pro_han.write('%s\n' % (CHILD_ID)) @@ -80,27 +106,29 @@ def go(args): out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID)) cntr_f_p += 1 + in_han.close() + out_fam_han.close() out_pro_han.close() out_f_p_han.close() - print("") - print("Trio Families:") - print(" %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)) - print(" %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)) - print(" %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)) - print("") + + + print "" + print "Trio Families:" + print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) + print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) + print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) + print "" + + + if __name__ == '__main__': - a = argparse.ArgumentParser( - description=''' - Extract FAM_ID, PRO_ID and FAM_PRO files from Ped files. Suggested use: - time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py - /home/u035/u035/shared/analysis/work/<PROJECT_ID> - ''' - ) - a.add_argument('ped_files', nargs='+') - a.add_argument('--exclude-families', nargs='*') - args = a.parse_args() - go(args) + if len(sys.argv) == 2: + go(sys.argv[1]) + else: + print "Suggested use: time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" + raise SystemExit + diff --git a/bin/filter_LQ_GT.py b/bin/filter_LQ_GT.py index 2e612f867280cfd82140c4599bc98c506c4fe9b5..b53a222a42fb3b4fe97121439c8ee24fee04c404 100755 --- a/bin/filter_LQ_GT.py +++ b/bin/filter_LQ_GT.py @@ -1,4 +1,3 @@ -#!/bin/env python2.7 # given a multi-sample VCF # reset the GT to ./. (no-call) for indi variant with num_ALT < num_ALT_THERSH or VAF < VAF_THRESH # @@ -51,7 +50,7 @@ def go(black_file,in_file,out_file): alt = data[4] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key in BLACKLIST: - print " Warning: %s is a blacklisted variant, excluding it ..." % (key) + print " INFO: %s is a blacklisted variant, excluding it ..." 
% (key) continue @@ -180,6 +179,6 @@ if __name__ == '__main__': if len(sys.argv) == 4: go(sys.argv[1],sys.argv[2],sys.argv[3]) else: - print "Suggested use: time $PYTHON2 ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf" + print "Suggested use: time $PYTHON2 filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf" raise SystemExit diff --git a/bin/generate_DEC_IGV_scripts.py b/bin/generate_DEC_IGV_scripts.py index 0c347caa630e76cd198b8bef01c66110035f5545..48f9e1400ada94cb7716489dbe3236bccaf3eddd 100755 --- a/bin/generate_DEC_IGV_scripts.py +++ b/bin/generate_DEC_IGV_scripts.py @@ -1,4 +1,3 @@ -#!/bin/env python2.7 # input: # the family PED file [${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped] # individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz] @@ -72,10 +71,12 @@ SOR_THRESH = float(3) -def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,child_vcf_file,mom_vcf_file,dad_vcf_file,plate_id,fam_id,child_bam,mom_bam,dad_bam, out_dec_file, out_igv_file): +def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir): + # read the decipher to internal ID mapping file read_map_file(dec_map_file) + # read the transcript mapping file read_trans_map(trans_map_file) @@ -105,6 +106,11 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # now read the individual VCFs and record all the variants + child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) + mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID) + dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID) + + read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT) print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID) sys.stdout.flush() @@ -144,11 +150,13 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # setup the DECIPHER output file + out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID) ################################ out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file + out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID) ################################# out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') @@ -156,6 +164,9 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir out_igv_han.write('maxPanelHeight 10000\n') out_igv_han.write('new\n') + child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID) + mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID) + dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID) out_igv_han.write('load %s\n' % (child_bam)) out_igv_han.write('load %s\n' % (mom_bam)) out_igv_han.write('load %s\n' % (dad_bam)) @@ -171,6 +182,7 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir in_cntr = 0 out_cntr = 0 + child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) in_han = 
gzip.open(child_vcf_file,'r') @@ -1331,10 +1343,10 @@ def read_trans_map(in_file): if __name__ == '__main__': - if len(sys.argv) == 17: - go(*sys.argv[1:]) + if len(sys.argv) == 12: + go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \ - dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,child_vcf_file,mom_vcf_file,dad_vcf_file,plate_id,fam_id,child_bam,mom_bam,dad_bam,out_dec_file.txt,out_igv_file.txt" + dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir" raise SystemExit diff --git a/bin/merge_and_rename_NGI_fastq_files.py b/bin/merge_and_rename_NGI_fastq_files.py new file mode 100644 index 0000000000000000000000000000000000000000..b0ef918ce1c0bc9f04ccaf873ed8ee6760e44521 --- /dev/null +++ b/bin/merge_and_rename_NGI_fastq_files.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +import re +import os +import sys +import shutil +import argparse +import collections + +def merge_files(sample_name, read_nb, file_list, dest_dir, suffix): + outfile=os.path.join(dest_dir, "{}{}_R{}.fastq.gz".format(sample_name, suffix, read_nb)) + tomerge = [f for f in file_list.split(':') if f] # drop empty fields so the empty-input check below can fire + print("Merging the following files:") + if not tomerge: + print("No read {} files found".format(read_nb)) + return + for fq in tomerge: + print(fq) + print("as {}".format(outfile)) + with open(outfile, 'wb') as wfp: + for fn in tomerge: + with open(fn, 'rb') as rfp: + shutil.copyfileobj(rfp, wfp) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=""" Merges the given list of FASTQ files. Output as {dest_dir}/{sample_name}{suffix}_R{read_end}.fastq.gz.""") + parser.add_argument("files", nargs='?', default='.', help="Colon-delimited list of FASTQ files to merge.") + parser.add_argument("sample_name", nargs='?', default='.', help="Output sample name.") + parser.add_argument("read_end", nargs='?', default='.', help="Read end (1 or 2).") + parser.add_argument("dest_dir", nargs='?', default='.', help="Path to where the merged files should be output. ") + parser.add_argument("suffix", nargs='?', default='', help="Optional suffix for sample names in output file names, e.g. sample_R1.fastq.gz. ") + args = parser.parse_args() + merge_files(args.sample_name, args.read_end, args.files, args.dest_dir, args.suffix) + 
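merge_and_rename_NGI_fastq_files.py relies on gzip members being concatenable, so it simply byte-concatenates the inputs with shutil.copyfileobj. Positional order is files, sample_name, read_end, dest_dir, suffix; a usage sketch with invented paths:

```bash
python merge_and_rename_NGI_fastq_files.py \
    /reads/L001_R1.fastq.gz:/reads/L002_R1.fastq.gz \
    SAMPLE01 1 /merged ""
# -> /merged/SAMPLE01_R1.fastq.gz
```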
") + args = parser.parse_args() + merge_files(args.sample_name, args.read_end, args.files, args.dest_dir, args.suffix) + diff --git a/bin/old_convert_DEC_to_v10.py b/bin/old_convert_DEC_to_v10.py new file mode 100755 index 0000000000000000000000000000000000000000..f4b23df27aaaaff3fc01bb4d1a83d224d4ff01ab --- /dev/null +++ b/bin/old_convert_DEC_to_v10.py @@ -0,0 +1,78 @@ +#!/bin/env python +# given a DECIPHER bulk upload file v9 +# convert it to v10 +# +# Author: MH +# last modified: APR 25, 2022 + + + +import sys +import os +import csv +import xlsxwriter + + + + +def go(id, in_file, out_file): # the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id> + + # create the workbook + workbook = xlsxwriter.Workbook(out_file) + + # create the worksheet + worksheet = workbook.add_worksheet('Sequence Variants') + + # write the header row + header = ('Patient internal reference number or ID','Shared','Assembly','HGVS code','Chromosome','Genomic start','Ref sequence','Alt sequence','Gene name','Transcript','Is intergenic','Genotype','Inheritance','Pathogenicity','Pathogenicity evidence','Contribution','Genotype groups') + worksheet.write_row(0,0,header) + + # now, open and read the old file, for each variant collecting the information required for v10 and writing it in the v10 file + cntr = 0 + + with open(in_file,'r') as tsvfile: + reader = csv.reader(tsvfile, delimiter=',', quotechar='"') + for row in reader: + if row[0] == 'Internal reference number or ID': # ignore the header line + continue + + cntr += 1 + id = str(row[0]) + shared = 'NHS-SCE' + assembly = 'GRCh38' + HGVS = '' + chr = str(row[1]) + start = str(row[2]) + ref = str(row[4]) + alt = str(row[5]) + gene = str(row[7]) + trans = str(row[6]) + inter = str(row[8]) + genotype = str(row[19]) + inher = str(row[15]) + if inher == 'Maternally inherited, constitutive in mother': + inher = 'Maternally inherited' + elif inher == 'Paternally inherited, constitutive in father': + inher = 'Paternally inherited' + patho = '' + evid = '' + cont = '' + gt_groups = '' + data = (id,shared,assembly,HGVS,chr,start,ref,alt,gene,trans,inter,genotype,inher,patho,evid,cont,gt_groups) + + # write it + worksheet.write_row(cntr,0,data) + + + # close the workbook + workbook.close() + + + +if __name__ == '__main__': + if len(sys.argv) == 4: + go(*sys.argv[1:]) + else: + print("Suggested use: convert_DEC_to_v10.py <indiv_id>_<family_id> <in_file> <out_file>") + raise SystemExit + diff --git a/bin/peddy_validation.pl b/bin/peddy_validation.pl index f8239522291223f9cc10bb2fb1fe1464239693fb..f25bf4831e33f8376a029203b51faf2f09ae593e 100755 --- a/bin/peddy_validation.pl +++ b/bin/peddy_validation.pl @@ -78,6 +78,7 @@ $out_fh->open($out_file, "w") or die "Could not open $out_file\n$!"; printf $out_fh "sample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n"; +my %out_lines; foreach my $family_id (sort keys %ped) { my $glob_str = sprintf("$fam_dir/*%s*/*%s*/qc/peddy/*.ped_check.csv", $family_id, $ped{$family_id}{'aff'}); @@ -88,8 +89,10 @@ foreach my $family_id (sort keys %ped) next; } - my $peddy_fh = new IO::File; - $peddy_fh->open($peddy_glob[0], "r") or die "Could not open $peddy_glob[0]\n$!"; + foreach my $peddy_file (@peddy_glob) + { + my $peddy_fh = new IO::File; + $peddy_fh->open($peddy_file, "r") or die "Could not open $peddy_file\n$!"; my @headers; my %info; @@ -128,11 +131,16 @@ foreach my $family_id (sort keys %ped) $info{'parent_error'}{$sample_pair} = $info{'pedigree_parents'}{$sample_pair} eq 
$info{'predicted_parents'}{$sample_pair} ? 'False' : 'True'; - printf $out_fh "$sample_pair\t%s\t%s\t%s\n", + my $out_line = sprintf "$sample_pair\t%s\t%s\t%s", $info{'pedigree_parents'}{$sample_pair}, $info{'predicted_parents'}{$sample_pair}, $info{'parent_error'}{$sample_pair}; + + $out_lines{$out_line}++; } + } } +printf $out_fh join("\n", sort keys %out_lines); + $out_fh->close(); diff --git a/bin/qc_metrics.R b/bin/qc_metrics.R index 73e79d83321b74705cb00ebeaa1c63f9b1632e69..d5af02c82d427fc9ce338bc0e441e0b79533c858 100644 --- a/bin/qc_metrics.R +++ b/bin/qc_metrics.R @@ -1,6 +1,7 @@ x = read.table("multiqc_general_stats.txt", header=T, stringsAsFactors=F, sep="\t") -y = read.table("samples_not_on_run.txt", header=F, stringsAsFactors=F) -z = subset(x, !(x$Sample %in% y$V1)) +#y = read.table("samples_not_on_run.txt", header=F, stringsAsFactors=F) +#z = subset(x, !(x$Sample %in% y$V1)) +z = x print("Total reads < 50M") diff --git a/bin/extract_quad_FAM_PRO_ID.py b/bin/redo_extract_quad_FAM_PRO_ID.py similarity index 100% rename from bin/extract_quad_FAM_PRO_ID.py rename to bin/redo_extract_quad_FAM_PRO_ID.py diff --git a/bin/extract_shared_FAM_PRO_ID.py b/bin/redo_extract_shared_FAM_PRO_ID.py similarity index 100% rename from bin/extract_shared_FAM_PRO_ID.py rename to bin/redo_extract_shared_FAM_PRO_ID.py diff --git a/bin/redo_extract_solo_FAM_PRO_ID.py b/bin/redo_extract_solo_FAM_PRO_ID.py new file mode 100755 index 0000000000000000000000000000000000000000..c7a9451b59a53912c4180f39206c45fc17e54c61 --- /dev/null +++ b/bin/redo_extract_solo_FAM_PRO_ID.py @@ -0,0 +1,153 @@ +# input: the work folder which contains a PED subfolder where all family PED files were copied +# output: only for singletons +# solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt +# +# Author: MH +# last modified: SEPT 05, 2023 + + + +import sys +import os + + + +def go(work_dir): + + out_fam_file = work_dir + '/solo_FAM_IDs.txt' + out_pro_file = work_dir + '/solo_PRO_IDs.txt' + out_f_p_file = work_dir + '/solo_FAM_PRO.txt' + + out_fam_han = open(out_fam_file,'w') + out_pro_han = open(out_pro_file,'w') + out_f_p_han = open(out_f_p_file,'w') + + cntr_fam = 0 + cntr_pro = 0 + cntr_f_p = 0 + + + # go over the PED folder in the working dir and process each PED file + ped_dir = work_dir + '/PED' + print "" + print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir) + + + + for file in os.listdir(ped_dir): # per each PED file + if file.endswith(".ped"): + + print " %s" % (file) + in_file = os.path.join(ped_dir, file) + + # check if this PED has a single record - most likely a singleton + first_cntr_check = 0 + in_han = open(in_file,'r') + for line in in_han: + first_cntr_check += 1 + in_han.close() + + if first_cntr_check != 1: + print "WARNING: There are %s records in PED file = %s" % (first_cntr_check, file) + print " This family will not be treated as a singleton family!!!!" + continue + +# # check how many lines in the PED file - if more than one cannot be a singleton, ignore +# num_lines = 0 +# in_han = open(in_file,'r') +# for line in in_han: +# num_lines += 1 +# in_han.close() +# +# if num_lines > 1: +# #print "ERROR: found more than 1 individual in a singleton PED file = %s" % (in_file) +# #raise SystemExit +# print "WARNING: found more than 1 individual in PED file = %s, not a singleton, skipping...." 
% (in_file) +# continue +# if num_lines == 0: +# print "ERROR: empty PED file %s" % (file) +# raise SystemExit +# +# # if here, the PED file contains exactly one line +# # check if all fine: parents IDs = 0, kid with known sex and is affected + + + + # if here, there is exactly one record in the family PED file - keep checking and collect information + CHILD_ID = 0 + FAM_ID = 0 + + in_han = open(in_file,'r') + for line in in_han: + data = [x.strip() for x in line.strip().split('\t')] + + x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID + y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split + + if x_fam != y_fam: + print "ERROR: FAMILY_ID mismatch in %s" % (file) + print line + raise SystemExit + + FAM_ID = x_fam + CHILD_ID = y_indi + + # check both parent IDs == 0 + if (data[2] != '0') or (data[3] != '0'): + print "ERROR: found parent ID for a singleton child in %s" % (file) + print line + raise SystemExit + + # make sure the sex of the child is known + CHILD_SEX = int(data[4]) + if (CHILD_SEX == 1) or (CHILD_SEX == 2): + pass + else: + print "ERROR: proband sex unknown in %s" % (file) + print line + raise SystemExit + + # check kid is affected + if int(data[5]) != 2: + print "ERROR: singleton child not affected" + print line + raise SystemExit + + if FAM_ID == 0: + print "ERROR: Cannot find the FAMILY_ID in %s" % (file) + raise SystemExit + if CHILD_ID == 0: + print "ERROR: Cannot find CHILD_ID in %s" % (file) + raise SystemExit + else: + out_fam_han.write('%s\n' % (FAM_ID)) + cntr_fam += 1 + out_pro_han.write('%s\n' % (CHILD_ID)) + cntr_pro += 1 + out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID)) + cntr_f_p += 1 + + out_fam_han.close() + out_pro_han.close() + out_f_p_han.close() + + + + print "" + print "Singleton Families:" + print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) + print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) + print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) + print "" + + + + + +if __name__ == '__main__': + if len(sys.argv) == 2: + go(sys.argv[1]) + else: + print "Suggested use: time python /home/u035/u035/shared/scripts/redo_extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" + raise SystemExit +
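Like the script it replaces, the redo_* variant takes the work folder, scans <work_dir>/PED/*.ped, and writes its three lists back into <work_dir>; the behavioural difference is that it now warns, rather than silently skipping, when a PED file does not hold exactly one record. Invocation sketch, with ${PYTHON2} as used elsewhere in the pipeline:

```bash
WORK_DIR=/home/u035/u035/shared/analysis/work/<PROJECT_ID>
time ${PYTHON2} redo_extract_solo_FAM_PRO_ID.py ${WORK_DIR}
wc -l ${WORK_DIR}/solo_FAM_IDs.txt ${WORK_DIR}/solo_PRO_IDs.txt ${WORK_DIR}/solo_FAM_PRO.txt
```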
diff --git a/bin/redo_extract_trio_FAM_PRO_ID.py b/bin/redo_extract_trio_FAM_PRO_ID.py new file mode 100755 index 0000000000000000000000000000000000000000..3d02a30731bfbcb57fa52141d77907a4c431087d --- /dev/null +++ b/bin/redo_extract_trio_FAM_PRO_ID.py @@ -0,0 +1,134 @@ +# input: the work folder which contains a PED subfolder where all family PED files were copied +# output: only for trio families +# FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt +# +# Author: MH +# last modified: SEPT 05, 2023 + + + +import sys +import os + + + +def go(work_dir): + + out_fam_file = work_dir + '/FAM_IDs.txt' + out_pro_file = work_dir + '/PRO_IDs.txt' + out_f_p_file = work_dir + '/FAM_PRO.txt' + + out_fam_han = open(out_fam_file,'w') + out_pro_han = open(out_pro_file,'w') + out_f_p_han = open(out_f_p_file,'w') + + cntr_fam = 0 + cntr_pro = 0 + cntr_f_p = 0 + + + # go over the PED folder in the working dir and process each PED file + ped_dir = work_dir + '/PED' + print "" + print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID and FAM_PRO files" % (ped_dir) + + for file in os.listdir(ped_dir): # per each PED file + if file.endswith(".ped"): + print " %s" % (file) + in_file = os.path.join(ped_dir, file) + + # check if this PED has three records - most likely a trio + first_cntr_check = 0 + in_han = open(in_file,'r') + for line in in_han: + first_cntr_check += 1 + in_han.close() + + if first_cntr_check != 3: + print "WARNING: There are %s records in PED file = %s" % (first_cntr_check, file) + print " This family will not be treated as a trio family!!!" + continue + + # if here, there are exactly 3 records in the family PED file - keep checking and collect information + CHILD_ID = 0 + FAM_ID = 0 + + in_han = open(in_file,'r') + for line in in_han: + data = [x.strip() for x in line.strip().split('\t')] + + x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID + y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split + + if FAM_ID == 0: + FAM_ID = x_fam + elif FAM_ID != x_fam: + print "ERROR: more than one FAMILY_ID in %s" % (file) + raise SystemExit + + if data[2] != '0' and data[3] != '0': # this is the child in the trio + if CHILD_ID == 0: + CHILD_ID = y_indi + else: # seen another child + print "WARNING: already have seen a child (possibly a quad) in %s" % (file) + CHILD_ID = 0 + break + + CHILD_SEX = int(data[4]) + if (CHILD_SEX == 1) or (CHILD_SEX == 2): + pass + else: + print "ERROR: proband sex unknown" + print line + raise SystemExit + + if int(data[5]) != 2: + print "ERROR: child in a trio not affected" + print line + raise SystemExit + + elif data[2] == '0' and data[3] == '0': # this is a parent in the trio + if int(data[5]) == 2: + print "WARNING: affected parent %s in a trio - still processed as a trio, but no de novo analysis" % (y_indi) + print line + + + if FAM_ID == 0: + print "ERROR: Cannot find the FAMILY_ID in %s" % (file) + raise SystemExit + if CHILD_ID == 0: + print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file) + else: + out_fam_han.write('%s\n' % (FAM_ID)) + cntr_fam += 1 + out_pro_han.write('%s\n' % (CHILD_ID)) + cntr_pro += 1 + out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID)) + cntr_f_p += 1 + + in_han.close() + + out_fam_han.close() + out_pro_han.close() + out_f_p_han.close() + + + + print "" + print "Trio Families:" + print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) + print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) + print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) + print "" + + + + + +if __name__ == '__main__': + if len(sys.argv) == 2: + go(sys.argv[1]) + else: + print "Suggested use: time python /home/u035/u035/shared/scripts/redo_extract_trio_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" + raise SystemExit + diff --git a/bin/redo_generate_DEC_IGV_trio_scripts.py b/bin/redo_generate_DEC_IGV_trio_scripts.py index 96e12e72c7a200933ee23ade4ad408f7b13f9563..62cce17bbd21b06c11212571115e6f318bae82b3 100755 --- a/bin/redo_generate_DEC_IGV_trio_scripts.py +++ b/bin/redo_generate_DEC_IGV_trio_scripts.py @@ -56,7 +56,7 @@ ALL_MOM_DICT = {} # key: chr:pos:ref:alt; value: irrelevant ALL_DAD_DICT = {} # key: chr:pos:ref:alt; value: irrelevant -CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown' +CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo (parentage confirmed)' | 'Unknown' #+#SNAP_FLANK = 25 IGV_PAD = 25 @@ -240,7 +240,7 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir if is_denovo: if inher_stat == 'Unknown': - inher_stat = 'De novo constitutive' + inher_stat = 'De novo (parentage confirmed)' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit @@ -337,7 +337,7 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir if is_denovo: if inher_stat == 'Unknown': - inher_stat = 'De novo constitutive' + inher_stat = 'De novo (parentage confirmed)' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit @@ -433,7 +433,7 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir if is_denovo: if inher_stat == 'Unknown': - inher_stat = 'De novo constitutive' + inher_stat = 'De novo (parentage confirmed)' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit diff --git a/bin/redo_hunt_novel_shared_vars.py b/bin/redo_hunt_novel_shared_vars.py index 59881955fc7bcbb45b555f89720406c9e96d1e51..2948028be221a41a83680d89de4430b2a1edf0f2 100755 --- a/bin/redo_hunt_novel_shared_vars.py +++ b/bin/redo_hunt_novel_shared_vars.py @@ -21,7 +21,7 @@ # # # Author: MH -# last modified: AUG 24, 2023 +# last modified: SEPT 06, 2023 @@ -45,12 +45,14 @@ def go(old_DEC_DIR,new_DEC_DIR,FAMILY_ID): for file in os.listdir(new_DEC_DIR): if file.startswith("IGV"): continue + if file.find(FAMILY_ID) == -1: # DECIPHER upload file *not* for this family + continue + # DECIPHER upload file for this FAMILY_ID data = [x.strip() for x in file.strip().split('_')] if str(data[0]) not in INDIS: INDIS.append(str(data[0])) - # now, process each indivdual - + # now, process each individual from this family for INDI_ID in INDIS: INDI_FAM = "%s_%s" % (INDI_ID,FAMILY_ID) diff --git a/bin/redo_setup_quad.sh b/bin/redo_setup_quad.sh index 9ccbdb3623a4283ffb75a3feda7232e4a5f3af53..312a0ad9bfa31fc708a740a038ef5edaa5d616fd 100755 --- a/bin/redo_setup_quad.sh +++ b/bin/redo_setup_quad.sh @@ -114,7 +114,7 @@ echo "" ### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for quad* families ### ########################################################################################### -time ${PYTHON2} ${SCRIPTS_DIR}/extract_quad_FAM_PRO_ID.py ${WORK_DIR} +time ${PYTHON2} ${SCRIPTS_DIR}/redo_extract_quad_FAM_PRO_ID.py ${WORK_DIR} @@ -129,7 +129,7 @@ echo "Copying the the previous DECIPHER upload files ..." 
mkdir ${WORK_DIR}/previous_DECIPHER_upload while IFS=$'\t' read -r -a myArray do - old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/*${myArray[2]}* + old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/quad/*${myArray[2]}* cp ${old_DEC_FILE} ${WORK_DIR}/previous_DECIPHER_upload/ done < ${INPUT_DIR}/params/quad.txt #cp -r ${INPUT_DIR}/previous_DECIPHER_upload ${WORK_DIR} diff --git a/bin/redo_setup_shared.sh b/bin/redo_setup_shared.sh index fa8401e784f595f10f14da66bec8b0bef545fe1b..d36eda4234f0361080b041d0f15f63e852904dff 100755 --- a/bin/redo_setup_shared.sh +++ b/bin/redo_setup_shared.sh @@ -111,7 +111,7 @@ echo "" ### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for shared* families ### ########################################################################################### -time ${PYTHON2} ${SCRIPTS_DIR}/extract_shared_FAM_PRO_ID.py ${WORK_DIR} +time ${PYTHON2} ${SCRIPTS_DIR}/redo_extract_shared_FAM_PRO_ID.py ${WORK_DIR} @@ -126,7 +126,7 @@ echo "Copying the the previous DECIPHER upload files ..." mkdir ${WORK_DIR}/previous_DECIPHER_upload while IFS=$'\t' read -r -a myArray do - old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/*${myArray[2]}* + old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/shared_affected/*${myArray[2]}* cp ${old_DEC_FILE} ${WORK_DIR}/previous_DECIPHER_upload/ done < ${INPUT_DIR}/params/shared_affected.txt #cp -r ${INPUT_DIR}/previous_DECIPHER_upload ${WORK_DIR} diff --git a/bin/redo_setup_solo.sh b/bin/redo_setup_solo.sh index 67ef05ccc5a1503f4751daaf7804e90703647e93..fb10f0ac0787685a84403c72d2b767acfeabb610 100755 --- a/bin/redo_setup_solo.sh +++ b/bin/redo_setup_solo.sh @@ -8,7 +8,7 @@ # Author: MH -# last modified: AUG 29, 2023 +# last modified: AUG 31, 2023 ################################################################################################### @@ -106,7 +106,7 @@ echo "" ### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for singleton* families ### ########################################################################################### -time ${PYTHON2} ${SCRIPTS_DIR}/extract_solo_FAM_PRO_ID.py ${WORK_DIR} +time ${PYTHON2} ${SCRIPTS_DIR}/redo_extract_solo_FAM_PRO_ID.py ${WORK_DIR} @@ -119,7 +119,7 @@ echo "Copying the the previous DECIPHER upload files ..." mkdir ${WORK_DIR}/previous_DECIPHER_upload while IFS=$'\t' read -r -a myArray do - old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/*${myArray[2]}* + old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/singleton/*${myArray[2]}* cp ${old_DEC_FILE} ${WORK_DIR}/previous_DECIPHER_upload/ done < ${INPUT_DIR}/params/singleton.txt #cp -r ${INPUT_DIR}/previous_DECIPHER_upload ${WORK_DIR} diff --git a/bin/redo_setup_trio.sh b/bin/redo_setup_trio.sh index 587ae56a4bdd85e6988dbfa9ede738a1778e4c0f..ed290f3baf620f9ca1472ef7a879566d42df79a2 100755 --- a/bin/redo_setup_trio.sh +++ b/bin/redo_setup_trio.sh @@ -8,7 +8,7 @@ # Author: MH -# last modified: AUG 29, 2023 +# last modified: AUG 31, 2023 ########################################################################################### @@ -110,7 +110,7 @@ echo "" ### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ### ###################################################################################### -time ${PYTHON2} ${SCRIPTS_DIR}/extract_trio_FAM_PRO_ID.py ${WORK_DIR} +time ${PYTHON2} ${SCRIPTS_DIR}/redo_extract_trio_FAM_PRO_ID.py ${WORK_DIR} @@ -126,7 +126,7 @@ echo "Copying the the previous DECIPHER upload files ..." 
mkdir ${WORK_DIR}/previous_DECIPHER_upload while IFS=$'\t' read -r -a myArray do - old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/*${myArray[2]}* + old_DEC_FILE=${INPUT_DIR}/previous_DECIPHER_upload/trio/*${myArray[2]}* cp ${old_DEC_FILE} ${WORK_DIR}/previous_DECIPHER_upload/ done < ${INPUT_DIR}/params/trio.txt #cp -r ${INPUT_DIR}/previous_DECIPHER_upload ${WORK_DIR} @@ -135,7 +135,6 @@ echo "" - echo "" echo "" echo "OK: Setup for *trio* PROJECT_ID = $PROJECT_ID successful" diff --git a/bin/select_decipher_upload_files.pl b/bin/select_decipher_upload_files.pl index b16a529a97c5b856cf20855f9cd6ea421a5d4ff5..0ff0b5a548833838564be9f7430812b910919b69 100644 --- a/bin/select_decipher_upload_files.pl +++ b/bin/select_decipher_upload_files.pl @@ -3,6 +3,20 @@ use IO::File; use strict; +my %all; +my $all_fh = new IO::File; +$all_fh->open("../params/all.txt", "r") or die "Could not open ../params/all.txt\n$!"; + +while (my $line = <$all_fh>) +{ + chomp $line; + my ($project, $batch, $family) = split('\t', $line); + + $all{$family} = $batch; +} + +$all_fh->close(); + my %files; while (my $line = <>) { @@ -11,6 +25,16 @@ while (my $line = <>) my $filename = $tokens[-1]; my $family = $tokens[-2]; + if ($line =~ /20220418_reanalysis/) + { + my $bare_family = $family; + $bare_family =~ s/_shared//; + + my $new_family = $all{$bare_family} . "_" . $family; +# print "Mapping $family to $new_family\n"; + $family = $new_family; + } + # get the last modified time of the file my $fh = new IO::File; $fh->open($line, "r") or die "Could not open $line\n$!"; @@ -52,12 +76,17 @@ foreach my $family (keys %files) } my @times = sort { $a <=> $b } keys %{ $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]} }; - my $file_to_use = $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]}{$times[-1]}; - `mkdir -p $family/$indv`; - `cp $file_to_use $family/$indv/`; + my $file_to_use = $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]}{$times[0]}; + if (scalar(@times) > 0) + { + $file_to_use = $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]}{$times[-1]}; + } + +# `mkdir -p $family/$indv`; +# `cp $file_to_use $family/$indv/`; -# printf "$family\t$indv\tDECIPHER_v11\t$filenames[0]\t$times[-1]\t%s\n", $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]}{$times[-1]}; + printf "$family\t$indv\tDECIPHER_v11.xlsx\t$filenames[0]\t$times[-1]\t$file_to_use\n"; } elsif (exists($files{$family}{$indv}{"DECIPHER_v10.xlsx"})) { @@ -67,14 +96,18 @@ foreach my $family (keys %files) printf STDERR "More than one file for $family $indv DECIPHER_v10.xlsx: %s\n", join(", ", keys %{ $files{$family}{$indv}{"DECIPHER_v10.xlsx"} }); exit; } - + my @times = sort { $a <=> $b } keys %{ $files{$family}{$indv}{"DECIPHER_v10.xlsx"}{$filenames[0]} }; - my $file_to_use = $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]}{$times[-1]}; + my $file_to_use = $files{$family}{$indv}{"DECIPHER_v10.xlsx"}{$filenames[0]}{$times[0]}; + if (scalar(@times) > 0) + { + $file_to_use = $files{$family}{$indv}{"DECIPHER_v10.xlsx"}{$filenames[0]}{$times[-1]}; + } - `mkdir -p $family/$indv`; - `cp $file_to_use $family/$indv/`; +# `mkdir -p $family/$indv`; +# `cp $file_to_use $family/$indv/`; -# printf "$family\t$indv\tDECIPHER_v10\t$filenames[0]\t$times[-1]\t%s\n", $files{$family}{$indv}{"DECIPHER_v10.xlsx"}{$filenames[0]}{$times[-1]}; + printf "$family\t$indv\tDECIPHER_v10.xlsx\t$filenames[0]\t$times[-1]\t$file_to_use\n"; } elsif (exists($files{$family}{$indv}{"DEC_FLT.csv"})) { @@ -86,12 +119,17 @@ foreach my $family (keys %files) 
} my @times = sort { $a <=> $b } keys %{ $files{$family}{$indv}{"DEC_FLT.csv"}{$filenames[0]} }; - my $file_to_use = $files{$family}{$indv}{"DECIPHER_v11.xlsx"}{$filenames[0]}{$times[-1]}; - `mkdir -p $family/$indv`; - `cp $file_to_use $family/$indv/`; + my $file_to_use = $files{$family}{$indv}{"DEC_FLT.csv"}{$filenames[0]}{$times[0]}; + if (scalar(@times) > 0) + { + $file_to_use = $files{$family}{$indv}{"DEC_FLT.csv"}{$filenames[0]}{$times[-1]}; + } + +# `mkdir -p $family/$indv`; +# `cp $file_to_use $family/$indv/`; -# printf "$family\t$indv\tDECIPHER_v10\t$filenames[0]\t$times[-1]\t%s\n", $files{$family}{$indv}{"DEC_FLT.csv"}{$filenames[0]}{$times[-1]}; + printf "$family\t$indv\tDEC_FLT.csv\t$filenames[0]\t$times[-1]\t$file_to_use\n"; } } } diff --git a/bin/submit_trio_wes_rerun_vep.sh b/bin/submit_trio_wes_rerun_vep.sh index b01a032493893485a2adab96213b4983fbbed8d8..297488e0abc56d5bfcfa49bb5ddac0478c2392b4 100755 --- a/bin/submit_trio_wes_rerun_vep.sh +++ b/bin/submit_trio_wes_rerun_vep.sh @@ -43,7 +43,7 @@ export PATH=/mnt/e1000/home/u035/u035/shared/software/bcbio/anaconda/bin:"$PATH" -i $NOANN_VCF_FILE \ --fork 16 \ --species homo_sapiens \ ---no_stats \ +--stats_file ${VCF_FILE%.vcf.gz}.stats.txt --stats_text \ --cache \ --offline \ --dir /mnt/e1000/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ diff --git a/bin/trio_wes_prepare_bcbio_config_crf.sh b/bin/trio_wes_prepare_bcbio_config_crf.sh index c1c1d77a446b07312aee962fd66788707680a614..93e9605da48c031f6b235e5b0984bdfb304c1e2e 100755 --- a/bin/trio_wes_prepare_bcbio_config_crf.sh +++ b/bin/trio_wes_prepare_bcbio_config_crf.sh @@ -99,13 +99,16 @@ do COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"` perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/$PREFIX.yaml + perl -pi -e "s/\'\[/\[/" $CONFIG_DIR/${PREFIX}.yaml + perl -pi -e "s/\]\'/\]/" $CONFIG_DIR/${PREFIX}.yaml + perl -pi -e "s|\./results|$OUTPUT_DIR/${SHORT_PROJECT_ID}_${VERSION}/families|" $CONFIG_DIR/${PREFIX}.yaml rm -r $PREFIX mkdir -p ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/ mv ${PREFIX}.csv ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/ - mv ${PROJECT_ID}_${FAMILY_ID}.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/ + cp ${PROJECT_ID}_${FAMILY_ID}.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/ done -mv *.txt *.log *.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/ +cp *.txt *.log *.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/ diff --git a/bin/trio_wes_prepare_bcbio_config_singleton_from_duo.sh b/bin/trio_wes_prepare_bcbio_config_singleton_from_duo.sh index 03c7e4a5a30295cc374e778a5dee1824e52be056..1d36a7f6cfcd93544abf38cf693273383a702dc5 100755 --- a/bin/trio_wes_prepare_bcbio_config_singleton_from_duo.sh +++ b/bin/trio_wes_prepare_bcbio_config_singleton_from_duo.sh @@ -1,106 +1,49 @@ #!/bin/bash # -# trio_wes_prepare_bcbio_singleton_from_duo_config.sh <config.sh> <project_id> <params> +# trio_wes_prepare_bcbio_config_singleton_from_duo.sh <config.sh> <project_id> <version> # -# Assumes that reads for the samples are in the path -# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz, -# and that no samples other than those with reads are listed in the -# PED file. $READS_DIR is specified in the <config.sh> file. -# -# Assumes that the sample names in the PED file match those -# specifying the read directories with the addition of a specified -# suffix. -# # All samples must be annotated with sex (1=male, 2=female) in the # 5th column and phenotype (1=unaffected, 2=affected) in the 6th # column of the PED file. # -# Runs bcbio sample preparation and configuration file generation, -# assuming the template configuration file is at $BCBIO_TEMPLATE, -# specified in the <config.sh> file. -# -# Assumes bcbio is on the PATH (set in <config.sh>). +# Assumes the duo config and ped file exist with the original family names (i.e. no duo suffix). # CONFIG_SH=$1 PROJECT_ID=$2 -PARAMS=$3 +VERSION=$3 source $CONFIG_SH -# -# Create the file $PROJECT_ID.family_ids.txt -# cd $PARAMS_DIR -cat *.ped | cut -f 1 > $PROJECT_ID.family_ids.txt - SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'` - -COUNT=`wc -l ${PROJECT_ID}.family_ids.txt | awk '{ print $1 }'` +COUNT=`wc -l ${PROJECT_ID}.singleton_from_duo.txt | awk '{ print $1 }'` for ((i = 1; i <= $COUNT; i = i + 1)) do + FAMILY_ID=`head -n $i ${PROJECT_ID}.singleton_from_duo.txt | tail -n 1` + PROBAND_ID=`grep 2$ ${PROJECT_ID}_${FAMILY_ID}.ped | cut -f 2` - ORIG_PROJECT_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 1 -d '_'` - ORIG_VERSION=`head -n $i $PARAMS | tail -n 1 | cut -f 1 | cut -f 2 -d '_'` - BATCH_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 2` - FAMILY_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 3` - - SAMPLE=`cut -f 2 *_${FAMILY_ID}.ped` - SEX=`cut -f 5 *_${FAMILY_ID}.ped` - PHENOTYPE=`cut -f 6 *_${FAMILY_ID}.ped` - - PREFIX=${ORIG_PROJECT_ID}_${ORIG_VERSION}_${BATCH_ID}_${FAMILY_ID} - echo "samplename,description,batch,sex,phenotype,variant_regions" > ${PREFIX}.csv - len=`expr length $ORIG_PROJECT_ID` - - if [ $len -eq 5 ] - then - mkdir -p $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE - - for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_1.fastq.gz"` - do - newname=`basename $FILE | sed -e 's/_1_/_one_/'` - ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz - done - for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_2.fastq.gz"` - do - newname=`basename $FILE | sed -e 's/_1_/_one_/'` - ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz - done - for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_1.fastq.gz"` - do - newname=`basename $FILE | sed -e 's/_2_/_two_/'` - ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz - done - for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_2.fastq.gz"` - do - newname=`basename $FILE | sed -e 's/_2_/_two_/'` - ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz - done - - for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz` - do - echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv - done - - else - for FILE in `ls $DOWNLOAD_DIR/$ORIG_PROJECT_ID*/*${SAMPLE}*.gz` - do - echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv - done - fi - - bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${PREFIX}.csv + # rename the original PED file with duo suffix + mv ${PROJECT_ID}_${FAMILY_ID}.ped ${PROJECT_ID}_${FAMILY_ID}duo.ped - mv ${PREFIX}-merged.csv ${PREFIX}.csv + # create the proband PED file - set parent ids to 0 + grep $PROBAND_ID ${PROJECT_ID}_${FAMILY_ID}duo.ped | awk '{ print $1 "\t" $2 "\t0\t0\t" $5 "\t" $6 }' > ${PROJECT_ID}_${FAMILY_ID}.ped - bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${PREFIX}.csv 
$READS_DIR/$PROJECT_ID/*_${FAMILY_ID}_R[12].fastq.gz + # move into the config folder and rename the original YAML file with duo suffix + cd $CONFIG_DIR + mv ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}.yaml ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}duo.yaml - mv ${PREFIX}/config/${PREFIX}.yaml $CONFIG_DIR/ + # create the singleton config file from the duo file + head -n 1 ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}duo.yaml > ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}.yaml + affected=`grep -n 'phenotype: 2' ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}duo.yaml | cut -f 1 -d ':'` + head -n $((affected + 1)) ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}duo.yaml | tail -n 30 >> ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}.yaml + tail -n 6 ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}duo.yaml >> ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}.yaml - perl -i -pe "s/${BATCH_ID}${FAMILY_ID}/${BATCH_ID}_${FAMILY_ID}/" $CONFIG_DIR/${PREFIX}.yaml + # change family id to add duo suffix for duo YAML + perl -pi -e "s/${FAMILY_ID}/${FAMILY_ID}duo/" ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}duo.yaml - rm -r ${PREFIX} + # move back to params folder + cd $PARAMS_DIR done diff --git a/bin/trio_whole_exome_config_illumina_twist.sh b/bin/trio_whole_exome_config_illumina_twist.sh new file mode 100644 index 0000000000000000000000000000000000000000..377c9a0ebfef473614da25fa96173d213ee1fd65 --- /dev/null +++ b/bin/trio_whole_exome_config_illumina_twist.sh @@ -0,0 +1,24 @@ +#!/usr/bin/bash +# +# Basic configuration options for trio WES pipeline +# + +# primary locations +BASE=/home/u035/u035/shared +SCRIPTS=$BASE/scripts/bin +DOWNLOAD_DIR=$BASE/data +OUTPUT_DIR=$BASE/results + +# resource locations +BCBIO_TEMPLATE=$BASE/scripts/assets/trio_whole_exome_bcbio_template.yaml +TARGET=$BASE/resources/exome_targets/hg38_Twist_ILMN_Exome_2.0_Plus_Panel_annotated_June2022.plus15bp.bed +REFERENCE_GENOME=$BASE/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa + +# temporary working files +PARAMS_DIR=$BASE/analysis/params +READS_DIR=$BASE/analysis/reads +CONFIG_DIR=$BASE/analysis/config +WORK_DIR=$BASE/analysis/work +LOGS_DIR=$BASE/analysis/logs + +export PATH=$BASE/software/bcbio/tools/bin:$PATH diff --git a/docs/SOP_reanalysis_preparation.md b/docs/SOP_reanalysis_preparation.md index fb425255595d4992eb8f5ca8e2e30778eb100b61..283938dd152298f356aba2f6ea26b22efa9338dd 100644 --- a/docs/SOP_reanalysis_preparation.md +++ b/docs/SOP_reanalysis_preparation.md @@ -200,9 +200,9 @@ cd .. 8. Copy DECIPHER upload files from previous analyses. If multiple files are possible, select in preference order from the most recent re-analysis (if available), or from the most recent version otherwise. -a. <INDI>_<FAM>_DECIPHER_v11.xlsx (the best) -b. <INDI>_<FAM>_DECIPHER_v10.xlsx (if we do not have v11 version) -c. <INDI>_<FAM>_DEC_FLT.csv (least preferred, if we do not have a xlsx version -> this was the case at the beginning of the NHS Trio WES service) +a. `<INDI>_<FAM>_DECIPHER_v11.xlsx` (the best) +b. `<INDI>_<FAM>_DECIPHER_v10.xlsx` (if we do not have v11 version) +c. `<INDI>_<FAM>_DEC_FLT.csv` (least preferred, if we do not have a xlsx version -> this was the case at the beginning of the NHS Trio WES service) Create a list of terms to exclude from path matches in `previous_DECIPHER_upload/exclude.txt`, e.g.