diff --git a/reanalysis_preparation.sh b/reanalysis_preparation.sh new file mode 100644 index 0000000000000000000000000000000000000000..217cfa671688c01adc053a8b12f434a6ff78349e --- /dev/null +++ b/reanalysis_preparation.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# +# Create the reanalysis folder named e.g. results/20220418_reanalysis, and create files for each +# of the family types, tab-delimited text format: +# project_version plate_id family_id (excluding plate id) +# +# Run this script in the reanalysis folder. Ensure that the environment variable project_id is set to +# the same name as the reanalysis folder. +# + +# Create symlinks for the families that don't require any re-processing +for file in quad.txt shared_affected.txt singleton.txt trio_affected_parent.txt trio.txt +do + count=`wc -l params/$file | awk '{ print $1 }'` + + for ((i = 1; i <= $count; i = i + 1)) + do + project=`head -n $i params/$file | tail -n 1 | cut -f 1` + family=`head -n $i params/$file | tail -n 1 | cut -f 3` + + cd families + family_dir=`ls ../../${project}/families | grep $family` + ln -s ../../${project}/families/$family_dir $family_dir + + cd ../params + ped=`ls ../../${project}/params/*.ped | grep $family` + ln -s $ped `basename $ped` + + cd .. + done +done + +# For the singletons from duos with unaffected parents that need to be re-generated, +# prepare appropriate PED files in the analysis/params folder to begin analysis. +cp singleton_from_duo.txt ../../analysis/params +cd ../../analysis/params + +count=`wc -l singleton_from_duo.txt | awk '{ print $1 }'` +file=singleton_from_duo.txt +for ((i = 1; i <= $count; i = i + 1)) +do + project=`head -n $i $file | tail -n 1 | cut -f 1` + family=`head -n $i $file | tail -n 1 | cut -f 3` + + ped=`ls ../../results/${project}/params/*.ped | grep $family` + grep 2$ $ped | awk '{ print $1 "\t" $2 "\t0\t0\t" $5 "\t" $6}' > `basename $ped` +done + +# Create a family ids list +cat *.ped | cut -f 1 | sort > $project_id.family_ids.txt diff --git a/trio_wes_prepare_bcbio_config_singleton_from_duo.sh b/trio_wes_prepare_bcbio_config_singleton_from_duo.sh new file mode 100755 index 0000000000000000000000000000000000000000..03c7e4a5a30295cc374e778a5dee1824e52be056 --- /dev/null +++ b/trio_wes_prepare_bcbio_config_singleton_from_duo.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# trio_wes_prepare_bcbio_singleton_from_duo_config.sh <config.sh> <project_id> <params> +# +# Assumes that reads for the samples are in the path +# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz, +# and that no samples other than those with reads are listed in the +# PED file. $READS_DIR is specified in the <config.sh> file. +# +# Assumes that the sample names in the PED file match those +# specifying the read directories with the addition of a specified +# suffix. +# +# All samples must be annotated with sex (1=male, 2=female) in the +# 5th column and phenotype (1=unaffected, 2=affected) in the 6th +# column of the PED file. +# +# Runs bcbio sample preparation and configuration file generation, +# assuming the template configuration file is at $BCBIO_TEMPLATE, +# specified in the <config.sh> file. +# +# Assumes bcbio is on the PATH (set in <config.sh>). +# + +CONFIG_SH=$1 +PROJECT_ID=$2 +PARAMS=$3 + +source $CONFIG_SH + +# +# Create the file $PROJECT_ID.family_ids.txt +# +cd $PARAMS_DIR + +cat *.ped | cut -f 1 > $PROJECT_ID.family_ids.txt + +SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'` + +COUNT=`wc -l ${PROJECT_ID}.family_ids.txt | awk '{ print $1 }'` + +for ((i = 1; i <= $COUNT; i = i + 1)) +do + + ORIG_PROJECT_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 1 -d '_'` + ORIG_VERSION=`head -n $i $PARAMS | tail -n 1 | cut -f 1 | cut -f 2 -d '_'` + BATCH_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 2` + FAMILY_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 3` + + SAMPLE=`cut -f 2 *_${FAMILY_ID}.ped` + SEX=`cut -f 5 *_${FAMILY_ID}.ped` + PHENOTYPE=`cut -f 6 *_${FAMILY_ID}.ped` + + PREFIX=${ORIG_PROJECT_ID}_${ORIG_VERSION}_${BATCH_ID}_${FAMILY_ID} + echo "samplename,description,batch,sex,phenotype,variant_regions" > ${PREFIX}.csv + len=`expr length $ORIG_PROJECT_ID` + + if [ $len -eq 5 ] + then + mkdir -p $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE + + for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_1.fastq.gz"` + do + newname=`basename $FILE | sed -e 's/_1_/_one_/'` + ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz + done + for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_2.fastq.gz"` + do + newname=`basename $FILE | sed -e 's/_1_/_one_/'` + ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz + done + for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_1.fastq.gz"` + do + newname=`basename $FILE | sed -e 's/_2_/_two_/'` + ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz + done + for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_2.fastq.gz"` + do + newname=`basename $FILE | sed -e 's/_2_/_two_/'` + ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz + done + + for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz` + do + echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv + done + + else + for FILE in `ls $DOWNLOAD_DIR/$ORIG_PROJECT_ID*/*${SAMPLE}*.gz` + do + echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv + done + fi + + bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${PREFIX}.csv + + mv ${PREFIX}-merged.csv ${PREFIX}.csv + + bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${PREFIX}.csv $READS_DIR/$PROJECT_ID/*_${FAMILY_ID}_R[12].fastq.gz + + mv ${PREFIX}/config/${PREFIX}.yaml $CONFIG_DIR/ + + perl -i -pe "s/${BATCH_ID}${FAMILY_ID}/${BATCH_ID}_${FAMILY_ID}/" $CONFIG_DIR/${PREFIX}.yaml + + rm -r ${PREFIX} +done