trio_wes_prepare_bcbio_config_singleton_from_duo.sh

#!/bin/bash
#
# trio_wes_prepare_bcbio_singleton_from_duo_config.sh <config.sh> <project_id> <params>
# 
# Assumes that reads for the samples are in the path
# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
# and that no samples other than those with reads are listed in the 
# PED file. $READS_DIR is specified in the <config.sh> file.
#
# Assumes that the sample names in the PED file match those 
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#

CONFIG_SH=$1
PROJECT_ID=$2
PARAMS=$3

source $CONFIG_SH

#
# Create the file $PROJECT_ID.family_ids.txt
#
cd $PARAMS_DIR

cat *.ped | cut -f 1 > $PROJECT_ID.family_ids.txt

SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`

COUNT=`wc -l ${PROJECT_ID}.family_ids.txt | awk '{ print $1 }'`

for ((i = 1; i <= $COUNT; i = i + 1))
do

    ORIG_PROJECT_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 1 -d '_'`
    ORIG_VERSION=`head -n $i $PARAMS | tail -n 1 | cut -f 1 | cut -f 2 -d '_'`
    BATCH_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 2`
    FAMILY_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 3`

    SAMPLE=`cut -f 2 *_${FAMILY_ID}.ped`
    SEX=`cut -f 5 *_${FAMILY_ID}.ped`
    PHENOTYPE=`cut -f 6 *_${FAMILY_ID}.ped`

    PREFIX=${ORIG_PROJECT_ID}_${ORIG_VERSION}_${BATCH_ID}_${FAMILY_ID}
    echo "samplename,description,batch,sex,phenotype,variant_regions" > ${PREFIX}.csv
    len=`expr length $ORIG_PROJECT_ID`

    if [ $len -eq 5 ]
    then
	mkdir -p $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE

	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_1.fastq.gz"`
	do
	    newname=`basename $FILE | sed -e 's/_1_/_one_/'`
	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
	done
	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_2.fastq.gz"`
	do
	    newname=`basename $FILE | sed -e 's/_1_/_one_/'`
	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
	done
	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_1.fastq.gz"`
	do
	    newname=`basename $FILE | sed -e 's/_2_/_two_/'`
	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
	done
	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_2.fastq.gz"`
	do
	    newname=`basename $FILE | sed -e 's/_2_/_two_/'`
	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
	done

	for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz`
	do
	    echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv
	done

    else
	for FILE in `ls $DOWNLOAD_DIR/$ORIG_PROJECT_ID*/*${SAMPLE}*.gz`
	do
	    echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv
	done
    fi

    bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${PREFIX}.csv

    mv ${PREFIX}-merged.csv ${PREFIX}.csv

    bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${PREFIX}.csv $READS_DIR/$PROJECT_ID/*_${FAMILY_ID}_R[12].fastq.gz

    mv ${PREFIX}/config/${PREFIX}.yaml $CONFIG_DIR/

    perl -i -pe "s/${BATCH_ID}${FAMILY_ID}/${BATCH_ID}_${FAMILY_ID}/" $CONFIG_DIR/${PREFIX}.yaml

    rm -r ${PREFIX}
done