#!/bin/bash
#
# trio_wes_prepare_bcbio_config_crf.sh
#
# prepare_bcbio_config_crf.sh <config.sh> <project_id> <version> <sample_suffix>
# 
# Adaptation of prepare_bcbio_config.sh for data from the Clinical
# Research Facility.
#
# Given a <project_id>.ped file for a set of trios (families) in the 
# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
# and <project>.sample_ids.txt in the same folder.
#
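# Assumes that the downloaded reads for the samples are in the path
# $DOWNLOAD_DIR/<project_id>/*.gz,
# and that no samples other than those with reads are listed in the
# PED file. $READS_DIR/<project_id> is used as the output folder for
# bcbio sample preparation. $DOWNLOAD_DIR and $READS_DIR are specified
# in the <config.sh> file.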
#
# Assumes that the sample names in the PED file match those 
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#

# Positional arguments (see the usage line in the header). The first
# three are required; without them every path built below would be
# assembled from empty strings.
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 <config.sh> <project_id> <version> <sample_suffix>" >&2
  exit 1
fi

CONFIG_SH=$1
PROJECT_ID=$2
VERSION=$3
# NOTE(review): SAMPLE_SUFFIX is assigned here but is not referenced by
# the rest of this script as far as visible; it is kept so that a
# sourced <config.sh> or later code can still read it - confirm.
SAMPLE_SUFFIX=$4

# Load site-specific settings into the current shell.
source "$CONFIG_SH"

# Fail early if a setting this script depends on is missing, rather
# than running file operations against paths built from empty values.
for REQUIRED_VAR in PARAMS_DIR READS_DIR DOWNLOAD_DIR SCRIPTS BCBIO_TEMPLATE CONFIG_DIR OUTPUT_DIR
do
  if [ -z "${!REQUIRED_VAR}" ]; then
    echo "ERROR: $REQUIRED_VAR is not set after sourcing '$CONFIG_SH'" >&2
    exit 1
  fi
done

#
# Create the files:
#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
#  ${PROJECT_ID}_<pcr_plate_id>_<family_id>.ped
#                             - select only the individuals in a given family,
#                               prefix <family_id> with <pcr_plate_id> and
#                               add suffix <family_id> to <individual_id>
#
# Work inside the parameters folder. Abort if it is unset or cannot be
# entered: every relative path below (including the in-place edit of
# the PED file and later removals) would otherwise act on whatever
# directory the script happened to be started from.
cd "${PARAMS_DIR:?PARAMS_DIR must be set in the config file}" || exit 1

if [ ! -f "$PROJECT_ID.ped" ]; then
  echo "ERROR: PED file '$PROJECT_ID.ped' not found in $PARAMS_DIR" >&2
  exit 1
fi

# remove DOS newline characters if necessary
perl -pi -e 's/\r//' "$PROJECT_ID.ped"

# create reads directory for project
mkdir -p "$READS_DIR/$PROJECT_ID" || exit 1

# generate the family_ids list: one <pcr_plate_id>_<family_id> per line,
# built from the 1st and 3rd '_'-separated fields of each downloaded
# read file name (files containing "Undetermined" are skipped).
# The file name is taken with ${...##*/} instead of a fixed '/' field
# number, so the result does not depend on how deep $DOWNLOAD_DIR is.
for READS_FILE in "$DOWNLOAD_DIR/$PROJECT_ID"/*.gz
do
  # an unmatched glob is left as the literal pattern; skip it
  [ -e "$READS_FILE" ] || continue
  printf '%s\n' "${READS_FILE##*/}"
done | grep -v Undetermined | cut -f 1,3 -d '_' | sort -u > "$PROJECT_ID.family_ids.txt"

if [ ! -s "$PROJECT_ID.family_ids.txt" ]; then
  echo "ERROR: no read files found in $DOWNLOAD_DIR/$PROJECT_ID" >&2
  exit 1
fi

# Write one PED file per family, named
# ${PROJECT_ID}_<pcr_plate_id>_<family_id>.ped. The plate id is taken
# from the same line as the family id, so projects spanning more than
# one plate still get correctly named files.
while IFS= read -r PLATE_FAMILY_ID
do
  PLATE_ID=${PLATE_FAMILY_ID%%_*}
  FAMILY_ID=${PLATE_FAMILY_ID#*_}

  # NOTE(review): grep matches the family id anywhere on the line, so
  # an id that is a substring of another id would select extra rows
  # unless the Perl script filters them - confirm.
  grep -- "$FAMILY_ID" "$PROJECT_ID.ped" | \
    perl "$SCRIPTS/add_plate_and_family_id_to_ped.pl" \
    --plate_id "${PLATE_ID}" --family_id "${FAMILY_ID}" \
    > "${PROJECT_ID}_${PLATE_ID}_${FAMILY_ID}.ped"
done < "$PROJECT_ID.family_ids.txt"

# Project id up to its first underscore; used to name bcbio outputs.
SHORT_PROJECT_ID=${PROJECT_ID%%_*}

# Folder that receives the per-family CSV and PED files.
PARAMS_OUT_DIR="${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params"

# For every <pcr_plate_id>_<family_id> entry: build the bcbio sample
# CSV, run bcbio sample preparation, generate the YAML configuration
# from the template, and archive the CSV and PED files.
# The list is read on file descriptor 3 so that commands inside the
# loop which read standard input cannot consume it.
while IFS= read -r FAMILY_ID <&3
do
  [ -n "$FAMILY_ID" ] || continue

  PREFIX=${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
  FAMILY_PED=${PROJECT_ID}_${FAMILY_ID}.ped

  if [ ! -f "$FAMILY_PED" ]; then
    echo "ERROR: PED file '$FAMILY_PED' not found" >&2
    exit 1
  fi

  echo "samplename,description,batch,sex,phenotype,variant_regions" > "$PREFIX.csv"

  # report the number of individuals in the family
  COUNT=$(wc -l < "$FAMILY_PED")
  echo $COUNT

  # One pass over the PED file: column 2 = individual id, column 5 =
  # sex, column 6 = phenotype. The "|| [ -n ... ]" also processes a
  # final line that lacks a trailing newline.
  while IFS= read -r PED_LINE || [ -n "$PED_LINE" ]
  do
    SAMPLE=$(printf '%s\n' "$PED_LINE" | cut -f 2)
    SEX=$(printf '%s\n' "$PED_LINE" | cut -f 5)
    PHENOTYPE=$(printf '%s\n' "$PED_LINE" | cut -f 6)

    # one CSV row per downloaded read file whose name contains the
    # individual id
    for FILE in "$DOWNLOAD_DIR/$PROJECT_ID"/*"${SAMPLE}"*.gz
    do
      # an unmatched glob is left as the literal pattern; skip it
      [ -e "$FILE" ] || continue
      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> "$PREFIX.csv"
    done

  done < "$FAMILY_PED"

  if ! bcbio_prepare_samples.py --out "$READS_DIR/$PROJECT_ID" --csv "$PREFIX.csv"; then
    echo "ERROR: bcbio_prepare_samples.py failed for $FAMILY_ID" >&2
    exit 1
  fi

  # replace the input CSV with the merged CSV written by the previous step
  mv "${PREFIX}-merged.csv" "$PREFIX.csv" || exit 1

  BARE_FAMILY_ID=$(echo "$FAMILY_ID" | cut -d '_' -f 2)

  if ! bcbio_nextgen.py -w template "$BCBIO_TEMPLATE" "$PREFIX.csv" \
      "$READS_DIR/$PROJECT_ID"/*_"${BARE_FAMILY_ID}"_R[12].fastq.gz; then
    echo "ERROR: bcbio_nextgen.py template generation failed for $FAMILY_ID" >&2
    exit 1
  fi

  mv "$PREFIX/config/$PREFIX.yaml" "$CONFIG_DIR/" || exit 1

  # Restore the family id in the generated YAML wherever it appears
  # with its first underscore removed (first occurrence per line).
  # NOTE(review): presumably the template step drops the underscore
  # from the batch name - confirm.
  COMPRESSED_ID=${FAMILY_ID/_/}

  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" "$CONFIG_DIR/$PREFIX.yaml"

  # the YAML has been moved out; remove the rest of the bcbio work folder
  rm -r -- "${PREFIX:?}"

  mkdir -p "$PARAMS_OUT_DIR/"
  mv "${PREFIX}.csv" "$PARAMS_OUT_DIR/"
  mv "$FAMILY_PED" "$PARAMS_OUT_DIR/"

done 3< "${PROJECT_ID}.family_ids.txt"

# Move the remaining text, log and PED files from the current folder
# into the run's params folder. Only files that actually exist are
# moved, so a missing category (e.g. no *.log files) does not make mv
# fail on a literal, unexpanded pattern; the destination is created
# here in case the per-family loop did not create it.
PARAMS_OUT_DIR="${OUTPUT_DIR:?OUTPUT_DIR must be set in the config file}/${SHORT_PROJECT_ID}_${VERSION}/params"
mkdir -p "$PARAMS_OUT_DIR" || exit 1

LEFTOVER_FILES=()
for LEFTOVER in *.txt *.log *.ped
do
  if [ -e "$LEFTOVER" ]; then
    LEFTOVER_FILES+=("$LEFTOVER")
  fi
done

if [ "${#LEFTOVER_FILES[@]}" -gt 0 ]; then
  mv -- "${LEFTOVER_FILES[@]}" "$PARAMS_OUT_DIR/"
fi