#!/bin/bash
#
# prepare_bcbio_config.sh <config.sh> <project_id> <version> <sample_suffix>
#
# Given a <project_id>.ped file for a set of trios (families) in the
# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
# and <project_id>.sample_ids.txt in the same folder.
#
# Assumes that reads for the samples are in the path
# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
# and that no samples other than those with reads are listed in the
# PED file. $READS_DIR is specified in the <config.sh> file.
#
# Assumes that the sample names in the PED file match those
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file. The PED file must be tab-delimited.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#
# Variables this script reads but never assigns (they must come from
# <config.sh> or the environment):
#   PARAMS_DIR, READS_DIR, DOWNLOAD_DIR, OUTPUT_DIR, CONFIG_DIR,
#   SCRIPTS, BCBIO_TEMPLATE  - required, checked below
#   TARGET           - written to the variant_regions CSV column
#   ORIG_PROJECT_ID  - prefix of the download folder(s) searched for
#                      reads; if unset, every entry under $DOWNLOAD_DIR
#                      is searched
#

CONFIG_SH=$1
PROJECT_ID=$2
VERSION=$3
SAMPLE_SUFFIX=$4

if [[ -z "$CONFIG_SH" || -z "$PROJECT_ID" || -z "$VERSION" ]]; then
  echo "Usage: $0 <config.sh> <project_id> <version> <sample_suffix>" >&2
  exit 1
fi

source "$CONFIG_SH" || exit 1

# Fail early on missing settings: an empty PARAMS_DIR would make `cd` go
# to $HOME, and an empty READS_DIR would point the final `rm -r` at
# /<project_id>/symlinks.
: "${PARAMS_DIR:?must be set in $CONFIG_SH}"
: "${READS_DIR:?must be set in $CONFIG_SH}"
: "${DOWNLOAD_DIR:?must be set in $CONFIG_SH}"
: "${OUTPUT_DIR:?must be set in $CONFIG_SH}"
: "${CONFIG_DIR:?must be set in $CONFIG_SH}"
: "${SCRIPTS:?must be set in $CONFIG_SH}"
: "${BCBIO_TEMPLATE:?must be set in $CONFIG_SH}"

#######################################
# Symlink one lane/read combination of a sample's FASTQ files into
# $READS_DIR/$PROJECT_ID/symlinks/<sample>/ under a rewritten name:
# the first "_<lane>_" in the basename becomes "_<lane_word>_" and the
# trailing "<read>.fastq.gz" becomes "R<read>.fastq.gz".
# NOTE(review): presumably the rename stops the lane token from being
# mistaken for a read-pair marker downstream - confirm.
# Globals:   DOWNLOAD_DIR, ORIG_PROJECT_ID, READS_DIR, PROJECT_ID (read)
# Arguments: $1 - sample name
#            $2 - lane number as it appears in the file name (1 or 2)
#            $3 - word replacing the lane number (one or two)
#            $4 - read number (1 or 2)
#######################################
link_lane_reads() {
  local sample=$1 lane=$2 lane_word=$3 read_no=$4
  local file newname

  # NUL-delimited so paths containing whitespace are not split.
  while IFS= read -r -d '' file; do
    newname=${file##*/}
    newname=${newname/_${lane}_/_${lane_word}_}
    ln -s "$file" \
      "$READS_DIR/$PROJECT_ID/symlinks/$sample/${newname%"${read_no}".fastq.gz}R${read_no}.fastq.gz"
  done < <(find "$DOWNLOAD_DIR/$ORIG_PROJECT_ID"* \
             -wholename "*${sample}*/*_${lane}_*_${read_no}.fastq.gz" -print0)
}

#
# Create the files:
#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
#  ${PROJECT_ID}_${FAMILY_ID}.ped - select only the individuals in a given family,
#                                   prefix <family_id> with <pcr_plate_id> and
#                                   add suffix <family_id> to <individual_id>
#
cd "$PARAMS_DIR" || exit 1

# remove DOS newline characters if necessary
perl -pi -e 's/\r//' "$PROJECT_ID.ped"

# create reads directory for project and symlink directory underneath
mkdir -p "$READS_DIR/$PROJECT_ID/symlinks"

cat "$DOWNLOAD_DIR/$PROJECT_ID"/raw_data/*/file_list.tsv \
  | perl "$SCRIPTS/trio_whole_exome_create_parameter_files.pl" \
      --prefix "./$PROJECT_ID" \
      --ped "$PROJECT_ID.ped" \
      --suffix "$SAMPLE_SUFFIX"

# everything before the first underscore of the project id
SHORT_PROJECT_ID=${PROJECT_ID%%_*}
OUT_PARAMS_DIR=${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params

mkdir -p "$OUT_PARAMS_DIR"

# Family ids are read on fd 3 so that commands in the loop body cannot
# consume the list through stdin.
while read -r -u 3 FAMILY_ID || [[ -n "$FAMILY_ID" ]]; do
  [[ -n "$FAMILY_ID" ]] || continue

  PREFIX=${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
  FAMILY_PED=${PROJECT_ID}_${FAMILY_ID}.ped

  printf '%s\n' "samplename,description,batch,sex,phenotype,variant_regions" > "${PREFIX}.csv"

  # One pass over the family PED file; the `|| [[ -n ... ]]` keeps a
  # final row that has no trailing newline.
  while IFS= read -r -u 4 PED_LINE || [[ -n "$PED_LINE" ]]; do
    # Tabs are mapped to a non-whitespace separator before splitting so
    # that empty columns are not collapsed (tab is IFS whitespace).
    IFS=$'\x1f' read -r _ SAMPLE _ _ SEX PHENOTYPE _ \
      <<<"${PED_LINE//$'\t'/$'\x1f'}"

    # An empty sample would turn the find pattern into "**/..." and
    # link every sample's reads, so blank/malformed rows are skipped.
    [[ -n "$SAMPLE" ]] || continue

    # create symlinks for problematic filenames
    mkdir -p "$READS_DIR/$PROJECT_ID/symlinks/$SAMPLE"
    link_lane_reads "$SAMPLE" 1 one 1
    link_lane_reads "$SAMPLE" 1 one 2
    link_lane_reads "$SAMPLE" 2 two 1
    link_lane_reads "$SAMPLE" 2 two 2

    for FILE in "$READS_DIR/$PROJECT_ID/symlinks/$SAMPLE"/*_R[12].fastq.gz; do
      # an unmatched glob stays literal; -L keeps dangling symlinks listed
      [[ -e "$FILE" || -L "$FILE" ]] || continue
      printf '%s\n' "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> "${PREFIX}.csv"
    done
  done 4< "$FAMILY_PED"

  bcbio_prepare_samples.py --out "$READS_DIR/$PROJECT_ID" --csv "${PREFIX}.csv"

  mv "${PREFIX}-merged.csv" "${PREFIX}.csv"

  # second underscore-separated field of the family id
  BARE_FAMILY_ID=$(printf '%s\n' "$FAMILY_ID" | cut -d '_' -f 2)

  bcbio_nextgen.py -w template "$BCBIO_TEMPLATE" "${PREFIX}.csv" \
    "$READS_DIR/$PROJECT_ID"/*_"${BARE_FAMILY_ID}"_R[12].fastq.gz

  mv "${PREFIX}/config/${PREFIX}.yaml" "$CONFIG_DIR/"

  # Restore the family id in the generated YAML: the first occurrence on
  # each line of the id without its first underscore is replaced by the
  # full id. \Q...\E keeps the id from being interpreted as a regex.
  # NOTE(review): presumably bcbio drops the underscore - confirm.
  COMPRESSED_ID=${FAMILY_ID/_/}
  perl -i -pe "s/\Q${COMPRESSED_ID}\E/${FAMILY_ID}/" "$CONFIG_DIR/${PREFIX}.yaml"

  # remove the working directory created by the template step
  rm -r "${PREFIX:?}"

  cp "${PREFIX}.csv" "$OUT_PARAMS_DIR/"
  cp "$FAMILY_PED" "$OUT_PARAMS_DIR/"

done 3< "${PROJECT_ID}.family_ids.txt"

# clean up symlinks temporary folder
rm -r "${READS_DIR:?}/${PROJECT_ID:?}/symlinks"