Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
#
# prepare_bcbio_config.sh <config.sh> <project_id> <sample_suffix>
#
# Given a <project_id>.ped file for a set of trios (families) in the
# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
# and <project_id>.sample_ids.txt in the same folder.
#
# Assumes that reads for the samples are in the path
# $DOWNLOAD_DIR/<project_id>/raw_data/<date>/<sample><sample_suffix>/*.gz,
# and that no samples other than those with reads are listed in the
# PED file. $DOWNLOAD_DIR and $READS_DIR (where the prepared reads are
# written) are expected to be set by the <config.sh> file.
#
# Assumes that the sample names in the PED file match those
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#
# Print an error message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Create R1/R2-style symlinks next to one sample's lane-numbered reads.
#
# For every file matching
#   $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*<sample>*/*_<lane>_*_<mate>.fastq.gz
# (mate = 1 or 2) a symlink is created in the same directory whose name
# has the first "_<lane>_" replaced by "_<lane_word>_" and the trailing
# "<mate>.fastq.gz" replaced by "R<mate>.fastq.gz".
#
# Globals:   DOWNLOAD_DIR, PROJECT_ID (read)
# Arguments: $1 - sample id as listed in the PED file
#            $2 - lane digit embedded in the file name (e.g. 1)
#            $3 - word that replaces the digit in the link name (e.g. one)
link_lane_reads() {
  local sample=$1 lane=$2 lane_word=$3
  local mate file dir base link
  for mate in 1 2; do
    for file in "$DOWNLOAD_DIR/$PROJECT_ID"/raw_data/*/*"$sample"*/*_"$lane"_*_"$mate".fastq.gz; do
      # An unmatched glob stays literal; skip it instead of linking to nothing.
      [[ -e "$file" || -L "$file" ]] || continue
      dir=${file%/*}
      base=${file##*/}
      # Rewrite only the file name, never a directory component of the path.
      base=${base/_${lane}_/_${lane_word}_}
      link="$dir/${base%"${mate}.fastq.gz"}R${mate}.fastq.gz"
      # Never clobber something that already exists (e.g. on a re-run).
      [[ -e "$link" || -L "$link" ]] || ln -s -- "$file" "$link"
    done
  done
}

(( $# >= 2 )) || die "Usage: ${0##*/} <config.sh> <project_id> <sample_suffix>"

CONFIG_SH=$1
PROJECT_ID=$2
SAMPLE_SUFFIX=${3:-}
source "$CONFIG_SH"

# Every path below is built from these; an empty value would silently
# redirect the work to $HOME or the filesystem root.
for REQUIRED_VAR in PARAMS_DIR READS_DIR DOWNLOAD_DIR SCRIPTS BCBIO_TEMPLATE CONFIG_DIR; do
  [[ -n "${!REQUIRED_VAR:-}" ]] || die "$REQUIRED_VAR is not set; check $CONFIG_SH"
done

#
# Create the files:
#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
#  $PROJECT_ID.$FAMILY_ID.ped - select only the individuals in a given family,
#                               prefix <family_id> with <pcr_plate_id> and
#                               add suffix <family_id> to <individual_id>
#
cd "$PARAMS_DIR" || die "cannot cd to $PARAMS_DIR"

# remove DOS newline characters if necessary
perl -pi -e 's/\r//' "$PROJECT_ID.ped"

# create reads directory for project
mkdir -p "$READS_DIR/$PROJECT_ID"

cat "$DOWNLOAD_DIR/$PROJECT_ID"/raw_data/*/file_list.tsv \
  | perl "$SCRIPTS/trio_whole_exome_create_parameter_files.pl" \
      --prefix "./$PROJECT_ID" \
      --ped "$PROJECT_ID.ped" \
      --suffix "$SAMPLE_SUFFIX"

FAMILY_IDS_FILE="${PROJECT_ID}.family_ids.txt"
[[ -r "$FAMILY_IDS_FILE" ]] || die "missing $PARAMS_DIR/$FAMILY_IDS_FILE"

# The id list is read on fd 3 and the PED file on fd 4 so that the tools
# run inside the loops keep the script's own stdin.
while read -r FAMILY_ID <&3 || [[ -n "$FAMILY_ID" ]]; do
  [[ -n "$FAMILY_ID" ]] || continue

  FAMILY_PREFIX="${PROJECT_ID}_${FAMILY_ID}"
  SAMPLES_CSV="${FAMILY_PREFIX}.csv"

  printf '%s\n' "samplename,description,batch,sex,phenotype,variant_regions" > "$SAMPLES_CSV"

  # One pass over the family PED file: column 2 = sample id, 5 = sex,
  # 6 = phenotype (tab-separated).
  while IFS= read -r PED_LINE <&4 || [[ -n "$PED_LINE" ]]; do
    SAMPLE=$(cut -f 2 <<<"$PED_LINE")
    SEX=$(cut -f 5 <<<"$PED_LINE")
    PHENOTYPE=$(cut -f 6 <<<"$PED_LINE")
    # An empty sample id would make the globs below match every sample.
    [[ -n "$SAMPLE" ]] || continue

    # create symlinks for problematic filenames
    link_lane_reads "$SAMPLE" 1 one
    link_lane_reads "$SAMPLE" 2 two

    # One CSV row per R1/R2 file (including the symlinks created above).
    for FILE in "$DOWNLOAD_DIR/$PROJECT_ID"/raw_data/*/*"$SAMPLE"*/*_R[12].fastq.gz; do
      [[ -e "$FILE" || -L "$FILE" ]] || continue
      printf '%s\n' "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> "$SAMPLES_CSV"
    done
  done 4< "${FAMILY_PREFIX}.ped"

  bcbio_prepare_samples.py --out "$READS_DIR/$PROJECT_ID" --csv "$SAMPLES_CSV"

  mv -- "${FAMILY_PREFIX}-merged.csv" "$SAMPLES_CSV"

  # Second "_"-separated field of <pcr_plate_id>_<family_id>.
  BARE_FAMILY_ID=$(cut -d '_' -f 2 <<<"$FAMILY_ID")

  bcbio_nextgen.py -w template "$BCBIO_TEMPLATE" "$SAMPLES_CSV" \
    "$READS_DIR/$PROJECT_ID"/*_"${BARE_FAMILY_ID}"_R[12].fastq.gz

  mv -- "${FAMILY_PREFIX}/config/${FAMILY_PREFIX}.yaml" "$CONFIG_DIR/"

  # Replace the family id with its first "_" removed by the full id in the
  # generated YAML (presumably the generated config drops the underscore -
  # TODO confirm). The ids go through the environment and \Q...\E so they
  # are treated as literal text rather than Perl code or regex syntax.
  COMPRESSED_ID=${FAMILY_ID/_/}
  COMPRESSED_ID="$COMPRESSED_ID" FAMILY_ID="$FAMILY_ID" \
    perl -i -pe 's/\Q$ENV{COMPRESSED_ID}\E/$ENV{FAMILY_ID}/' \
    "$CONFIG_DIR/${FAMILY_PREFIX}.yaml"

  rm -r -- "$FAMILY_PREFIX"
done 3< "$FAMILY_IDS_FILE"