Newer
Older
user name
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash
#
# trio_wes_prepare_bcbio_singleton_from_duo_config.sh <config.sh> <project_id> <params>
#
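# Assumes that the raw reads for the samples are found under
# $DOWNLOAD_DIR/<orig_project_id>*/ in *.gz files whose path contains
# the sample name, and that no samples other than those with reads are
# listed in the PED files. The output of bcbio_prepare_samples.py is
# written to $READS_DIR/<project_id>. $DOWNLOAD_DIR and $READS_DIR are
# expected to be set by the <config.sh> file.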
#
# Assumes that the sample names in the PED file match those
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#
# Positional arguments:
#   $1  config file to source
#   $2  project ID (names the family-ID list and the reads directory)
#   $3  parameter file, one tab-separated line per family:
#       <orig_project_id>_<orig_version> <TAB> <batch_id> <TAB> <family_id>
if (( $# < 3 )); then
  echo "Usage: $0 <config.sh> <project_id> <params>" >&2
  exit 1
fi
CONFIG_SH=$1
PROJECT_ID=$2
PARAMS=$3
source "$CONFIG_SH"
# These are never assigned in this script, so they must come from the
# config (or the environment). An empty value would send the file
# operations below to the wrong place (e.g. "cd" with no argument goes
# to $HOME), so stop early instead.
: "${PARAMS_DIR:?PARAMS_DIR is not set (expected from $CONFIG_SH)}"
: "${READS_DIR:?READS_DIR is not set (expected from $CONFIG_SH)}"
: "${DOWNLOAD_DIR:?DOWNLOAD_DIR is not set (expected from $CONFIG_SH)}"
: "${BCBIO_TEMPLATE:?BCBIO_TEMPLATE is not set (expected from $CONFIG_SH)}"
: "${CONFIG_DIR:?CONFIG_DIR is not set (expected from $CONFIG_SH)}"

# Symlink the FASTQ files of one lane/read combination of $SAMPLE.
#   $1  directory in which the links are created
#   $2  lane number as it appears in the file name (_<lane>_)
#   $3  word that replaces the lane number in the link name
#   $4  read number (1 or 2); the link name ends in _R<read>.fastq.gz
# Reads the globals DOWNLOAD_DIR, ORIG_PROJECT_ID and SAMPLE.
link_lane_reads() {
  local link_dir=$1 lane=$2 lane_word=$3 read_num=$4
  local fastq link_name
  # NUL-delimited find output keeps paths intact whatever characters
  # they contain.
  while IFS= read -r -d '' fastq; do
    link_name=${fastq##*/}
    # Only the first _<lane>_ is rewritten.
    link_name=${link_name/_${lane}_/_${lane_word}_}
    ln -s "$fastq" "$link_dir/${link_name%"${read_num}".fastq.gz}R${read_num}.fastq.gz"
  done < <(find "$DOWNLOAD_DIR/$ORIG_PROJECT_ID"* \
    -wholename "*$SAMPLE*/*_${lane}_*_${read_num}.fastq.gz" -print0)
}

#
# Create the file $PROJECT_ID.family_ids.txt
#
# Everything below uses paths relative to $PARAMS_DIR (including the
# final "rm -r"), so a failed cd must abort the script.
cd "$PARAMS_DIR" || { echo "ERROR: cannot cd to '$PARAMS_DIR'" >&2; exit 1; }
cat -- *.ped | cut -f 1 > "$PROJECT_ID.family_ids.txt"
SHORT_PROJECT_ID=$(echo "$PROJECT_ID" | cut -f 1 -d '_')
# The arithmetic expansion strips any padding wc may add to the count.
COUNT=$(( $(wc -l < "${PROJECT_ID}.family_ids.txt") ))

# One iteration per family: line i of $PARAMS describes the i-th family.
for ((i = 1; i <= COUNT; i++)); do
  # Fetch line i once and split it, rather than re-reading the file for
  # every field.
  PARAMS_LINE=$(head -n "$i" "$PARAMS" | tail -n 1)
  ORIG_PROJECT_ID=$(printf '%s\n' "$PARAMS_LINE" | cut -f 1 -d '_')
  ORIG_VERSION=$(printf '%s\n' "$PARAMS_LINE" | cut -f 1 | cut -f 2 -d '_')
  BATCH_ID=$(printf '%s\n' "$PARAMS_LINE" | cut -f 2)
  FAMILY_ID=$(printf '%s\n' "$PARAMS_LINE" | cut -f 3)

  # Sample name, sex and phenotype come from columns 2, 5 and 6 of the
  # family's PED file (expected to hold a single sample — singleton).
  SAMPLE=$(cut -f 2 -- *_"${FAMILY_ID}".ped)
  SEX=$(cut -f 5 -- *_"${FAMILY_ID}".ped)
  PHENOTYPE=$(cut -f 6 -- *_"${FAMILY_ID}".ped)

  PREFIX=${ORIG_PROJECT_ID}_${ORIG_VERSION}_${BATCH_ID}_${FAMILY_ID}
  echo "samplename,description,batch,sex,phenotype,variant_regions" > "${PREFIX}.csv"

  # NOTE(review): the 5-character test presumably distinguishes two
  # delivery layouts of the sequencing data — confirm.
  len=${#ORIG_PROJECT_ID}
  if (( len == 5 )); then
    # Per-lane files in per-sample directories: link them under
    # lane-tagged names (_one_/_two_) ending in _R1/_R2.
    SYMLINK_DIR=$READS_DIR/$PROJECT_ID/symlinks/$SAMPLE
    mkdir -p "$SYMLINK_DIR"
    link_lane_reads "$SYMLINK_DIR" 1 one 1
    link_lane_reads "$SYMLINK_DIR" 1 one 2
    link_lane_reads "$SYMLINK_DIR" 2 two 1
    link_lane_reads "$SYMLINK_DIR" 2 two 2
    # [12], not [1,2]: the latter also matches a literal comma.
    for FILE in "$SYMLINK_DIR"/*_R[12].fastq.gz; do
      # An unmatched glob stays literal; -L keeps dangling links listed.
      [[ -e "$FILE" || -L "$FILE" ]] || continue
      printf '%s\n' "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> "${PREFIX}.csv"
    done
  else
    # Otherwise the reads sit directly in the project download
    # directory with the sample name in the file name.
    for FILE in "$DOWNLOAD_DIR/$ORIG_PROJECT_ID"*/*"${SAMPLE}"*.gz; do
      [[ -e "$FILE" || -L "$FILE" ]] || continue
      printf '%s\n' "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> "${PREFIX}.csv"
    done
  fi

  bcbio_prepare_samples.py --out "$READS_DIR/$PROJECT_ID" --csv "${PREFIX}.csv"
  # Replace the input CSV with the -merged.csv left by the previous step.
  mv "${PREFIX}-merged.csv" "${PREFIX}.csv"
  bcbio_nextgen.py -w template "$BCBIO_TEMPLATE" "${PREFIX}.csv" \
    "$READS_DIR/$PROJECT_ID"/*_"${FAMILY_ID}"_R[12].fastq.gz
  mv "${PREFIX}/config/${PREFIX}.yaml" "$CONFIG_DIR/"
  # Re-insert the underscore between batch and family ID wherever the
  # YAML contains them run together.
  perl -i -pe "s/${BATCH_ID}${FAMILY_ID}/${BATCH_ID}_${FAMILY_ID}/" "$CONFIG_DIR/${PREFIX}.yaml"
  # Remove the working directory created by the template step.
  rm -r -- "${PREFIX:?}"
done