Skip to content
Snippets Groups Projects
Commit fe372d2c authored by ameyner2's avatar ameyner2
Browse files

Edinburgh Genomics change of FASTQ folders

parent 489c77f8
No related branches found
No related tags found
No related merge requests found
......@@ -7,7 +7,7 @@
# and <project>.sample_ids.txt in the same folder.
#
# Assumes that reads for the samples are in the path
# $READS_DIR/<project_id>/raw_data/<date>/<sample><sample_suffix>/*.gz,
# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
# and that no samples other than those with reads are listed in the
# PED file. $READS_DIR is specified in the <config.sh> file.
#
......@@ -48,7 +48,7 @@ perl -pi -e 's/\r//' $PROJECT_ID.ped
# create reads directory for project
mkdir -p $READS_DIR/$PROJECT_ID
cat $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/file_list.tsv | \
cat $DOWNLOAD_DIR/$PROJECT_ID/*/file_list.tsv | \
perl $SCRIPTS/trio_whole_exome_create_parameter_files.pl \
--prefix ./$PROJECT_ID \
--ped $PROJECT_ID.ped \
......@@ -66,28 +66,28 @@ do
PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6`
# create symlinks for problematic filenames
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_1_*_1.fastq.gz`
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_1_*_1.fastq.gz`
do
newname=`echo $FILE | sed -e 's/_1_/_one_/'`
ln -s $FILE ${newname%1.fastq.gz}R1.fastq.gz
done
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_1_*_2.fastq.gz`
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_1_*_2.fastq.gz`
do
newname=`echo $FILE | sed -e 's/_1_/_one_/'`
ln -s $FILE ${newname%2.fastq.gz}R2.fastq.gz
done
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_2_*_1.fastq.gz`
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_2_*_1.fastq.gz`
do
newname=`echo $FILE | sed -e 's/_2_/_two_/'`
ln -s $FILE ${newname%1.fastq.gz}R1.fastq.gz
done
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_2_*_2.fastq.gz`
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_2_*_2.fastq.gz`
do
newname=`echo $FILE | sed -e 's/_2_/_two_/'`
ln -s $FILE ${newname%2.fastq.gz}R2.fastq.gz
done
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_R[1,2].fastq.gz`
for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_R[1,2].fastq.gz`
do
echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
done
......
#!/bin/bash
#
# prepare_bcbio_config.sh <config.sh> <project_id> <version> <sample_suffix>
#
# Given a <project_id>.ped file for a set of trios (families) in the
# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
# and <project_id>.sample_ids.txt in the same folder.
#
# Assumes that reads for the samples are in the path
# $DOWNLOAD_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
# and that no samples other than those with reads are listed in the
# PED file. $DOWNLOAD_DIR and $READS_DIR (the output folder passed to
# bcbio_prepare_samples.py) are specified in the <config.sh> file.
#
# Assumes that the sample names in the PED file match those
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#
# Positional arguments; see the usage line in the header comment above.
# Without at least the config file, project id and version every path
# built further down would be assembled from empty strings, so stop early.
if [[ $# -lt 3 ]]; then
  printf 'Usage: %s <config.sh> <project_id> <version> <sample_suffix>\n' \
    "${0##*/}" >&2
  exit 2
fi
CONFIG_SH=$1
PROJECT_ID=$2
VERSION=$3
# The suffix stays optional, as before: a missing 4th argument simply
# yields an empty SAMPLE_SUFFIX.
SAMPLE_SUFFIX=${4:-}
# Load the site-specific settings. The rest of this script reads
# PARAMS_DIR, READS_DIR, DOWNLOAD_DIR, SCRIPTS, TARGET, BCBIO_TEMPLATE and
# CONFIG_DIR without assigning them, so they are presumably defined there.
# Quoted so that a config path containing spaces is sourced as one file.
source "$CONFIG_SH"
#
# Create the files:
#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
#  ${PROJECT_ID}_${FAMILY_ID}.ped - select only the individuals in a given family,
#                                   prefix <family_id> with <pcr_plate_id> and
#                                   add suffix <family_id> to <individual_id>
#
# Refuse to continue when a required setting is missing: an unset
# PARAMS_DIR would make `cd` switch to $HOME, and unset READS_DIR /
# DOWNLOAD_DIR / SCRIPTS would silently turn into paths under "/".
: "${PARAMS_DIR:?PARAMS_DIR must be set (normally in the config file)}"
: "${READS_DIR:?READS_DIR must be set (normally in the config file)}"
: "${DOWNLOAD_DIR:?DOWNLOAD_DIR must be set (normally in the config file)}"
: "${SCRIPTS:?SCRIPTS must be set (normally in the config file)}"
# Everything below uses paths relative to the parameter directory, including
# an in-place edit, so a failed cd must not be ignored.
cd "$PARAMS_DIR" || exit 1
if [[ ! -f "$PROJECT_ID.ped" ]]; then
  printf 'error: PED file %s/%s.ped not found\n' "$PARAMS_DIR" "$PROJECT_ID" >&2
  exit 1
fi
# remove DOS newline characters if necessary
perl -pi -e 's/\r//' "$PROJECT_ID.ped"
# create reads directory for project
mkdir -p "$READS_DIR/$PROJECT_ID"
# Feed every file_list.tsv found two directory levels below the project's
# download folder to the parameter-file generator.
# --suffix receives the suffix as a single word when it is non-empty and,
# exactly as before, no value at all when it is empty.
cat "$DOWNLOAD_DIR/$PROJECT_ID"/*/*/file_list.tsv \
  | perl "$SCRIPTS/trio_whole_exome_create_parameter_files.pl" \
      --prefix "./$PROJECT_ID" \
      --ped "$PROJECT_ID.ped" \
      --suffix ${SAMPLE_SUFFIX:+"$SAMPLE_SUFFIX"}
# Build the bcbio sample sheet and YAML configuration for a family.
# NOTE(review): only the first line of the family list is read (`head -n 1`),
# so a single family is processed per run - confirm this is intended.
for FAMILY_ID in $(head -n 1 "${PROJECT_ID}.family_ids.txt"); do
  # Common stem of the CSV, the bcbio work folder and the YAML file.
  RUN_ID="${VERSION}_${PROJECT_ID}_${FAMILY_ID}"
  FAMILY_PED="${PROJECT_ID}_${FAMILY_ID}.ped"

  if [[ ! -f "$FAMILY_PED" ]]; then
    printf 'error: family PED file %s not found\n' "$FAMILY_PED" >&2
    exit 1
  fi

  echo "samplename,description,batch,sex,phenotype,variant_regions" > "${RUN_ID}.csv"

  # One CSV row per FASTQ file of every individual in the family PED file.
  # The file is read once, on descriptor 3 so that nothing inside the loop
  # can consume it through stdin; the `|| [[ -n ... ]]` clause keeps a final
  # line that lacks a trailing newline.
  while IFS= read -r -u 3 PED_LINE || [[ -n "$PED_LINE" ]]; do
    # Tab-separated PED columns: 2 = individual id, 5 = sex, 6 = phenotype.
    SAMPLE=$(cut -f 2 <<<"$PED_LINE")
    SEX=$(cut -f 5 <<<"$PED_LINE")
    PHENOTYPE=$(cut -f 6 <<<"$PED_LINE")
    # An empty sample id (e.g. from a blank line) would make the glob below
    # match the folders of every sample, so skip such rows.
    [[ -n "$SAMPLE" ]] || continue

    FOUND=0
    # Glob directly instead of parsing `ls`, so paths are never word-split.
    # [12] matches read 1 / read 2 only (the former [1,2] also matched ",").
    for FILE in "$DOWNLOAD_DIR/$PROJECT_ID"/*/*"${SAMPLE}"*/*_[12].fastq.gz; do
      # An unmatched glob stays as the literal pattern; ignore it.
      [[ -e "$FILE" || -L "$FILE" ]] || continue
      FOUND=1
      printf '%s\n' "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> "${RUN_ID}.csv"
    done
    if (( FOUND == 0 )); then
      printf 'warning: no FASTQ files found for sample %s\n' "$SAMPLE" >&2
    fi
  done 3< "$FAMILY_PED"

  # Prepare the reads under $READS_DIR/$PROJECT_ID. The step is expected to
  # leave <RUN_ID>-merged.csv next to the input CSV, which then replaces it.
  bcbio_prepare_samples.py --out "$READS_DIR/$PROJECT_ID" --csv "${RUN_ID}.csv" || {
    printf 'error: bcbio_prepare_samples.py failed for family %s\n' "$FAMILY_ID" >&2
    exit 1
  }
  mv -- "${RUN_ID}-merged.csv" "${RUN_ID}.csv" || exit 1

  # Second "_"-separated field of the family id (the id without its prefix).
  BARE_FAMILY_ID=$(cut -d '_' -f 2 <<<"$FAMILY_ID")

  # Generate the run configuration from the template for this family's reads.
  bcbio_nextgen.py -w template "$BCBIO_TEMPLATE" "${RUN_ID}.csv" \
    "$READS_DIR/$PROJECT_ID"/*_"${BARE_FAMILY_ID}"_R[12].fastq.gz || {
    printf 'error: bcbio_nextgen.py failed for family %s\n' "$FAMILY_ID" >&2
    exit 1
  }
  # Keep only the YAML; stop before the cleanup below if it cannot be moved,
  # otherwise the generated configuration would be deleted with the folder.
  mv -- "${RUN_ID}/config/${RUN_ID}.yaml" "$CONFIG_DIR/" || exit 1

  # In the YAML, replace the family id written without its first underscore
  # by the full family id (first occurrence on each line).
  COMPRESSED_ID=${FAMILY_ID/_/}
  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" "$CONFIG_DIR/${RUN_ID}.yaml"

  # Remove the work folder created by bcbio_nextgen.py.
  rm -r -- "$RUN_ID"
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment