Skip to content
Snippets Groups Projects
Commit 1cfac4a4 authored by user name's avatar user name
Browse files

Reanalysis scripts for preparation with symlinks and creating singleton families from duos

parent 356edbb6
No related branches found
No related tags found
1 merge request!3Ultra 2 SOP/doc updates
#!/bin/bash
#
# Create the reanalysis folder named e.g. results/20220418_reanalysis, and create files for each
# of the family types, tab-delimited text format:
# project_version plate_id family_id (excluding plate id)
#
# Run this script in the reanalysis folder. Ensure that the environment variable project_id is set to
# the same name as the reanalysis folder.
#
# Create symlinks for the families that don't require any re-processing
for file in quad.txt shared_affected.txt singleton.txt trio_affected_parent.txt trio.txt
do
count=`wc -l params/$file | awk '{ print $1 }'`
for ((i = 1; i <= $count; i = i + 1))
do
project=`head -n $i params/$file | tail -n 1 | cut -f 1`
family=`head -n $i params/$file | tail -n 1 | cut -f 3`
cd families
family_dir=`ls ../../${project}/families | grep $family`
ln -s ../../${project}/families/$family_dir $family_dir
cd ../params
ped=`ls ../../${project}/params/*.ped | grep $family`
ln -s $ped `basename $ped`
cd ..
done
done
# For the singletons from duos with unaffected parents that need to be re-generated,
# prepare appropriate PED files in the analysis/params folder to begin analysis.
cp singleton_from_duo.txt ../../analysis/params
cd ../../analysis/params
count=`wc -l singleton_from_duo.txt | awk '{ print $1 }'`
file=singleton_from_duo.txt
for ((i = 1; i <= $count; i = i + 1))
do
project=`head -n $i $file | tail -n 1 | cut -f 1`
family=`head -n $i $file | tail -n 1 | cut -f 3`
ped=`ls ../../results/${project}/params/*.ped | grep $family`
grep 2$ $ped | awk '{ print $1 "\t" $2 "\t0\t0\t" $5 "\t" $6}' > `basename $ped`
done
# Create a family ids list
cat *.ped | cut -f 1 | sort > $project_id.family_ids.txt
#!/bin/bash
#
# trio_wes_prepare_bcbio_singleton_from_duo_config.sh <config.sh> <project_id> <params>
#
# Assumes that reads for the samples are in the path
# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
# and that no samples other than those with reads are listed in the
# PED file. $READS_DIR is specified in the <config.sh> file.
#
# Assumes that the sample names in the PED file match those
# specifying the read directories with the addition of a specified
# suffix.
#
# All samples must be annotated with sex (1=male, 2=female) in the
# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
# column of the PED file.
#
# Runs bcbio sample preparation and configuration file generation,
# assuming the template configuration file is at $BCBIO_TEMPLATE,
# specified in the <config.sh> file.
#
# Assumes bcbio is on the PATH (set in <config.sh>).
#
CONFIG_SH=$1
PROJECT_ID=$2
PARAMS=$3
source $CONFIG_SH
#
# Create the file $PROJECT_ID.family_ids.txt
#
cd $PARAMS_DIR
cat *.ped | cut -f 1 > $PROJECT_ID.family_ids.txt
SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
COUNT=`wc -l ${PROJECT_ID}.family_ids.txt | awk '{ print $1 }'`
for ((i = 1; i <= $COUNT; i = i + 1))
do
ORIG_PROJECT_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 1 -d '_'`
ORIG_VERSION=`head -n $i $PARAMS | tail -n 1 | cut -f 1 | cut -f 2 -d '_'`
BATCH_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 2`
FAMILY_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 3`
SAMPLE=`cut -f 2 *_${FAMILY_ID}.ped`
SEX=`cut -f 5 *_${FAMILY_ID}.ped`
PHENOTYPE=`cut -f 6 *_${FAMILY_ID}.ped`
PREFIX=${ORIG_PROJECT_ID}_${ORIG_VERSION}_${BATCH_ID}_${FAMILY_ID}
echo "samplename,description,batch,sex,phenotype,variant_regions" > ${PREFIX}.csv
len=`expr length $ORIG_PROJECT_ID`
if [ $len -eq 5 ]
then
mkdir -p $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE
for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_1.fastq.gz"`
do
newname=`basename $FILE | sed -e 's/_1_/_one_/'`
ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
done
for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_2.fastq.gz"`
do
newname=`basename $FILE | sed -e 's/_1_/_one_/'`
ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
done
for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_1.fastq.gz"`
do
newname=`basename $FILE | sed -e 's/_2_/_two_/'`
ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
done
for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_2.fastq.gz"`
do
newname=`basename $FILE | sed -e 's/_2_/_two_/'`
ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
done
for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz`
do
echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv
done
else
for FILE in `ls $DOWNLOAD_DIR/$ORIG_PROJECT_ID*/*${SAMPLE}*.gz`
do
echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv
done
fi
bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${PREFIX}.csv
mv ${PREFIX}-merged.csv ${PREFIX}.csv
bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${PREFIX}.csv $READS_DIR/$PROJECT_ID/*_${FAMILY_ID}_R[12].fastq.gz
mv ${PREFIX}/config/${PREFIX}.yaml $CONFIG_DIR/
perl -i -pe "s/${BATCH_ID}${FAMILY_ID}/${BATCH_ID}_${FAMILY_ID}/" $CONFIG_DIR/${PREFIX}.yaml
rm -r ${PREFIX}
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment