Edinburgh Genomics change of FASTQ folders

fe372d2c · ameyner2 · 489c77f8 · fe372d2c · fe372d2c
Commit fe372d2c authored 4 years ago by ameyner2
--- a/prepare_bcbio_config.sh
+++ b/prepare_bcbio_config.sh
@@ -7,7 +7,7 @@
 # and <project>.sample_ids.txt in the same folder.
 #
 # Assumes that reads for the samples are in the path
-# $READS_DIR/<project_id>/raw_data/<date>/<sample><sample_suffix>/*.gz,
+# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
 # and that no samples other than those with reads are listed in the 
 # PED file. $READS_DIR is specified in the <config.sh> file.
 #
@@ -48,7 +48,7 @@ perl -pi -e 's/\r//' $PROJECT_ID.ped
 # create reads directory for project
 mkdir -p $READS_DIR/$PROJECT_ID

-cat $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/file_list.tsv | \
+cat $DOWNLOAD_DIR/$PROJECT_ID/*/file_list.tsv | \
  perl $SCRIPTS/trio_whole_exome_create_parameter_files.pl \
    --prefix ./$PROJECT_ID \
    --ped $PROJECT_ID.ped \
@@ -66,28 +66,28 @@ do
    PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6`

    # create symlinks for problematic filenames
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_1_*_1.fastq.gz`
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_1_*_1.fastq.gz`
    do
      newname=`echo $FILE | sed -e 's/_1_/_one_/'`
      ln -s $FILE ${newname%1.fastq.gz}R1.fastq.gz
    done
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_1_*_2.fastq.gz`
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_1_*_2.fastq.gz`
    do
      newname=`echo $FILE | sed -e 's/_1_/_one_/'`
      ln -s $FILE ${newname%2.fastq.gz}R2.fastq.gz
    done
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_2_*_1.fastq.gz`
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_2_*_1.fastq.gz`
    do
      newname=`echo $FILE | sed -e 's/_2_/_two_/'`
      ln -s $FILE ${newname%1.fastq.gz}R1.fastq.gz
    done
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_2_*_2.fastq.gz`
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_2_*_2.fastq.gz`
    do
      newname=`echo $FILE | sed -e 's/_2_/_two_/'`
      ln -s $FILE ${newname%2.fastq.gz}R2.fastq.gz
    done

-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/*${SAMPLE}*/*_R[1,2].fastq.gz`
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_R[1,2].fastq.gz`
    do
      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
    done

--- a/prepare_bcbio_config_old_edge.sh
+++ b/prepare_bcbio_config_old_edge.sh
+#!/bin/bash
+#
+# prepare_bcbio_config_old_edge.sh <config.sh> <project_id> <version> <sample_suffix>
+# 
+# Given a <project_id>.ped file for a set of trios (families) in the 
+# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
+# and <project_id>.sample_ids.txt in the same folder.
+#
+# Assumes that reads for the samples are in the path
+# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
+# and that no samples other than those with reads are listed in the 
+# PED file. $READS_DIR is specified in the <config.sh> file.
+#
+# Assumes that the sample names in the PED file match those 
+# specifying the read directories with the addition of a specified
+# suffix.
+#
+# All samples must be annotated with sex (1=male, 2=female) in the
+# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
+# column of the PED file.
+#
+# Runs bcbio sample preparation and configuration file generation,
+# assuming the template configuration file is at $BCBIO_TEMPLATE,
+# specified in the <config.sh> file.
+#
+# Assumes bcbio is on the PATH (set in <config.sh>).
+#
+
+CONFIG_SH=$1
+PROJECT_ID=$2
+VERSION=$3
+SAMPLE_SUFFIX=$4
+
+source $CONFIG_SH
+
+#
+# Create the files:
+#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
+#  $PROJECT_ID.$FAMILY_ID.ped - select only the individuals in a given family, 
+#                               prefix <family_id> with <pcr_plate_id> and
+#                               add suffix <family_id> to <individual_id> 
+#
+cd $PARAMS_DIR
+
+# remove DOS newline characters if necessary
+perl -pi -e 's/\r//' $PROJECT_ID.ped
+
+# create reads directory for project
+mkdir -p $READS_DIR/$PROJECT_ID
+
+cat $DOWNLOAD_DIR/$PROJECT_ID/*/*/file_list.tsv | \
+  perl $SCRIPTS/trio_whole_exome_create_parameter_files.pl \
+    --prefix ./$PROJECT_ID \
+    --ped $PROJECT_ID.ped \
+    --suffix $SAMPLE_SUFFIX
+
+for FAMILY_ID in `head -n 1 ${PROJECT_ID}.family_ids.txt`
+do
+  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'`
+
+  for ((i=1; i<=$COUNT; i=i+1))
+  do
+    SAMPLE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 2`
+    SEX=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 5`
+    PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6`
+
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*${SAMPLE}*/*_[1,2].fastq.gz`
+    do
+      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+    done
+
+  done
+
+  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+
+  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+
+  BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2`
+
+  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
+
+  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/
+
+  COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"`
+
+  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
+
+  rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID}
+
+done