Newer
Older
#!/bin/bash
#PBS -l walltime=01:00:00
#PBS -l ncpus=1,mem=2gb
#PBS -q uv2000
#PBS -N down_setup
#PBS -j oe
### Set up the folder structure for the downstream analysis ###
# --- Tooling used further down in this script -------------------------------
# Python 2.7 interpreter and the folder holding the helper scripts.
PYTHON2="/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7"
SCRIPTS_DIR="/home/u035/u035/shared/scripts"

# --- Per-project working area ------------------------------------------------
# PROJECT_ID is not assigned in this section; it is expected to be provided
# from outside (presumably via the job environment - confirm with the caller).
BASE="/scratch/u035/u035/shared/analysis/wes_pilot"
WORK_DIR="${BASE}/${PROJECT_ID}"

# --- Subfolders of the working area, one per kind of output ------------------
VCF_DIR="${WORK_DIR}/VCF"
PED_DIR="${WORK_DIR}/PED"
LOG_DIR="${WORK_DIR}/LOG"
G2P_DIR="${WORK_DIR}/G2P"
VASE_DIR="${WORK_DIR}/VASE"
COV_DIR="${WORK_DIR}/COV"
CNV_DIR="${WORK_DIR}/CNV"
DEC_DIR="${WORK_DIR}/DECIPHER"
# IGV lives inside the DECIPHER folder, so it must be defined after DEC_DIR.
IGV_DIR="${DEC_DIR}/IGV"
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Both settings must be supplied by the caller. An empty PROJECT_ID makes
# WORK_DIR resolve to BASE itself, and an empty SOURCE_DIR makes the source
# paths below resolve relative to "/", so stop before touching the disk.
if [[ -z "${PROJECT_ID}" || -z "${SOURCE_DIR}" ]]; then
  echo "ERROR: PROJECT_ID and SOURCE_DIR must both be set (PROJECT_ID='${PROJECT_ID}', SOURCE_DIR='${SOURCE_DIR}')" >&2
  exit 1
fi

# check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting
# Exit with a non-zero status so the scheduler and any dependent jobs see the failure.
if [[ -d "${WORK_DIR}" ]]; then
  echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!" >&2
  exit 1
fi

echo "PROJECT_ID = ${PROJECT_ID}" # this is the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "SOURCE_DIR = ${SOURCE_DIR}" # SOURCE_DIR is the general path to the source VCF, BAM and PED files

S_PED_DIR=${SOURCE_DIR}/../params # requires that the PED files are in this folder

# create the working dir and the required subfolders
# Parents are listed before their children (WORK_DIR first, DEC_DIR before
# IGV_DIR). Plain mkdir (no -p) is used on purpose so that a missing parent or
# an already existing folder is reported instead of being silently accepted.
for NEW_DIR in \
  "${WORK_DIR}" \
  "${VCF_DIR}" \
  "${PED_DIR}" \
  "${LOG_DIR}" \
  "${G2P_DIR}" \
  "${VASE_DIR}" \
  "${COV_DIR}" \
  "${DEC_DIR}" \
  "${IGV_DIR}" \
  "${CNV_DIR}"; do
  if ! mkdir -- "${NEW_DIR}"; then
    echo "ERROR: could not create ${NEW_DIR}" >&2
    exit 1
  fi
done

echo "Created ${WORK_DIR} for this batch and all the required subfolders"
######################################################
### Copy the VCF and PED file per each family      ###
######################################################

# parse_vcf_dir_name NAME
#   Split a source folder name of the form <date>_<batch>_<family id>, where
#   the family id may itself contain underscores.
#   Globals:   BATCH  (written) - the second '_'-separated field
#              FAM_ID (written) - everything after the second '_', unchanged
#   Arguments: $1 - folder name without any leading path
#   Returns:   0 on success; 1 (with BATCH and FAM_ID empty) when the name has
#              no family id component
parse_vcf_dir_name() {
  local name=$1
  local rest

  BATCH=""
  FAM_ID=""

  # Need two underscores followed by at least one character.
  [[ "${name}" == *_*_?* ]] || return 1

  rest=${name#*_}     # drop "<date>_"
  BATCH=${rest%%_*}   # up to the next underscore
  FAM_ID=${rest#*_}   # the remainder, kept exactly as it appears in the name
}

# Expand the pattern into an array so that each match stays one word even if
# the path contains whitespace.
SOURCE_VCF_DIRS=( "${SOURCE_DIR}"/????-??-??_* )
COPY_ERRORS=0

echo "Found the following source VCF folders"
for S_VCF_DIR in "${SOURCE_VCF_DIRS[@]}"; do
  # When nothing matches, bash leaves the pattern itself in the list; skip it
  # (and any stray non-directory match) instead of trying to copy from it.
  if [[ ! -d "${S_VCF_DIR}" ]]; then
    echo " WARNING: skipping ${S_VCF_DIR} - not a directory (no folder matched the pattern?)" >&2
    continue
  fi

  VCF_DIR_NAME="${S_VCF_DIR##*/}"
  if ! parse_vcf_dir_name "${VCF_DIR_NAME}"; then
    echo " ERROR: cannot derive batch and family id from folder name ${VCF_DIR_NAME}" >&2
    COPY_ERRORS=$((COPY_ERRORS + 1))
    continue
  fi

  S_VCF_FILE=${S_VCF_DIR}/${FAM_ID}-gatk-haplotype-annotated.vcf.gz
  S_PED_FILE=${S_PED_DIR}/${BATCH}_${FAM_ID}.ped

  # Report "copied" only when cp actually succeeded.
  if cp -- "${S_VCF_FILE}" "${VCF_DIR}"; then
    echo " copied ${S_VCF_FILE} to ${VCF_DIR}"
  else
    echo " ERROR: failed to copy ${S_VCF_FILE} to ${VCF_DIR}" >&2
    COPY_ERRORS=$((COPY_ERRORS + 1))
  fi

  if cp -- "${S_PED_FILE}" "${PED_DIR}"; then
    echo " copied ${S_PED_FILE} to ${PED_DIR}"
  else
    echo " ERROR: failed to copy ${S_PED_FILE} to ${PED_DIR}" >&2
    COPY_ERRORS=$((COPY_ERRORS + 1))
  fi
done

# Stop here if any family could not be copied, so the following steps do not
# run on an incomplete set of input files.
if (( COPY_ERRORS > 0 )); then
  echo "ERROR: ${COPY_ERRORS} problem(s) while copying the VCF/PED files - EXIT!" >&2
  exit 1
fi
######################################################################################
### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
######################################################################################
# Run the helper script on the working directory and time it. The status is
# captured right away: $? after a timed command is the status of that command.
time "${PYTHON2}" "${SCRIPTS_DIR}/extract_trio_FAM_PRO_ID.py" "${WORK_DIR}"
EXTRACT_STATUS=$?

echo ""
echo ""

# Only claim success when the helper script actually succeeded; otherwise
# pass its exit status on so the job is recorded as failed.
if (( EXTRACT_STATUS != 0 )); then
  echo "ERROR: extract_trio_FAM_PRO_ID.py failed with exit status ${EXTRACT_STATUS} for PROJECT_ID = $PROJECT_ID" >&2
  exit "${EXTRACT_STATUS}"
fi

echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful"