Skip to content
Snippets Groups Projects
Commit c0508979 authored by ameyner2's avatar ameyner2
Browse files

Clean up pilot submission script, add shared vars script

parent 0849b8df
No related branches found
No related tags found
No related merge requests found
*_transfer_info_file.sh
# given a VCF file and a list of IDs (as in the #CHROM line)
# extract a subset VCF containing only variants which are not HOM REF in *all* the listed IDs
#
#
# Author: MH
# last modified: AUG 18, 2020
import sys
import os
import gzip
ID_DICT = {} # key: ID, value: field index (0 = chr, 1 = pos, ....)
def go(in_file,list_ids,out_file):
ids = [x.strip() for x in list_ids.strip().split(',')]
print "Found %s IDs for which the shared variants are to be extracted" % (len(ids))
print ids
in_han = gzip.open(in_file,'r')
out_han = open(out_file,'w')
in_cntr = 0
out_cntr = 0
for line in in_han:
if line.startswith('#CHROM'):
print line
# find the index for each ID
# potential problems - ID more than once in the VCF (will return only the first match) - should not occur in VCFs
# - ID not in the list
header_fields = [x.strip() for x in line.strip().split('\t')]
for i in ids:
try:
index = header_fields.index(i)
if i not in ID_DICT:
ID_DICT[i] = index
else:
print "The supplied list of IDs contain duplicates for %s" % (i)
raise SystemExit
except ValueError:
print "%s not found in %s" % (i,line)
raise SystemExit
if len(ID_DICT) != len(ids):
print "Could not find the indexes for all IDs"
print ID_DICT
raise SystemExit
# write the new line in out_file
new_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
for z in xrange(0,len(ids)):
new_header = new_header + '%s\t' % (ids[z])
new_header = new_header[:-1] + '\n'
out_han.write(new_header)
continue
if line.startswith('#'):
out_han.write(line)
continue
# startng to read the variants
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
desired = {} # key: id; value: the variant info
for i in ids:
ind = ID_DICT[i]
var = data[ind]
desired[i] = var
print desired
out_cntr += 1
in_han.close()
out_han.close()
print "Read %s vars from the family file, wrote %s shared variants" % (in_cntr,out_cntr)
sys.stdout.flush()
def temp():
AFF_PROBANDS = []
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
pro_fam_id = data[1]
par_1 = int(data[2])
par_2 = int(data[3])
aff = int(data[5])
if (par_1 != 0) or (par_2 != 0):
print "ERROR: Found a proband with a parent"
print line
raise SystemExit(1)
if aff != 2:
print "ERROR: Found unaffected proband"
print line
raise SystemExit(1)
if pro_fam_id not in AFF_PROBANDS:
AFF_PROBANDS.append(pro_fam_id)
else:
print "ERROR: Found duplicate proband"
print line
raise SystemExit(1)
in_han.close()
print "PED file checks: success"
print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file)
sys.stdout.flush()
if __name__ == '__main__':
if len(sys.argv) == 4:
go(sys.argv[1],sys.argv[2],sys.argv[3])
else:
print "Suggested use: time python /home/u035/project/scripts/NHS_WES_extract_shared_vars.py in_vcf comma,sep,list,of,ids out_vcf"
raise SystemExit
#!/bin/bash
#PBS -l walltime=48:00:00
#PBS -l ncpus=16,mem=8gb
#PBS -q sgp
#PBS -N nhss_wes_bcbio
#PBS -j oe
# enable running singletons
if [ -z $PBS_ARRAY_INDEX ]
then
if [ -z $INDEX ]
then
export PBS_ARRAY_INDEX=1
else
export PBS_ARRAY_INDEX=$INDEX
fi
fi
export PATH=$PATH:/home/u027/project/software/bcbio/tools/bin
BCBIO_CONFIG=/scratch/u027/project/analysis/wes_pilot/bcbio-1.1.5_test/config
BCBIO_WORK=/scratch/u027/project/analysis/wes_pilot/bcbio-1.1.5_test/work
# Expects environment variables to be set
# BATCH - nhsswesXXX
# FAMILY_IDS - text file listing family ids
FAMILY_ID=`head -n $PBS_ARRAY_INDEX $FAMILY_IDS | tail -n 1`
CONFIG_FILE=$BCBIO_CONFIG/${BATCH}_${FAMILY_ID}.yaml
mkdir -p $BCBIO_WORK/$FAMILY_ID
cd $BCBIO_WORK/$FAMILY_ID
bcbio_nextgen.py $CONFIG_FILE -n 16 -t local
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment