diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7cd56b326d9f0d3c9c8c0161c818b922e42aac7c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*_transfer_info_file.sh diff --git a/NHS_WES_extract_shared_vars.py b/NHS_WES_extract_shared_vars.py new file mode 100755 index 0000000000000000000000000000000000000000..83e94aa20a398592d7fc9aca1cf5a04c4bd1e409 --- /dev/null +++ b/NHS_WES_extract_shared_vars.py @@ -0,0 +1,131 @@ +# given a VCF file and a list of IDs (as in the #CHROM line) +# extract a subset VCF containing only variants which are not HOM REF in *all* the listed IDs +# +# +# Author: MH +# last modified: AUG 18, 2020 + + + + +import sys +import os +import gzip + + +ID_DICT = {} # key: ID, value: field index (0 = chr, 1 = pos, ....) + + + +def go(in_file,list_ids,out_file): + + ids = [x.strip() for x in list_ids.strip().split(',')] + print "Found %s IDs for which the shared variants are to be extracted" % (len(ids)) + print ids + + in_han = gzip.open(in_file,'r') + out_han = open(out_file,'w') + in_cntr = 0 + out_cntr = 0 + + for line in in_han: + if line.startswith('#CHROM'): + print line + # find the index for each ID + # potential problems - ID more than once in the VCF (will return only the first match) - should not occur in VCFs + # - ID not in the list + header_fields = [x.strip() for x in line.strip().split('\t')] + for i in ids: + try: + index = header_fields.index(i) + if i not in ID_DICT: + ID_DICT[i] = index + else: + print "The supplied list of IDs contain duplicates for %s" % (i) + raise SystemExit + except ValueError: + print "%s not found in %s" % (i,line) + raise SystemExit + + if len(ID_DICT) != len(ids): + print "Could not find the indexes for all IDs" + print ID_DICT + raise SystemExit + + # write the new line in out_file + new_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + for z in xrange(0,len(ids)): + new_header = new_header + '%s\t' % (ids[z]) + new_header = new_header[:-1] + '\n' + out_han.write(new_header) + continue + + if line.startswith('#'): + out_han.write(line) + continue + + # startng to read the variants + in_cntr += 1 + data = [x.strip() for x in line.strip().split('\t')] + desired = {} # key: id; value: the variant info + for i in ids: + ind = ID_DICT[i] + var = data[ind] + desired[i] = var + print desired + + out_cntr += 1 + + in_han.close() + out_han.close() + print "Read %s vars from the family file, wrote %s shared variants" % (in_cntr,out_cntr) + sys.stdout.flush() + + + +def temp(): + + AFF_PROBANDS = [] + + in_han = open(in_file,'r') + for line in in_han: + data = [x.strip() for x in line.strip().split('\t')] + pro_fam_id = data[1] + par_1 = int(data[2]) + par_2 = int(data[3]) + aff = int(data[5]) + + if (par_1 != 0) or (par_2 != 0): + print "ERROR: Found a proband with a parent" + print line + raise SystemExit(1) + + if aff != 2: + print "ERROR: Found unaffected proband" + print line + raise SystemExit(1) + + if pro_fam_id not in AFF_PROBANDS: + AFF_PROBANDS.append(pro_fam_id) + else: + print "ERROR: Found duplicate proband" + print line + raise SystemExit(1) + + in_han.close() + print "PED file checks: success" + print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file) + sys.stdout.flush() + + + + + + +if __name__ == '__main__': + if len(sys.argv) == 4: + go(sys.argv[1],sys.argv[2],sys.argv[3]) + else: + print "Suggested use: time python /home/u035/project/scripts/NHS_WES_extract_shared_vars.py in_vcf comma,sep,list,of,ids out_vcf" + raise SystemExit + diff --git a/submit_bcbio_wes_pilot.sh b/submit_bcbio_wes_pilot.sh deleted file mode 100755 index 67fedbceaef67fd90c88956a16379a204d32861b..0000000000000000000000000000000000000000 --- a/submit_bcbio_wes_pilot.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -#PBS -l walltime=48:00:00 -#PBS -l ncpus=16,mem=8gb -#PBS -q sgp -#PBS -N nhss_wes_bcbio -#PBS -j oe - -# enable running singletons -if [ -z $PBS_ARRAY_INDEX ] -then - if [ -z $INDEX ] - then - export PBS_ARRAY_INDEX=1 - else - export PBS_ARRAY_INDEX=$INDEX - fi -fi - -export PATH=$PATH:/home/u027/project/software/bcbio/tools/bin -BCBIO_CONFIG=/scratch/u027/project/analysis/wes_pilot/bcbio-1.1.5_test/config -BCBIO_WORK=/scratch/u027/project/analysis/wes_pilot/bcbio-1.1.5_test/work - -# Expects environment variables to be set -# BATCH - nhsswesXXX -# FAMILY_IDS - text file listing family ids - -FAMILY_ID=`head -n $PBS_ARRAY_INDEX $FAMILY_IDS | tail -n 1` - -CONFIG_FILE=$BCBIO_CONFIG/${BATCH}_${FAMILY_ID}.yaml -mkdir -p $BCBIO_WORK/$FAMILY_ID -cd $BCBIO_WORK/$FAMILY_ID - -bcbio_nextgen.py $CONFIG_FILE -n 16 -t local