Setup_variant_prioritization.md

U> ls -l /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep/homo_sapiens_merged/100_GRCh38/<chr>
types        : SO consequence types to include ...		(line 84)
types => {map {$_ => 1} qw(splice_donor_variant ...		(line 145)
U> cd /home/u035/u035/shared/resources/gnomad/r3.1.1/genomes
U> for i in {1..22} X Y
U> do
U>   wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr${i}.vcf.bgz
U>   wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr${i}.vcf.bgz.tbi
U> done
U> cd /home/u035/u035/shared/resources/gnomad/r2.1.1/exomes
U> for i in {1..22} X Y
U> do
U>   wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.${i}.liftover_grch38.vcf.bgz
U>   wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.${i}.liftover_grch38.vcf.bgz.tbi
U> done
"id": "gnomADg_r3.1.1_GRCh38",
"description": "Genome Aggregation Database genomes r3.1.1",
"species": "homo_sapiens",
"assembly": "GRCh38",
"type": "local",
"filename_template": "/home/u035/u035/shared/resources/gnomad/r3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr###CHR###.vcf.bgz",
"id": "gnomADe_r2.1.1_GRCh38",
"description": "Genome Aggregation Database exomes r2.1.1 liftover to GRCh38",
"species": "homo_sapiens",
"assembly": "GRCh38",
"type": "local",
"filename_template": "/home/u035/u035/shared/resources/gnomad/r2.1.1/exomes/gnomad.exomes.r2.1.1.sites.###CHR###.liftover_grch38.vcf.bgz",
# Run VEP (cache version 100, offline, merged cache) with the G2P plugin on one
# family's cleaned VCF, writing the G2P text/HTML reports into a per-family
# log directory.
# VCF_DIR, G2P_DIR, PLATE_ID and FAMILY_ID are expected to be set before this point.

# VEP is launched through the bcbio anaconda perl. The value holds two words
# (interpreter + script), so ${VEP} is deliberately expanded unquoted below.
VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"
# this points to ../share/ensembl-vep-100.4-0/vep

REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa

IN_FILE="${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf"
G2P_LOG_DIR="${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR"
# -p: a re-run (the VEP call uses --force_overwrite) must not fail because the
# log directory is left over from an earlier run
mkdir -p "${G2P_LOG_DIR}"
TXT_OUT="${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt"
HTML_OUT="${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html"
# Keys name the VCF collections used for allele frequencies; they use the "id"
# values of the locally downloaded gnomAD collections (exomes r2.1.1 liftover,
# genomes r3.1.1).
# NOTE(review): confirm these ids against the VEP VCF collection config in use.
VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'

# shellcheck disable=SC2086 # VEP intentionally splits into "perl" + script path
time ${VEP} \
    -i "${IN_FILE}" \
    --output_file "${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_inter_out.txt" \
    --force_overwrite \
    --assembly GRCh38 \
    --fasta "${REFERENCE_GENOME}" \
    --offline \
    --merged \
    --use_given_ref \
    --cache --cache_version 100 \
    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
    --individual all \
    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20201208.txt" \
    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20201208.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys="${VCF_KEYS}",\
log_dir="${G2P_LOG_DIR}",txt_report="${TXT_OUT}",html_report="${HTML_OUT}"
E> cd <eddie_work_folder>
E> wget https://www.ebi.ac.uk/gene2phenotype/downloads/DDG2P.csv.gz
E> mv DDG2P.csv.gz DDG2P.orig.<date_downloaded>.csv.gz
E> gunzip -c DDG2P.orig.<date_downloaded>.csv.gz > DDG2P.orig.<date_downloaded>.csv
	records with biallelic requirement:				1339	# 1310 in 20201208
	records with monoallelic requirement:			873	# 911 in 20201208
	records with hemizygous requirement:			159	# 179	in 20201208
	records with x-linked dominant requirement:		46	# 44 in 20201208
	records with x-linked over-dominance requirement:	2	# 2 in 20201208
	records with digenic requirement:				1	(G2P ignores them)
	records with imprinted requirement:			10	(G2P ignores them)
	records with mitochondrial requirement:			2	(G2P ignores them)
	records with uncertain requirement:			0	(G2P ignores them)
	records with mosaic requirement:				12	(G2P ignores them)
	records with no allelic requirement:			3	(excluded)
E> cd <eddie_work_folder>
E> time python /exports/igmm/eddie/IGMM-VariantAnalysis/mike/scripts/extract_unique_genes.py DDG2P.<date_downloaded>.csv genes_in_DDG2P.<date_downloaded>.txt
Found 3553 unique gene names (incl.synonyms) in DDG2P.20210706.csv
recorded 3553 unique gene names (incl. synonyms); outfile = genes_in_DDG2P.20210706.txt
TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.plus15bp.merged.bed
CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.clinvar.20210626.plus15bp.txt
echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}..."
echo "Using ${TARGETS}"
--transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt"
--plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20210706.csv',af_from_vcf...
E> perl ../scripts/ccds_to_bed.pl
E> perl ../scripts/ccds_to_bed.pl -i CCDS.20180614.txt -o CCDS.20180614.bed
E> mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg38.chromInfo" > hg38.genome
E> bedtools slop -i CCDS.20180614.bed -b 15 -g hg38.genome > CCDS.20180614.plus15bp.bed
E> bedtools sort -i CCDS.20180614.plus15bp.bed -faidx /exports/igmm/eddie/bioinfsvice/ameynert/software/bcbio-1.0.7/genomes/Hsapiens/hg38/seq/hg38.fa.fai > CCDS.20180614.plus15bp.sorted.bed
E> bedtools merge -i CCDS.20180614.plus15bp.sorted.bed -c 4 -o distinct > CCDS.20180614.plus15bp.merged.bed
U> cd /home/u035/u035/shared/resources/G2P
U> PYTHON=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
U> time $PYTHON /home/u035/u035/shared/scripts/extract_BED_CCDS_DDG2P.py CCDS.20180614.plus15bp.merged.bed genes_in_DDG2P.20210706.txt DDG2P.20210706.plus15bp.merged.bed
Found 3553 unique gene names in genes_in_DDG2P.20210706.txt
Read 193346 records from the input BED file = CCDS.20180614.plus15bp.merged.bed
Wrote 33275 record for the DDG2P genes in the output BED file = DDG2P.20210706.plus15bp.merged.bed
Found intervals for 2156 uniq DDG2P genes
U> cd /home/u035/u035/shared/resources/clinvar
U> wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20210626.vcf.gz
U> wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20210626.vcf.gz.tbi

# grab the header and only variants annotated as Pathogenic or Likely Pathogenic
# (and not with conflicting interpretation)
U> zgrep '^#' clinvar_20210626.vcf.gz > clinvar_20210626.P_LP.vcf && zgrep -E 'CLNSIG=Likely_pathogenic;|CLNSIG=Pathogenic;' clinvar_20210626.vcf.gz >>  clinvar_20210626.P_LP.vcf

# need to add chr prefix in the clinvar_20210626.P_LP.vcf file
U> awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' clinvar_20210626.P_LP.vcf > clinvar_20210626.P_LP.chr.vcf

# Exclude variants with "no assertion criteria provided".
# grep -v keeps every line that lacks the pattern, which already includes the
# '#' header lines, so the header must not be copied separately first
# (doing both writes the VCF header twice).
U> grep -v 'CLNREVSTAT=no_assertion_criteria_provided' clinvar_20210626.P_LP.chr.vcf > clinvar_20210626.P_LP.ACP.vcf
U> cd /home/u035/u035/shared/resources/G2P
U> BEDTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/bedtools
U> $BEDTOOLS intersect -wa -c -a DDG2P.20210706.plus15bp.merged.bed -b ../clinvar/clinvar_20210626.P_LP.ACP.vcf > DDG2P.20210706.clinvar.20210626.plus15bp.txt

# proportion of all P/LP variants with assertion criteria provided in DD genes
U> grep -v '^#' ../clinvar/clinvar_20210626.P_LP.ACP.vcf | wc -l
Total of 102341 ClinVar vars

U> cat DDG2P.20210706.clinvar.20210626.plus15bp.txt | awk '{sum += $5} END {print sum}'
67623 of the ClinVar vars are in DD genes; 67623/102341 = 66% of all are in DD genes
U> cd /home/u035/u035/shared/resources/blacklist
U> nano blacklist.2019-11-27.txt
U> cp blacklist.2019-11-27.txt current_blacklist.txt
U> cd /home/u035/u035/shared/resources/trans_map
U> nano trans_map.2019-11-27.txt
U> cp trans_map.2019-11-27.txt current_trans_map.txt
# Run VASE in de novo mode on one family's VCF, using the family PED file.
# VCF_DIR, VASE_DIR, PED_DIR, BATCH_ID, PLATE_ID and FAMILY_ID are expected to
# be set before this point.
VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase

# Paths are quoted so that directory or id values containing whitespace or
# glob characters cannot split into several arguments.
IN_FILE="${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz"
OUT_FILE="${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.strict.denovo.vcf"
PED_FILE="${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped"

# The frequency, genotype-quality, depth and allele-balance thresholds below
# are passed to VASE unchanged; the --control_* options carry the separate,
# looser set of values.
time "${VASE}" \
    -i "${IN_FILE}" \
    -o "${OUT_FILE}" \
    --log_progress \
    --prog_interval 100000 \
    --freq 0.0001 \
    --gq 30 --dp 10 \
    --het_ab 0.3 \
    --max_alt_alleles 1 \
    --csq all \
    --biotypes all \
    --control_gq 15 --control_dp 5 \
    --control_het_ab 0.01 \
    --control_max_ref_ab 0.05 \
    --de_novo \
    --ped "${PED_FILE}"