diff --git a/docs/Setup_variant_prioritization.md b/docs/Setup_variant_prioritization.md index 2c282b084014f5603fa6be34bc892704dcdca23b..5f5cd18ce2fac73c9dc6b3c8914c1864e1c2e98d 100644 --- a/docs/Setup_variant_prioritization.md +++ b/docs/Setup_variant_prioritization.md @@ -52,41 +52,54 @@ types => {map {$_ => 1} qw(splice_donor_variant ... (line 145) ### Setting G2P in completely offline mode -All external datasets listed for the `af_from_vcf_keys flag` (gnomADe_GRCh38|gnomADg_r3.0_GRCh38) in: +All external datasets listed for the `af_from_vcf_keys` flag (gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38) in: * `/home/u035/u035/shared/scripts/process_NHS_WES_trio.sh` * `/home/u035/u035/shared/scripts/process_NHS_WES_aff_probands.sh` must be available locally (see below for downloading gnomADe and gnomADg datasets) -* gnomADg dataset (r3.0, downloaded 27/08/2020): `/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/variation/gnomad_genomes.vcf.gz` -* gnomADe dataset (r2.1, downloaded 13/09/2019): `/home/u035/u035/shared/resources/gnomad/r2.1/exomes/` +* gnomADg dataset (r3.1.1, downloaded 23/08/2021): `/home/u035/u035/shared/resources/gnomad/r3.1.1/genomes` +* gnomADe dataset (r2.1.1, downloaded 23/08/2021): `/home/u035/u035/shared/resources/gnomad/r2.1.1/exomes` + +To re-fetch the gnomADg dataset: +``` +U> cd /home/u035/u035/shared/resources/gnomad/r3.1.1/genomes +U> for i in {1..22} X Y +U> do +U> wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr${i}.vcf.bgz +U> wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr${i}.vcf.bgz.tbi +U> done +``` To re-fetch the gnomADe dataset: ``` -U> cd /home/u035/u035/shared/resources/gnomad/r2.1/exomes -U> wget ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/gnomad/r2.1/exomes/*.gz -U> wget 
ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/gnomad/r2.1/exomes/*.tbi +U> cd /home/u035/u035/shared/resources/gnomad/r2.1.1/exomes +U> for i in {1..22} X Y +U> do +U> wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.${i}.liftover_grch38.vcf.bgz +U> wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.${i}.liftover_grch38.vcf.bgz.tbi +U> done ``` Edit the `/home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0/Bio/EnsEMBL/Variation/DBSQL/vcf_config.json` file to update type (remote -> local) and filename_template (local path to datasets) variables for these local datasets. gnomADg (lines 138-143) ``` -"id": "gnomADg_r3.0_GRCh38", -"description": "Genome Aggregation Database genomes r3.0", +"id": "gnomADg_r3.1.1_GRCh38", +"description": "Genome Aggregation Database genomes r3.1.1", "species": "homo_sapiens", "assembly": "GRCh38", "type": "local", -"filename_template": "/home/u035/u035/shared/resources/gnomad/r3.0/genomes/gnomad.genomes.r3.0.sites.chr###CHR###_trimmed_info.vcf.bgz" +"filename_template": "/home/u035/u035/shared/resources/gnomad/r3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr###CHR###.vcf.bgz", ``` gnomADe (lines 199-204) ``` -"id": "gnomADe_GRCh38", -"description": "Genome Aggregation Database exomes r2.1", +"id": "gnomADe_r2.1.1_GRCh38", +"description": "Genome Aggregation Database exomes r2.1.1 liftover to GRCh38", "species": "homo_sapiens", "assembly": "GRCh38", "type": "local", -"filename_template": "/home/u035/u035/shared/resources/gnomad/r2.1/exomes/gnomad.exomes.r2.1.sites.grch38.chr###CHR###_noVEP.vcf.gz" +"filename_template": "/home/u035/u035/shared/resources/gnomad/r2.1.1/exomes/gnomad.exomes.r2.1.1.sites.###CHR###.liftover_grch38.vcf.bgz", ``` ## Parameter Values for G2P call @@ -99,7 +112,7 @@ Files 
VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep" # this points to ../share/ensembl-vep-100.4-0/vep -REFERENCE_GENOME=/home/u035/u035/shared/resources/hg38.fa +REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR @@ -120,9 +133,9 @@ time ${VEP} \ --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ - --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/genes_in_DDG2P.20201208.txt" \ + --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20201208.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ - --plugin G2P,file='/home/u035/u035/shared/resources/DDG2P.20201208.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},\ + --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20201208.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},\ log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} ``` @@ -154,7 +167,7 @@ E> gunzip -c DDG2P.orig.<date_downloaded>.csv.gz > DDG2P.orig.<date_downloaded>. 
* Remove entries with no allelic requirement listed (if any) * Split records (rows) with multiple (comma separated) allelic requirements; sort again * Save as `DDG2P.<date_downloaded>.csv` -* Copy `DDG2P.<date_downloaded>.csv` from `<datastore_work_folder>` to `<eddie_work_folder>` and to ultra at `/home/u035/u035/shared/resources` +* Copy `DDG2P.<date_downloaded>.csv` from `<datastore_work_folder>` to `<eddie_work_folder>` and to ultra at `/home/u035/u035/shared/resources/G2P` `DDG2P.20210706.csv` stats ``` @@ -194,12 +207,12 @@ After the updates of the resources for the coverage analysis are completed, upda to point to the updated files: ``` -TARGETS=/home/u035/u035/shared/resources/DDG2P.20210706.plus15bp.merged.bed -CLINVAR=/home/u035/u035/shared/resources/DDG2P.20210706.clinvar.20210626.plus15bp.txt +TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.plus15bp.merged.bed +CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.clinvar.20210626.plus15bp.txt echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}..." echo "Using ${TARGETS}" ---transcript_filter "gene_symbol in /home/u035/u035/shared/resources/genes_in_DDG2P.20210706.txt" ---plugin G2P,file='/home/u035/u035/shared/resources/DDG2P.20210706.csv',af_from_vcf... +--transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt" +--plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20210706.csv',af_from_vcf... 
``` ## Resources for coverage analysis @@ -210,7 +223,7 @@ Source: https://www.ncbi.nlm.nih.gov/projects/CCDS Date obtained: 28/02/2019 -Location at EPCC: /home/u035/u035/shared/resources +Location at EPCC: /home/u035/u035/shared/resources/exome_targets File name: CCDS.20180614.plus15bp.merged.bed @@ -229,18 +242,18 @@ E> bedtools merge -i CCDS.20180614.plus15bp.sorted.bed -c 4 -o distinct > CCDS.2 ### DD genes Dataset -Source: /home/u035/u035/shared/resources/genes_in_DDG2P.20210706.txt +Source: /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt Date obtained: 08/12/2020 -Location at EPCC: /home/u035/u035/shared/resources +Location at EPCC: /home/u035/u035/shared/resources/G2P File name: DDG2P.20210706.plus15bp.merged.bed From the CCDS BED file (above), extract a BED file for the DD genes ``` -U> cd /home/u035/u035/shared/resources +U> cd /home/u035/u035/shared/resources/G2P U> PYTHON=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7 -U> time $PYTHON /home/u035/u035/shared/scripts/extract_BED_CCDS_DDG2P.py CCDS.20180614.plus15bp.merged.bed genes_in_DDG2P.20210706.txt DDG2P.20210706.plus15bp.merged.bed +U> time $PYTHON /home/u035/u035/shared/scripts/extract_BED_CCDS_DDG2P.py ../exome_targets/CCDS.20180614.plus15bp.merged.bed genes_in_DDG2P.20210706.txt DDG2P.20210706.plus15bp.merged.bed Found 3553 unique gene names in genes_in_DDG2P.20210706.txt @@ -255,14 +268,14 @@ Source: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/ Date obtained: 06/07/2021 -Location at EPCC: /home/u035/u035/shared/resources +Location at EPCC: /home/u035/u035/shared/resources/clinvar File name: clinvar_20210626.P_LP.ACP.vcf Description of ClinVar VCF @ `https://www.ncbi.nlm.nih.gov/variation/docs/ClinVar_vcf_files/` ``` -U> cd /home/u035/u035/shared/resources +U> cd /home/u035/u035/shared/resources/clinvar U> wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20210626.vcf.gz U> wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20210626.vcf.gz.tbi @@ -280,7 +293,7 @@ U> grep '^#' clinvar_20210626.P_LP.chr.vcf > clinvar_20210626.P_LP.ACP.vcf && gr Source: DDG2P.20210706.plus15bp.merged.bed & 
clinvar_20210626.P_LP.ACP.vcf (see above) -Location at EPCC: /home/u035/u035/shared/resources +Location at EPCC: /home/u035/u035/shared/resources/G2P File name: DDG2P.20210706.clinvar.20210626.plus15bp.txt @@ -290,12 +303,12 @@ A BED file for all CCDS exons (15bp padded) found in the DD genes, annotated wit Use bedtools to count and record the number of P/LP variants per each interval ``` -U> cd /home/u035/u035/shared/resources +U> cd /home/u035/u035/shared/resources/G2P U> BEDTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/bedtools -U> $BEDTOOLS intersect -wa -c -a DDG2P.20210706.plus15bp.merged.bed -b clinvar_20210626.P_LP.ACP.vcf > DDG2P.20210706.clinvar.20210626.plus15bp.txt +U> $BEDTOOLS intersect -wa -c -a DDG2P.20210706.plus15bp.merged.bed -b ../clinvar/clinvar_20210626.P_LP.ACP.vcf > DDG2P.20210706.clinvar.20210626.plus15bp.txt # proportion of all P/LP variants with assertion criteria provided in DD genes -U> grep -v '^#' clinvar_20210626.P_LP.ACP.vcf | wc -l +U> grep -v '^#' ../clinvar/clinvar_20210626.P_LP.ACP.vcf | wc -l Total of 102341 ClinVar vars U> cat DDG2P.20210706.clinvar.20210626.plus15bp.txt | awk '{sum += $5} END {print sum}' @@ -325,7 +338,7 @@ Source: NHSS Date obtained: 25/09/2019 -Location at EPCC: /home/u035/u035/shared/resources +Location at EPCC: /home/u035/u035/shared/resources/blacklist File name: current_blacklist.txt @@ -335,7 +348,7 @@ This is a file which contains variant which were assessed by NHSS as safe to be Open the Excel file provided by NHSS and store the information in a tab-separated file named `blacklist.<date_received>.txt` with the format chr pos ref alt, adding the ‘chr’ prefix if necessary. Create a copy of the file named `current_blacklist.txt` which is looked for and used by `NHS_WES_filter_LQ_GT.py`. 
``` -U> cd /home/u035/u035/shared/resources +U> cd /home/u035/u035/shared/resources/blacklist U> nano blacklist.2019-11-27.txt U> cp blacklist.2019-11-27.txt current_blacklist.txt ``` @@ -346,7 +359,7 @@ Source: NHSS Date obtained: 25/09/2019 -Location at EPCC: /home/u035/u035/shared/resources +Location at EPCC: /home/u035/u035/shared/resources/trans_map File name: current_trans_map.txt @@ -355,7 +368,7 @@ Some of the VEP (v97) GRCh38 transcripts are not currently recognized by DECIPHE Open the Excel file provided by NHSS and store the information in a tab-separated file named `trans_map.<date_received>.txt` with the format `Unrecognized_transcript Replacement_transcript`. Create a copy of the file named `current_trans_map.txt` which is looked for and used by `NHS_WES_filter_LQ_GT.py`. ``` -U> cd /home/u035/u035/shared/resources +U> cd /home/u035/u035/shared/resources/trans_map U> nano trans_map.2019-11-27.txt U> cp trans_map.2019-11-27.txt current_trans_map.txt ``` @@ -367,7 +380,7 @@ Location at EPCC: /home/u035/u035/shared/software/bcbio/anaconda/bin/vase Parameter Values for VASE STRICT ``` -VASE=/home/u035/u035/shared/software/bin/vase +VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz OUT_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.strict.denovo.vcf @@ -391,18 +404,5 @@ time ${VASE} \ --ped ${PED_FILE} ``` -### Install IGV for snapshot generation - -Done 28/08/2020 - -``` -[mike_hala@ultra ~]$ cd /home/u035/u035/shared/software/ -[mike_hala@ultra software]$ wget https://data.broadinstitute.org/igv/projects/downloads/2.8/IGV_Linux_2.8.9.zip -[mike_hala@ultra software]$ unzip -l IGV_Linux_2.8.9.zip | less -[mike_hala@ultra software]$ unzip IGV_Linux_2.8.9.zip -``` - - -