diff --git a/.gitignore b/.gitignore
index 0fb699fd11be7d4e2a8592eca29810dcb58dce5e..a03d9f5d1a6bf3e14705dc9dce22145501574a14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 *_transfer_info_file.sh
 *.o[1-9]*
+*#
+*~
diff --git a/G2P.pm b/G2P.pm
new file mode 100644
index 0000000000000000000000000000000000000000..9fe3124efd90f263500d5189c87e3cca5312fe33
--- /dev/null
+++ b/G2P.pm
@@ -0,0 +1,1474 @@
+=head1 LICENSE
+
+Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
+Copyright [2016-2020] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=head1 CONTACT
+
+ Ensembl <http://www.ensembl.org/info/about/contact/index.html>
+    
+=cut
+
+=head1 NAME
+
+ G2P
+
+=head1 SYNOPSIS
+
+ mv G2P.pm ~/.vep/Plugins
+ ./vep -i variations.vcf --plugin G2P,file=/path/to/G2P.csv
+
+=head1 DESCRIPTION
+
+ A VEP plugin that uses G2P allelic requirements to assess variants in genes
+ for potential phenotype involvement.
+
+ The plugin has multiple configuration options, though minimally requires only
+ the CSV file of G2P data.
+
+ Options are passed to the plugin as key=value pairs (defaults in parentheses):
+
+ file                  : Path to G2P data file. The file needs to be uncompressed.
+                         - Download from http://www.ebi.ac.uk/gene2phenotype/downloads
+                         - Download from PanelApp  
+
+ variant_include_list  : A list of variants to include even if they do not pass allele
+                         frequency filtering. The include list needs to be a sorted,
+                         bgzipped and tabixed VCF file.
+
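+                         For example, an include list could be prepared as follows
+                         (a sketch assuming bgzip and tabix from HTSlib are installed
+                         and include.vcf is an illustrative, coordinate-sorted VCF):
+
+                           bgzip include.vcf
+                           tabix -p vcf include.vcf.gz
+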
+ af_monoallelic        : maximum allele frequency for inclusion for monoallelic genes (0.0001)
+
+ af_biallelic          : maximum allele frequency for inclusion for biallelic genes (0.005)
+
+ confidence_levels     : Confidence levels to include. Supported values: confirmed, probable,
+                         possible, 'both RD and IF'. Separate multiple values with '&'.
+                         See https://www.ebi.ac.uk/gene2phenotype/terminology
+                         Default levels are confirmed and probable.
+
+ all_confidence_levels : Set value to 1 to include all confidence levels: confirmed, probable and possible.
+                         Setting the value to 1 overrides any confidence levels provided with the
+                         confidence_levels option.
+
+ af_from_vcf           : Set value to 1 to include allele frequencies from VCF files.
+                         Specify the list of reference populations to include with af_from_vcf_keys.
+
+ af_from_vcf_keys      : VCF collections used for annotating variant alleles with observed
+                         allele frequencies. Allele frequencies are retrieved from VCF files.
+                         If af_from_vcf is set to 1 but no VCF collections are specified with
+                         af_from_vcf_keys, all available VCF collections are included.
+                         Available VCF collections: topmed, uk10k, gnomADe, gnomADg, gnomADg_r3.0
+                         Separate multiple values with '&'.
+                         VCF collections contain the following populations: 
+                         topmed: TOPMed
+                         uk10k: ALSPAC, TWINSUK
+                         gnomADe: gnomADe:AFR, gnomADe:ALL, gnomADe:AMR, gnomADe:ASJ, gnomADe:EAS, gnomADe:FIN, gnomADe:NFE, gnomADe:OTH, gnomADe:SAS
+                         gnomADg: gnomADg:AFR, gnomADg:ALL, gnomADg:AMR, gnomADg:ASJ, gnomADg:EAS, gnomADg:FIN, gnomADg:NFE, gnomADg:OTH
+ default_af            : default frequency of the input variant if no frequency data is
+                         found (0). This determines whether such variants are included;
+                         the value of 0 forces variants with no frequency data to be
+                         included as this is considered equivalent to having a frequency
+                         of 0. Set to 1 (or any value higher than af) to exclude them.
+ types                 : SO consequence types to include. Separate multiple values with '&'
+                         (splice_donor_variant,splice_acceptor_variant,stop_gained,
+                         frameshift_variant,stop_lost,initiator_codon_variant,
+                         inframe_insertion,inframe_deletion,missense_variant,
+                         coding_sequence_variant,start_lost,transcript_ablation,
+                         transcript_amplification,protein_altering_variant,splice_region_variant)
+
+ log_dir               : write stats to log files in log_dir. Defaults to a new timestamped
+                         g2p_log_dir in the current working directory.
+
+ txt_report            : write all G2P complete genes and attributes to txt file
+
+ html_report           : write all G2P complete genes and attributes to html file
+
+ Example:
+
+ --plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant
+ --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1
+ --plugin G2P,file=G2P.csv,af_from_vcf=1,af_from_vcf_keys=topmed&gnomADg
+ --plugin G2P,file=G2P.csv,af_from_vcf=1,af_from_vcf_keys=topmed&gnomADg,confidence_levels='confirmed&probable&both RD and IF'
+ --plugin G2P,file=G2P.csv
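+ --plugin G2P,file=G2P.csv,variant_include_list=/path/to/include.vcf.gz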
+ 
+=cut
+
+package G2P;
+
+use strict;
+use warnings;
+use Cwd;
+use Scalar::Util qw(looks_like_number);
+use FileHandle;
+use Text::CSV;
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles);
+use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line);
+use Bio::EnsEMBL::Variation::DBSQL::VCFCollectionAdaptor;
+use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
+
+our $CAN_USE_HTS_PM;
+
+BEGIN {
+  if (eval { require Bio::DB::HTS::Tabix; 1 }) {
+    $CAN_USE_HTS_PM = 1;
+  }
+}
+
+
+my %DEFAULTS = (
+
+  # variants must have a frequency <= this threshold to pass
+  af_monoallelic => 0.0001,
+  af_biallelic => 0.005, 
+
+  af_keys => [qw(AA AFR AMR EA EAS EUR SAS gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS)],
+
+  af_from_vcf_keys => [qw(uk10k topmed gnomADe gnomADg gnomADg_r3.0)],
+
+  # if no MAF data is found, default to 0
+  # this means absence of MAF data is considered equivalent to MAF=0
+  # set to 1 to do the "opposite", i.e. exclude variants with no MAF data
+  default_af => 0,
+
+  confidence_levels => [qw(confirmed probable)],
+
+  # only include variants with these consequence types
+  # currently not ontology-resolved, exact term matches only
+  types => {map {$_ => 1} qw(splice_donor_variant splice_acceptor_variant stop_gained frameshift_variant stop_lost initiator_codon_variant inframe_insertion inframe_deletion missense_variant coding_sequence_variant start_lost transcript_ablation transcript_amplification protein_altering_variant splice_region_variant)},
+
+);
+
+my $af_key_2_population_name = {
+  minor_allele_freq => 'global allele frequency (AF) from 1000 Genomes Phase 3 data',
+  AFR => '1000GENOMES:phase_3:AFR',
+  AMR => '1000GENOMES:phase_3:AMR',
+  EAS => '1000GENOMES:phase_3:EAS',
+  EUR => '1000GENOMES:phase_3:EUR',
+  SAS => '1000GENOMES:phase_3:SAS',
+  AA => 'Exome Sequencing Project 6500:African_American',
+  EA => 'Exome Sequencing Project 6500:European_American',
+  gnomAD => 'Genome Aggregation Database:Total',
+  gnomAD_AFR => 'Genome Aggregation Database exomes v2.1:African/African American',
+  gnomAD_AMR => 'Genome Aggregation Database exomes v2.1:Latino/Admixed American',
+  gnomAD_ASJ => 'Genome Aggregation Database exomes v2.1:Ashkenazi Jewish',
+  gnomAD_EAS => 'Genome Aggregation Database exomes v2.1:East Asian',
+  gnomAD_FIN => 'Genome Aggregation Database exomes v2.1:Finnish',
+  gnomAD_NFE => 'Genome Aggregation Database exomes v2.1:Non-Finnish European',
+  gnomAD_OTH => 'Genome Aggregation Database exomes v2.1:Other (population not assigned)',
+  gnomAD_SAS => 'Genome Aggregation Database exomes v2.1:South Asian',
+};
+
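+# Zygosity rules and frequency thresholds per allelic requirement.
+# A rule of {HET => 2, HOM => 1} means the requirement is fulfilled by at least
+# 2 heterozygous variants OR at least 1 homozygous variant in the same
+# transcript, each passing the frequency threshold (see obeys_rule below).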
+my $allelic_requirements = {
+  'biallelic' => { af => 0.005, rules => {HET => 2, HOM => 1} },
+  'monoallelic' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
+  'hemizygous' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
+  'x-linked dominant' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
+  'x-linked over-dominance' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
+};
+
+my $supported_confidence_levels = {
+  'confirmed' => 1,
+  'probable' => 1,
+  'possible' => 1,
+  'both RD and IF' => 1,
+};
+
+my @allelic_requirement_terms = keys %$allelic_requirements;
+
+sub new {
+  my $class = shift;
+
+  my $self = $class->SUPER::new(@_);
+  # suppress warnings that the FeatureAdaptors spit if using no_slice_cache
+  Bio::EnsEMBL::Utils::Exception::verbose(1999);
+
+  my $params = $self->params_to_hash();
+  my $file = '';
+
+  # user only supplied file as first param?
+  if (!keys %$params) {
+    $file = $self->params->[0];
+    $params->{file} = $file;
+  }
+  else {
+    $file = $params->{file};
+
+    # process types
+    if ($params->{types}) {
+      $params->{types} = {map {$_ => 1} split(/[\;\&\|]/, $params->{types})};
+    }
+
+    # check af
+    foreach my $af (qw/af_monoallelic af_biallelic/) {
+      if($params->{$af}) {
+        die("ERROR: Invalid value for af: ".$params->{$af} . "\n") unless
+          looks_like_number($params->{$af}) && ($params->{$af} >= 0 && $params->{$af} <= 1)
+      }
+      my $ar = $af;
+      $ar =~ s/af_//;
+      $allelic_requirements->{$ar}->{af} = $params->{$af} if (defined $params->{$af});
+    }
+
+    $params->{af_keys} = \@{$DEFAULTS{af_keys}};
+  }
+
+  my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
+  $year += 1900;
+  $mon++;
+  my $stamp = join('_', ($year, $mon, $mday, $hour, $min, $sec));
+  my $cwd_dir = getcwd;
+  my $new_log_dir = "$cwd_dir/g2p_log_dir\_$stamp";
+  my $log_dir = $params->{log_dir} || $new_log_dir;
+  if (!-d $log_dir) {
+    my $return = mkdir $log_dir, 0755;
+    die("ERROR: Couldn't create log_dir $log_dir $!\n") if (!$return);
+    $params->{log_dir} = $log_dir;
+  } 
+
+  foreach my $report_type (qw/txt_report html_report/) {
+    if (!$params->{$report_type}) {
+      my $file_type = ($report_type eq 'txt_report') ? 'txt' : 'html';
+      $params->{$report_type} = $cwd_dir . "/$report_type\_$stamp.$file_type";
+    } 
+  }
+
+  if ($params->{all_confidence_levels}) {
+    if ($params->{confidence_levels}) {
+      warn("Option all_confidence_levels set to 1 overrides any confidence levels provided with the confidence_levels option.");
+    }
+    $params->{confidence_levels} = ['possible', @{$DEFAULTS{confidence_levels}}];
+  }
+  elsif ($params->{confidence_levels}) {
+    my @confidence_levels = ();
+    foreach my $confidence_level (split(/[\;\&\|]/, $params->{confidence_levels})) {
+      if (!$supported_confidence_levels->{$confidence_level}) {
+        die "$confidence_level is not a supported confidence level. Supported values are: ", join(', ', keys %$supported_confidence_levels);
+      } else {
+        push @confidence_levels, $confidence_level;
+        push @confidence_levels, 'both DD and IF' if ($confidence_level eq 'both RD and IF'); # legacy support for using both DD and IF
+
+      }
+    }
+    if (scalar @confidence_levels > 0) {
+      $params->{confidence_levels} = \@confidence_levels;
+    }
+  }
+  if ($params->{af_from_vcf}) {
+    if ($CAN_USE_HTS_PM) {
+      my @vcf_collection_ids = ();
+      my $assembly =  $self->{config}->{assembly};
+      if ($params->{af_from_vcf_keys}) {
+        foreach my $key (split(/[\;\&\|]/, $params->{af_from_vcf_keys})) {
+          push @vcf_collection_ids, $key;
+          push @vcf_collection_ids, "$key\_$assembly";
+        }
+      } else {
+        foreach my $key (@{$DEFAULTS{af_from_vcf_keys}}) {
+          push @vcf_collection_ids, "$key\_$assembly";
+        }
+      }
+
+      my $species =  $self->{config}->{species};
+      my $reg = $self->{config}->{reg};
+      my $vca;
+      if (defined $self->{config}->{offline}) {
+        $vca = Bio::EnsEMBL::Variation::DBSQL::VCFCollectionAdaptor->new();
+      } else {
+        my $vdba = $reg->get_DBAdaptor($species, 'variation');
+        $vdba->dbc->reconnect_when_lost(1);
+        $vca = $vdba->get_VCFCollectionAdaptor;
+        $vca->db->use_vcf(2);
+      }
+      my $vcf_collections = $vca->fetch_all;
+      my @collections = ();
+      foreach my $vcf_collection (@$vcf_collections) {
+        $vcf_collection->use_db(0) if (defined $self->{config}->{offline});
+        my $vcf_collection_id = $vcf_collection->id;
+        if ($vcf_collection->assembly eq $assembly && grep {$_ =~ /$vcf_collection_id/i} @vcf_collection_ids) {
+          delete $vcf_collection->adaptor->{collections};
+          delete $vcf_collection->adaptor->{config};
+          my $description = $vcf_collection->description || $vcf_collection_id;
+          foreach my $population (@{$vcf_collection->get_all_Populations})  {
+            my $population_name = $population->name;
+            my $population_description = $population->description;    
+            $af_key_2_population_name->{$population_name} = "$description $population_name $population_description";
+          }
+          push @collections, $vcf_collection;
+        }
+      }
+      warn "Couldn't find VCF collection ids for assembly " . $assembly if (!@collections);
+      $self->{config}->{vcf_collections} = \@collections;
+      $self->{config}->{use_vcf} = 1;
+    } else {
+      warn "Cannot get data from VCF without Bio::DB::HTS::Tabix";
+    } 
+  }
+
+  if ($params->{variant_include_list}) {
+    if (! -f $params->{variant_include_list}) {
+      die "Variant include list (" . $params->{variant_include_list} . ") does not exist.";
+    }
+    $self->{_files} = [$params->{variant_include_list}];
+  }
+
+  # copy in default params
+  $params->{$_} //= $DEFAULTS{$_} for keys %DEFAULTS;
+  $self->{user_params} = $params;
+
+  $self->{config}->{frequency_threshold} = _get_highest_frequency_threshold();
+  # read data from file
+  $self->{gene_data} = $self->read_gene_data_from_file($file);
+  $self->synonym_mappings();
+
+  # force some config params
+  $self->{config}->{individual} //= ['all'];
+  $self->{config}->{symbol} = 1;
+  $self->{config}->{check_existing} = 1;
+  $self->{config}->{failed} = 1;
+  $self->{config}->{af} = 1;
+  $self->{config}->{af_1kg} = 1;
+  $self->{config}->{af_esp} = 1;
+  $self->{config}->{af_gnomad} = 1;
+  $self->{config}->{sift} = 'b';
+  $self->{config}->{polyphen} = 'b';
+
+  # tell VEP we have a cache so stuff gets shared/merged between forks
+  $self->{has_cache} = 1;
+  $self->{cache}->{g2p_in_vcf} = {};
+
+
+  return $self;
+}
+
+sub _get_highest_frequency_threshold {
+  my $highest_frequency = 0.0;
+  foreach my $ar (keys %$allelic_requirements) {
+    if ($allelic_requirements->{$ar}->{af} > $highest_frequency) {
+      $highest_frequency = $allelic_requirements->{$ar}->{af};
+    }
+  }
+  return $highest_frequency;
+}
+
+sub feature_types {
+  return ['Transcript'];
+}
+
+sub get_header_info {
+  my $self = shift;
+
+  return {
+    G2P_flag => 'Flags zygosity of valid variants for a G2P gene',
+    G2P_complete => 'Indicates this variant completes the allelic requirements for a G2P gene',
+    G2P_gene_req => 'MONO or BI depending on the context in which this gene has been explored',
+  };
+}
+
+sub run {
+  my ($self, $tva, $line) = @_;
+
+  # only interested if we know the zygosity
+  my $zyg = defined($line->{Extra}) ? $line->{Extra}->{ZYG} : $line->{ZYG};
+  return {} unless $zyg;
+  # filter by G2P gene overlap
+  return {} if (!$self->gene_overlap_filtering($tva));
+  # filter by variant consequence
+  return {} unless grep {$self->{user_params}->{types}->{$_->SO_term}} @{$tva->get_all_OverlapConsequences};
+  $self->set_variant_include_list_flag($tva);
+  # filter by allele frequency
+  return {} if (!$self->frequency_filtering($tva));
+  # dump annotations for txt and html report files
+  $self->dump_vf_annotations($tva);      
+  $self->dump_individual_annotations($tva, $zyg);
+  # check if transcript contains enough variants to fulfill the allelic requirement of the gene
+  my $G2P_complete = $self->is_g2p_complete($tva, $zyg);
+  my $G2P_flag = $self->is_valid_g2p_variant($tva, $zyg);
+  my $results = {};
+  $results->{G2P_complete} = $G2P_complete if ($G2P_complete); 
+  $results->{G2P_flag} = $G2P_flag if ($G2P_flag);
+  return $results;
+}
+
+sub set_variant_include_list_flag {
+  my $self = shift;
+  my $tva = shift;
+  return if (!$self->{user_params}->{variant_include_list});
+  my $vf = $tva->variation_feature;
+
+  my $allele = $tva->variation_feature_seq;
+
+  foreach (@{$self->get_data($vf->{chr}, $vf->{start} - 1, $vf->{end})}) {
+    my @vcf_alleles = split /\//, $_->allele_string;
+    my $ref_allele  = shift @vcf_alleles;
+    my $matches = get_matched_variant_alleles(
+      {
+        ref    => $vf->ref_allele_string,
+        alts   => [$allele],
+        pos    => $vf->{start},
+        strand => $vf->strand
+      },
+      {
+        ref  => $ref_allele,
+        alts => \@vcf_alleles,
+        pos  => $_->{start},
+      }
+    );
+    if (scalar @$matches) {
+      my $vf_cache_name = $self->get_cache_name($vf);
+      $self->{g2p_vf_cache}->{$vf_cache_name}->{is_on_variant_include_list} = 1;
+      last;
+    }
+  }
+}
+
+sub is_valid_g2p_variant {
+  my $self = shift;
+  my $tva = shift;
+  my $zyg = shift;
+  my $transcript = $tva->transcript;
+  my $gene_stable_id = $transcript->{_gene}->stable_id;
+  my @allelic_requirements = keys %{$self->{ar}->{$gene_stable_id}};
+  my @results = ();
+  foreach my $ar (@allelic_requirements) {
+    my $ar_rules = $allelic_requirements->{$ar};
+    my $af_threshold = $ar_rules->{af};
+    if ($self->exceeds_threshold($af_threshold, [$self->{vf_cache_name}])) {
+      push @results, "$ar=$zyg";
+    }
+  }
+  return join(',', @results);
+}
+
+sub is_g2p_complete {
+  my $self = shift;
+  my $tva = shift;
+  my $zyg = shift;
+  my $vf = $tva->base_variation_feature;
+  my $individual = $vf->{individual};
+  my $transcript = $tva->transcript;
+  my $gene_stable_id = $transcript->{_gene}->stable_id;
+  my $transcript_stable_id = $transcript->stable_id; 
+  $self->{per_individual}->{$individual}->{$transcript_stable_id}->{$zyg}->{$self->{vf_cache_name}} = 1;
+  my @allelic_requirements = keys %{$self->{ar}->{$gene_stable_id}};
+  my $G2P_complete;
+  foreach my $ar (@allelic_requirements) {
+    my $zyg2var = $self->{per_individual}->{$individual}->{$transcript_stable_id};
+    my $fulfils_ar = $self->obeys_rule($ar, $zyg2var);
+    if (scalar keys %$fulfils_ar > 0) {
+      my @passed_variants = ();
+      foreach my $zyg (keys %$fulfils_ar) {
+        my @tmp = ();
+        foreach my $var (@{$fulfils_ar->{$zyg}}) {
+          push @tmp, "$zyg:$var";
+        }
+        push @passed_variants, join('&', @tmp);
+      }
+      $G2P_complete = "$ar=" . join(',', @passed_variants);
+    }
+  }
+  return $G2P_complete;
+} 
+
+sub obeys_rule {
+  my $self = shift;
+  my $ar = shift;
+  my $zyg2variants = shift;
+  my $ar_rules = $allelic_requirements->{$ar};
+  my $af_threshold = $ar_rules->{af};
+  my $zyg2counts = $ar_rules->{rules};
+  my $results = {};
+  foreach my $zyg (keys %$zyg2counts) {
+    my $count = $zyg2counts->{$zyg};
+    my @all_variants = keys %{$zyg2variants->{$zyg}};
+    my $variants = $self->exceeds_threshold($af_threshold, \@all_variants);
+    if (scalar @$variants >= $count) {
+      $results->{$zyg} = $variants;
+    }
+  }
+  return $results;
+}
+
+sub exceeds_threshold {
+  my $self = shift;
+  my $af_threshold = shift;
+  my $variants = shift;
+  my @pass_variants = ();
+  foreach my $variant (@$variants) {
+    if (!defined $self->{highest_frequencies}->{$variant} || $self->{highest_frequencies}->{$variant} <= $af_threshold || $self->{g2p_vf_cache}->{$variant}->{is_on_variant_include_list}) {
+      push @pass_variants, $variant;
+    }
+  }
+  return \@pass_variants;
+}
+
+sub gene_overlap_filtering {
+  my $self = shift;
+  my $tva = shift;
+  my $transcript = $tva->transcript;
+  my $gene = $transcript->{_gene};
+  my $gene_stable_id = $gene->stable_id;
+
+  my $pass_gene_overlap_filter = $self->{g2p_gene_cache}->{$gene_stable_id};
+  my @gene_xrefs = ();
+  if (! defined $pass_gene_overlap_filter) {
+    my $gene_symbol = $transcript->{_gene_symbol} || $transcript->{_gene_hgnc};
+    $pass_gene_overlap_filter = 0;
+    foreach my $gene_id ($gene_symbol, $gene_stable_id) {
+      next unless defined $gene_id;
+      my $gene_data = $self->gene_data($gene_id);
+      if (defined $gene_data) {
+        if (defined $gene_data->{'allelic requirement'} && scalar @{$gene_data->{'allelic requirement'}}) {
+          foreach my $ar (@{$gene_data->{'allelic requirement'}}) {
+            $self->{ar}->{$gene_stable_id}->{$ar} = 1;
+          } 
+          $self->write_report('G2P_gene_data', $gene_stable_id, $gene_data, $gene_data->{'gene_xrefs'});
+        } 
+        $self->write_report('G2P_in_vcf', $gene_stable_id);
+        $pass_gene_overlap_filter = 1;
+        last;
+      } 
+    }
+    $self->{g2p_gene_cache}->{$gene_stable_id} = $pass_gene_overlap_filter;
+  }
+  $self->_dump_transcript_annotations($transcript) if ($pass_gene_overlap_filter);
+  return $self->{g2p_gene_cache}->{$gene_stable_id};
+}
+
+sub _dump_transcript_annotations {
+  my $self = shift;
+  my $transcript = shift;
+  my $transcript_stable_id = $transcript->stable_id;
+  if (!defined $self->{g2p_transcript_cache}->{$transcript_stable_id}) {
+    my $gene = $transcript->{_gene};
+    my $gene_stable_id = $gene->stable_id;
+    if ($transcript->is_canonical) {
+      $self->write_report('G2P_transcript_data', "$gene_stable_id\t$transcript_stable_id\tis_canonical");
+    }
+    $self->{g2p_transcript_cache}->{$transcript_stable_id} = 1;
+  }
+}
+
+sub get_cache_name {
+  my $self = shift;
+  my $vf = shift;
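+  # e.g. 7_107545113_T/C; the same identifier is reused in the log files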
+  my $cache_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term});
+  return $cache_name;
+}
+
+sub frequency_filtering {
+  my $self = shift;
+  my $tva = shift;
+
+  my $vf = $tva->base_variation_feature;
+  # Set up caching to avoid looking up frequencies for each overlapping transcript
+  my $vf_cache_name = $self->get_cache_name($vf);
+  $self->{vf_cache_name} = $vf_cache_name;
+  $self->{g2p_vf_cache} = {} if (!defined $self->{g2p_vf_cache}->{$vf_cache_name});
+  # Retrieve cached result
+  my $pass_frequency_filter = $self->{g2p_vf_cache}->{$vf_cache_name}->{pass_frequency_filter};
+  return $pass_frequency_filter if (defined $pass_frequency_filter);
+  # Check frequencies from cache files first
+  $pass_frequency_filter = $self->_vep_cache_frequency_filtering($tva);
+  # Check frequencies from VCF files if user is providing use_vcf flag
+  if ($pass_frequency_filter && $self->{config}->{use_vcf}) {
+    $pass_frequency_filter = $self->_vcf_frequency_filtering($tva);
+  } 
+
+  $self->{g2p_vf_cache}->{$vf_cache_name}->{pass_frequency_filter} = $pass_frequency_filter;
+  return $self->{g2p_vf_cache}->{$vf_cache_name}->{pass_frequency_filter};
+}
+
+sub _vep_cache_frequency_filtering {
+  my $self = shift;
+  my $tva = shift;
+
+  my $allele = $tva->variation_feature_seq;
+  my $vf     = $tva->base_variation_feature;
+  my $frequency_threshold = $self->{config}->{frequency_threshold}; 
+  my $existing = $vf->{existing}; # Get existing variants from cache file which are stored on VF level
+  my @keys = @{$self->{user_params}->{af_keys}}; # Consider user defined list of af keys
+  my $dumped_annotations = 0;  # Indicates if existing annotations have already been dumped for txt and html report files
+  my $vf_cache_name =  $self->{vf_cache_name};
+  foreach my $existing_var (@$existing) {
+    my @frequencies = grep defined, @{$existing_var}{@keys};
+    if ($existing_var->{matched_alleles}) { # Get matched alleles from input variant and existing variant, in case input variant was normalized to match variant from cache file
+      $allele = $existing_var->{matched_alleles}[0]->{b_allele};
+    }
+    next if (!@frequencies);
+    if ($self->_exceeds_frequency_threshold(\@frequencies, $allele, $frequency_threshold) && !$self->{g2p_vf_cache}->{$vf_cache_name}->{is_on_variant_include_list}) { 
+      return 0; # Return 0 (failed filtering) if frequencies exceed threshold and variant is not on variant_include_list
+    } else {
+      # Dump annotations for txt and html report files
+      $self->_dump_existing_vf_frequencies($existing_var, $allele);
+      $self->_dump_existing_vf_annotations($existing_var);
+      $dumped_annotations = 1;
+    }
+  }
+  # If we get to this point it means that there were no frequencies for the input variant in the cache files
+  # and we pass the filtering step.
+  # We need to dump 'empty' annotations for such variants to indicate that there are no available frequencies
+  $self->_dump_existing_vf_annotations() if (!$dumped_annotations);
+  return 1;
+}
+
+sub _dump_existing_vf_frequencies {
+  my $self = shift;
+  my $existing_var = shift;
+  my $allele = shift;
+  my @keys = @{$self->{user_params}->{af_keys}};
+  my @frequencies = ();
+  my $highest_frequency = 0;
+  foreach my $population_name (@keys) {
+    my $af = $existing_var->{$population_name};
+    next if (!defined $af);
+    foreach my $pair (split(',', $af)) {
+      my ($a, $f) = split(':', $pair);
+      if(($a || '') eq $allele && defined($f)) {
+        push @frequencies, "$population_name=$f";
+        $highest_frequency = $f if ($f > $highest_frequency);
+      }
+    }
+  }
+  $self->store_highest_frequency($highest_frequency);
+  $self->write_report('G2P_frequencies', $self->{vf_cache_name}, \@frequencies);
+}
+
+sub _dump_existing_vf_annotations {
+  my $self = shift;
+  my $existing_var = shift;
+
+  my $data = {
+    'clin_sig' => 'NA',
+    'failed' => 'NA',
+    'existing_name' => 'NA',
+    'novel' => 'yes',
+  };
+  if ($existing_var) { 
+    $data = {
+      'clin_sig' => $existing_var->{clin_sig} || 'NA',
+      'failed' => ($existing_var->{failed}) ? 'yes' : 'no',
+      'existing_name' => $existing_var->{variation_name} || 'NA',
+      'novel' => 'no',
+    };
+  }
+  $self->write_report('G2P_existing_vf_annotations', $self->{vf_cache_name}, $data);
+}
+
+
+sub _exceeds_frequency_threshold {
+  my $self = shift;
+  my $vep_cache_frequencies = shift;
+  my $allele = shift;
+  my $threshold = shift;
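+  # VEP cache frequencies are comma-separated allele:frequency pairs,
+  # e.g. 'A:0.01,T:0.002'; only the matched allele's frequency is compared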
+  foreach my $vep_cache_frequency (@$vep_cache_frequencies) {
+    foreach my $pair (split(',', $vep_cache_frequency)) {
+      my ($a, $f) = split(':', $pair);
+      if(($a || '') eq $allele && defined($f)) {
+        return 1 if ($f > $threshold);
+      }
+    }
+  }
+  return 0;
+}
+
+sub _vcf_frequency_filtering {
+  my $self = shift;
+  my $tva = shift;
+  my $allele = $tva->variation_feature_seq;
+  my $vf = $tva->base_variation_feature;
+  # get the lowest frequency threshold. Threshold can be different for monoallelic and biallelic genes.
+  my $frequency_threshold = $self->{config}->{frequency_threshold}; 
+  my $vf_cache_name =  $self->{vf_cache_name};
+  foreach my $vcf_collection (@{$self->{config}->{vcf_collections}}) {
+    my @alleles = grep {$_->allele eq $allele} @{$vcf_collection->get_all_Alleles_by_VariationFeature($vf)};
+    # As soon as we find a frequency which is higher than the frequency_threshold,
+    # and variant is not on variant_include_list we can stop.
+    my @frequencies = grep {$_->frequency > $frequency_threshold} @alleles;
+    if (scalar @frequencies > 0 && !$self->{g2p_vf_cache}->{$vf_cache_name}->{is_on_variant_include_list}) {
+      return 0;
+    } else {
+      $self->_dump_existing_vf_vcf(\@alleles) if (scalar @alleles); 
+    }
+  }
+  return 1;
+}
+
+sub _dump_existing_vf_vcf {
+  my $self = shift;
+  my $alleles = shift;
+  my @frequencies = map {$_->population->name . '=' . $_->frequency} @$alleles;
+  my @sorted_frequencies = sort { $a->frequency <=> $b->frequency } @$alleles;
+  $self->store_highest_frequency($sorted_frequencies[-1]->frequency);
+  $self->write_report('G2P_frequencies', $self->{vf_cache_name}, \@frequencies);
+}
+
+sub store_highest_frequency {
+  my $self = shift;
+  my $f = shift;
+  $self->{highest_frequencies}->{$self->{vf_cache_name}} = $f;
+}
+
+sub dump_vf_annotations {
+  my $self = shift;
+  my $tva = shift;
+  my @consequence_types = map { $_->SO_term } @{$tva->get_all_OverlapConsequences};
+  my $vf = $tva->base_variation_feature;
+  my $allele = $tva->variation_feature_seq;
+  my $start = $vf->{start};
+  my $end = $vf->{end};
+
+  my $individual = $vf->{individual};
+  my $vf_name = $vf->variation_name;
+  my $vf_cache_name = $self->{vf_cache_name};
+  my $allele_string = $vf->{allele_string};
+  my @alleles = split('/', $allele_string);
+  my $ref = $alleles[0];
+  my $seq_region_name = $vf->{chr};
+
+  my $is_on_variant_include_list = $self->{g2p_vf_cache}->{$vf_cache_name}->{is_on_variant_include_list} || 0;
+
+  my $params = $self->{user_params};
+  my $tr = $tva->transcript;
+  my $refseq = $tr->{_refseq} || 'NA';
+  my $hgvs_t = $tva->hgvs_transcript || 'NA';
+  my $hgvs_p = $tva->hgvs_protein || 'NA';
+
+  my $pph_score   = (defined $tva->polyphen_score) ? $tva->polyphen_score : 'NA';
+  my $pph_pred    = (defined $tva->polyphen_prediction) ? $tva->polyphen_prediction : 'NA';
+  my $sift_score  = (defined $tva->sift_score) ? $tva->sift_score : 'NA';
+  my $sift_pred   = (defined $tva->sift_prediction) ? $tva->sift_prediction : 'NA';
+
+  my $g2p_data = {
+    'vf_name' => $vf_name,
+    'is_on_variant_include_list' => $is_on_variant_include_list,
+    'transcript_stable_id' => $tr->stable_id,
+    'consequence_types' => join(',', @consequence_types),
+    'refseq' => $refseq,
+    'hgvs_t' => $hgvs_t,
+    'hgvs_p' => $hgvs_p,
+    'vf_location' => "$seq_region_name:$start-$end $ref/$allele",
+    'sift_score' => "$sift_score",
+    'sift_prediction' => $sift_pred,
+    'polyphen_score' => "$pph_score",
+    'polyphen_prediction' => $pph_pred,
+  };
+  $self->write_report('G2P_tva_annotations', $vf_cache_name, $tr->stable_id, $g2p_data);
+  $self->write_report('is_on_variant_include_list', $vf_cache_name) if ($is_on_variant_include_list);
+}
+
+sub dump_individual_annotations {
+  my $self = shift;
+  my $tva = shift;
+  my $zyg = shift;
+  my $vf = $tva->base_variation_feature;
+  my $individual = $vf->{individual};
+  my $vf_cache_name = $self->{vf_cache_name};
+  my $transcript = $tva->transcript;
+  my $transcript_stable_id = $transcript->stable_id;
+  my $gene_stable_id = $transcript->{_gene_stable_id};
+  $self->write_report('G2P_individual_annotations', join("\t", $gene_stable_id, $transcript_stable_id, $vf_cache_name, $zyg, $individual));
+}
+
+# read G2P CSV dump
+# as from http://www.ebi.ac.uk/gene2phenotype/downloads
+sub read_gene_data_from_file {
+  my $self = shift;
+  my $file = shift;
+  my $delimiter = shift;
+  my (@headers, %gene_data);
+
+  my $assembly =  $self->{config}->{assembly};
+  die("ERROR: No file specified or could not read from file ".($file || '')."\n") unless $file && -e $file;
+
+  my @confidence_levels = @{$self->{user_params}->{confidence_levels}};
+
+  # determine file type
+  my $file_type;
+  my $fh = FileHandle->new($file, 'r');
+  while (<$fh>) {
+    chomp;
+    if (/Model_Of_Inheritance/) {
+      $file_type = 'panelapp';
+    } elsif (/allelic requirement/) {
+      $file_type = 'g2p';
+    } else {
+      $file_type = 'unknown';
+    }
+    last;
+  }
+  $fh->close();
+  if ($file_type eq 'unknown') {
+    if ($file =~ /gz$/) { 
+      die("ERROR: G2P plugin can only read uncompressed data\n");
+    } else {
+      die("ERROR: Could not recognize input file format. Format must be one of panelapp, g2p or custom. Check website for details: https://www.ebi.ac.uk/gene2phenotype/g2p_vep_plugin\n");
+    }
+  }
+
+  if ($file_type eq 'panelapp') {
+    my @headers = ();
+    my $csv = Text::CSV->new ({ sep_char => "\t" });
+    open my $fh, "<:encoding(utf8)", "$file" or die "$file: $!";
+    while ( my $row = $csv->getline( $fh ) ) {
+      unless (@headers) {
+        @headers = @$row;
+      } else {
+        my %tmp = map {$headers[$_] => $row->[$_]} (0..$#headers);
+        my $gene_symbol = $tmp{"Gene Entity Symbol"};
+        my $ensembl_gene_id = "";
+        if ($assembly eq 'GRCh37') { 
+          $ensembl_gene_id = $tmp{"EnsemblId(GRch37)"};
+        } else { # GRCh38
+          $ensembl_gene_id = $tmp{"EnsemblId(GRch38)"};
+        }
+        if ($ensembl_gene_id) {
+          my @ars = ();
+          my $allelic_requirement_panel_app = $tmp{"Model_Of_Inheritance"};
+          # a BOTH model of inheritance implies monoallelic and biallelic,
+          # so the first two patterns are tested independently
+          if ($allelic_requirement_panel_app =~ m/MONOALLELIC|BOTH/) {
+            push @ars, 'monoallelic';
+          }
+          if ($allelic_requirement_panel_app =~ m/BIALLELIC|BOTH/) {
+            push @ars, 'biallelic';
+          }
+          if ($allelic_requirement_panel_app eq 'X-LINKED: hemizygous mutation in males, biallelic mutations in females') {
+            push @ars, 'hemizygous';
+          } elsif ($allelic_requirement_panel_app eq 'X-LINKED: hemizygous mutation in males, monoallelic mutations in females may cause disease (may be less severe, later onset than males)') {
+            push @ars, 'x-linked dominant';
+          }
+          if (!@ars) {
+            $self->write_report('log', "no allelic requirement for $ensembl_gene_id");
+          }
+          foreach my $ar (@ars) {
+            push @{$gene_data{$ensembl_gene_id}->{"allelic requirement"}}, $ar;
+          }
+        } else {
+          $self->write_report('log', "no ensembl gene id");
+        }
+      }
+    }
+    $csv->eof or $csv->error_diag();
+    close $fh;
+  }
+
+  if ($file_type eq 'g2p') {
+    # this regexp allows for nested ",", e.g.
+    # item,description
+    # cheese,"salty,delicious"
+    my $re = qr/(?: "\( ( [^()""]* ) \)" |  \( ( [^()]* ) \) |  " ( [^"]* ) " |  ( [^,]* ) ) , \s* /x;
+
+    my $fh = FileHandle->new($file, 'r');
+
+    while(<$fh>) {
+      chomp;
+      $_ =~ s/\R//g;
+      my @split = grep defined, "$_," =~ /$re/g;
+      unless(@headers) {
+        @headers = map {s/\"//g; $_} @split;
+      }
+      else {
+        my %tmp = map {$headers[$_] => $split[$_]} (0..$#split);
+        die("ERROR: Gene symbol column not found\n$_\n") unless $tmp{"gene symbol"};
+        $self->write_report('G2P_list', $tmp{"gene symbol"}, $tmp{"DDD category"});
+        my $confidence_value = $tmp{"DDD category"} || $tmp{"confidence category"}; # deprecate use of DDD category
+        next if (!grep{$_ eq $confidence_value} @confidence_levels);
+        my $gene_symbol = $tmp{"gene symbol"};
+        push @{$gene_data{$gene_symbol}->{"gene_xrefs"}}, split(';', $tmp{"prev symbols"} || '');
+        push @{$gene_data{$gene_symbol}->{"gene_xrefs"}}, $tmp{"gene symbol"};
+        push @{$gene_data{$gene_symbol}->{"allelic requirement"}}, $tmp{"allelic requirement"} if ($tmp{"allelic requirement"});
+      }
+    }
+    $fh->close;
+  }
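+  # %gene_data maps gene symbol (g2p files) or Ensembl gene id (panelapp files)
+  # to a hash with an 'allelic requirement' list and, for g2p files, a
+  # 'gene_xrefs' list of current and previous symbols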
+  return \%gene_data;
+}
+
+# return either whole gene data hash or one gene's data
+# this should allow updates to this plugin to e.g. query a REST server, for example
+sub gene_data {
+  my ($self, $gene_symbol) = @_;
+  my $gene_data = $self->{gene_data}->{$gene_symbol};
+  if (!$gene_data) {
+    my $prev_gene_symbol = $self->{prev_symbol_mappings}->{$gene_symbol};
+    return $prev_gene_symbol ? $self->{gene_data}->{$prev_gene_symbol} : undef;
+  } 
+  return $gene_data;
+}
+
+sub synonym_mappings {
+  my $self = shift;
+  my $gene_data = $self->{gene_data};
+  my $synonym_mappings = {};
+  foreach my $gene_symbol (keys %$gene_data) {
+    foreach my $prev_symbol (@{$gene_data->{$gene_symbol}->{'gene_xrefs'}}) {
+      $synonym_mappings->{$prev_symbol} = $gene_symbol;
+    }
+  }
+  $self->{prev_symbol_mappings} = $synonym_mappings;
+}
+
+sub write_report {
+  my $self = shift;
+  my $flag = shift;
+  my $log_dir = $self->{user_params}->{log_dir};
+  my $log_file = "$log_dir/$$.txt";
+  open(my $fh, '>>', $log_file) or die "Could not open file '$log_file' for flag '$flag': $!\n";
+  if ($flag eq 'G2P_list') {
+    my ($gene_symbol, $DDD_category) = @_;
+    $DDD_category ||= 'Not assigned';
+    print $fh "$flag\t$gene_symbol\t$DDD_category\n";
+  } elsif ($flag eq 'G2P_in_vcf') {
+    my $gene_symbol = shift;
+    print $fh "$flag\t$gene_symbol\n";
+  } elsif ($flag eq 'G2P_complete') {
+    print $fh join("\t", $flag, @_), "\n";
+  } elsif ($flag eq 'log') {
+    print $fh join("\t", $flag, @_), "\n";
+  } elsif ($flag eq 'is_on_variant_include_list') {
+    my ($vf_name) = @_;
+    print $fh "$flag\t$vf_name\n";
+  } elsif ($flag eq 'G2P_gene_data') {
+    my ($gene_id, $gene_data, $gene_xrefs) = @_;
+    my $ar = join(',', @{$gene_data->{'allelic requirement'}});
+    my %seen;
+    $seen{$_} = 1 foreach @{$gene_xrefs};
+    my @unique = keys %seen;
+    my $xrefs = join(',', grep {$_ !~ /^ENS/} sort @unique);
+    print $fh join("\t", $flag, $gene_id, $ar, $xrefs), "\n";
+  } elsif ($flag eq 'G2P_frequencies') {
+    my ($vf_name, $frequencies) = @_;
+    print $fh join("\t", $flag, $vf_name, join(',', @$frequencies)), "\n";
+  } elsif ($flag eq 'G2P_tva_annotations') {
+    my ($vf_name, $transcript_stable_id, $data) = @_;
+    $data = join(';', map {"$_=$data->{$_}"} sort keys %$data);
+    print $fh join("\t", $flag, $vf_name, $transcript_stable_id, $data), "\n";
+  } elsif ($flag eq 'G2P_existing_vf_annotations') {
+    my ($vf_name, $data) = @_;
+    $data = join(';', map {"$_=$data->{$_}"} sort keys %$data);
+    print $fh join("\t", $flag, $vf_name, $data), "\n";
+  } elsif ($flag eq 'G2P_transcript_data') {
+    print $fh join("\t", $flag, @_), "\n";
+  } elsif ($flag eq 'G2P_individual_annotations') { 
+    print $fh join("\t", $flag, @_), "\n";
+  } else {
+    print $fh "Did not recognize flag '$flag': @_\n";
+  }
+  close $fh;
+}
+
+sub finish {
+  my $self = shift;
+  $self->generate_report;
+}
+
+sub generate_report {
+  my $self = shift;
+  my $result_summary = $self->parse_log_files;
+  my $chart_txt_data = $self->chart_and_txt_data($result_summary);
+  my $chart_data = $chart_txt_data->{chart_data};
+  my $txt_data = $chart_txt_data->{txt_data};
+  $self->write_txt_output($txt_data, $result_summary->{gene_xrefs});
+  $self->write_charts($result_summary, $chart_data, $result_summary->{canonical_transcripts}, $result_summary->{gene_xrefs});
+}
+
+sub write_txt_output {
+  my $self = shift;
+  my $txt_output_data = shift; 
+  my $gene_xrefs = shift;
+  my $txt_output_file = $self->{user_params}->{txt_report};
+  my $fh_txt = FileHandle->new($txt_output_file, 'w');
+  foreach my $individual (sort keys %$txt_output_data) {
+    foreach my $gene_id (keys %{$txt_output_data->{$individual}}) {
+      my $gene_id_title = (defined $gene_xrefs->{$gene_id}) ? "$gene_id(" .  $gene_xrefs->{$gene_id} . ")" : $gene_id;
+      foreach my $ar (keys %{$txt_output_data->{$individual}->{$gene_id}}) {
+        foreach my $tr_stable_id (keys %{$txt_output_data->{$individual}->{$gene_id}->{$ar}}) {
+          my $is_canonical = $txt_output_data->{$individual}->{$gene_id}->{$ar}->{$tr_stable_id}->{is_canonical};
+          my $canonical_tag = ($is_canonical) ? 'is_canonical' : 'not_canonical';
+          my $req =  $txt_output_data->{$individual}->{$gene_id}->{$ar}->{$tr_stable_id}->{REQ};
+          my $variants = join(';', @{$txt_output_data->{$individual}->{$gene_id}->{$ar}->{$tr_stable_id}->{variants}});
+          print $fh_txt join("\t", $individual, $gene_id_title, $tr_stable_id, $canonical_tag, "OBS=$ar", "REQ=$req", $variants), "\n";
+        }
+      }
+    }
+  }
+  $fh_txt->close();
+}
+
+sub write_charts {
+  my $self = shift;
+  my $result_summary = shift;
+  my $chart_data = shift;
+  my $canonical_transcripts = shift;
+  my $gene_xrefs = shift;
+
+  my $count_g2p_genes = keys %{$result_summary->{g2p_list}};
+  my $count_in_vcf_file = keys %{$result_summary->{in_vcf_file}};
+  my $count_complete_genes = scalar keys %{$result_summary->{complete_genes}};
+
+  my @charts = ();
+  my @frequencies_header = (); 
+
+  foreach my $short_name (sort keys %{$self->{population_names}}) {
+    my $text = $af_key_2_population_name->{$short_name} || 'No description';
+    push @frequencies_header, "<a style=\"cursor: pointer\" data-placement=\"top\" data-toggle=\"tooltip\" data-container=\"body\" title=\"$text\">$short_name</a>";
+  }
+
+  my $count = 1;
+  my @new_header = (
+    'Variant location and alleles (REF/ALT)',
+    'Variant name (* indicates that variant is on variant include list)',
+    'Existing name', 
+    'Zygosity', 
+    'All allelic requirements from G2P DB',
+    'Consequence types', 
+    'ClinVar annotation', 
+    'SIFT', 
+    'PolyPhen', 
+    'Novel variant', 
+    'Has been failed by Ensembl', 
+    @frequencies_header,
+    'HGVS transcript', 
+    'HGVS protein', 
+    'RefSeq IDs', 
+  );
+
+  my $html_output_file = $self->{user_params}->{html_report};
+  my $fh_out = FileHandle->new($html_output_file, 'w');
+  print $fh_out stats_html_head(\@charts);
+  print $fh_out "<div class='main_content container'>";
+
+ 
+  print $fh_out "<h1>G2P report</h1>";
+  print $fh_out "<p>Input and output files:</p>";
+
+  print $fh_out "<dl class='dl-horizontal'>";
+  print $fh_out "<dt>G2P list</dt>";
+  print $fh_out "<dd>" . $self->{user_params}->{file} .  "</dd>";
+  print $fh_out "<dt>Log directory</dt>";
+  print $fh_out "<dd>" . $self->{user_params}->{log_dir} .  "</dd>";
+  print $fh_out "<dt>HTML report</dt>";
+  print $fh_out "<dd>" . $self->{user_params}->{html_report} .  "</dd>";
+  print $fh_out "<dt>TXT report</dt>";
+  print $fh_out "<dd>" . $self->{user_params}->{txt_report} .  "</dd>";
+  print $fh_out "</dl>";
+
+  print $fh_out "<p>Counts:</p>";
+  print $fh_out "<dl class='dl-horizontal text-overflow'>";
+  print $fh_out "<dt>$count_g2p_genes</dt>";
+  print $fh_out "<dd>G2P genes</dd>";
+  print $fh_out "<dt>$count_in_vcf_file</dt>";
+  print $fh_out "<dd>G2P genes in input VCF file</dd>";
+  print $fh_out "<dt>$count_complete_genes</dt>";
+  print $fh_out "<dd>G2P complete genes in input VCF file</dd>";
+  print $fh_out "</dl>";
+
+
+  print $fh_out "<h1>Summary of G2P complete genes per individual</h1>";
+  print $fh_out "<p>G2P complete gene: A sufficient number of variant hits for the observed allelic requirement in at least one of the gene's transcripts. Variants are filtered by frequency.</p>";
+  print $fh_out "<p>Frequency thresholds and number of required variant hits for each allelic requirement:</p>";
+
+  print $fh_out "<table class='table table-bordered'>";
+  print $fh_out "<thead>";
+  print $fh_out "<tr><th>Allelic requirement</th><th>Frequency threshold for filtering</th><th>Variant counts by zygosity</th></tr>";
+  print $fh_out "</thead>";
+  print $fh_out "<tbody>";
+  foreach my $ar (sort keys %$allelic_requirements) {
+    my $af = $allelic_requirements->{$ar}->{af};
+    my $rules =  $allelic_requirements->{$ar}->{rules};
+    my $rule = join(' OR ', map {"$_ >= $rules->{$_}"} keys %$rules);
+    print $fh_out "<tr><td>$ar</td><td>$af</td><td>$rule</td></tr>";
+  }
+  print $fh_out "</tbody>";
+  print $fh_out "</table>";
+
+my $switch =<<SHTML;
+<form>
+<div class="checkbox">
+  <label>
+    <input class="target" type="checkbox"> Show only canonical transcript
+  </label>
+</div>
+</form>
+SHTML
+
+  print $fh_out $switch;
+
+  foreach my $individual (sort keys %$chart_data) {
+    foreach my $gene_id (keys %{$chart_data->{$individual}}) {
+      my $gene_id_title = (defined $gene_xrefs->{$gene_id}) ? "$gene_id(" .  $gene_xrefs->{$gene_id} . ")" : $gene_id;
+      foreach my $ar (keys %{$chart_data->{$individual}->{$gene_id}}) {
+        print $fh_out "<ul>\n";
+        foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_id}->{$ar}}) {
+          my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical';
+          print $fh_out "<li><a class=\"$class\" href=\"#$individual\_$gene_id_title\_$ar\_$transcript_stable_id\">" . "$individual &gt; $gene_id_title &gt; $ar &gt; $transcript_stable_id" . "</a> </li>\n";
+        }
+        print $fh_out "</ul>\n";
+      }
+    }
+  }
+
+  foreach my $individual (sort keys %$chart_data) {
+    foreach my $gene_id (keys %{$chart_data->{$individual}}) {
+      my $gene_id_title = (defined $gene_xrefs->{$gene_id}) ? "$gene_id(" .  $gene_xrefs->{$gene_id} . ")" : $gene_id;
+      foreach my $ar (keys %{$chart_data->{$individual}->{$gene_id}}) {
+        foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_id}->{$ar}}) {
+          my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical';
+          print $fh_out "<div class=\"$class\">";
+          my $name = "$individual\_$gene_id_title\_$ar\_$transcript_stable_id";
+          my $title = "$individual &gt; $gene_id_title &gt; $ar &gt; $transcript_stable_id";
+          print $fh_out "<h3><a name=\"$name\"></a>$title <a title=\"Back to Top\" data-toggle=\"tooltip\" href='#top'><span class=\"glyphicon glyphicon-arrow-up\" aria-hidden=\"true\"></span></a></h3>\n";
+          print $fh_out "<div class=\"table-responsive\" style=\"width:100%\">\n";
+          print $fh_out "<TABLE  class=\"table table-bordered table-condensed\" style=\"margin-left: 2em\">";
+          print $fh_out "<thead>\n";
+          print $fh_out "<tr>" . join('', map {"<th>$_</th>"} @new_header) . "</tr>\n";
+          print $fh_out "</thead>\n";
+          print $fh_out "<tbody>\n";
+          foreach my $vf_data (@{$chart_data->{$individual}->{$gene_id}->{$ar}->{$transcript_stable_id}}) {
+            my $data_row = $vf_data->[0];
+            my @tds = ();
+            foreach my $cell (@$data_row) {
+              my $value = $cell->[0];
+              my $class = $cell->[1];
+              if ($class) {
+                push @tds, "<td class=\"$class\">$value</td>";
+              } else {
+                push @tds, "<td>$value</td>";
+              }
+            }
+            print $fh_out "<tr>", join('', @tds), "</tr>\n";
+          }
+          print $fh_out "</tbody>\n";
+          print $fh_out "</TABLE>\n";
+          print $fh_out "</div>\n";
+          print $fh_out "</div>\n";
+        }
+      }
+    }
+  }
+  print $fh_out stats_html_tail();
+}
+
+sub chart_and_txt_data {
+  my $self = shift;
+  my $result_summary = shift;
+  my $complete_genes = $result_summary->{complete_genes};
+  my $new_order = $result_summary->{new_order};
+
+  my $tva_annotation_data = $result_summary->{tva_annotation_data};
+  my $vf_annotation_data = $result_summary->{vf_annotation_data};
+  my $frequency_data = $result_summary->{frequency_data};
+  my $canonical_transcripts = $result_summary->{canonical_transcripts};
+  my $gene2ar = $result_summary->{gene2ar};
+
+  my @frequencies_header = sort keys %{$self->{population_names}};
+
+  my $assembly = $self->{config}->{assembly};
+  my $chart_data = {};
+  my $txt_output_data = {};
+
+  my $prediction2bgcolor = {
+    'probably damaging' => 'danger',
+    'deleterious' => 'danger',
+    'possibly damaging' => 'warning',
+    'unknown'  => 'warning',
+    'benign' => 'success',
+    'tolerated' => 'success',
+  };
+
+
+  foreach my $individual (sort keys %$new_order) {
+
+    foreach my $gene_id (keys %{$new_order->{$individual}}) {
+      my $observed_allelic_requirement = join(',', keys %{$gene2ar->{$gene_id}});
+      foreach my $ar (keys %{$new_order->{$individual}->{$gene_id}}) {
+        foreach my $transcript_stable_id (keys %{$new_order->{$individual}->{$gene_id}->{$ar}}) {
+          my $zyg2vf = $new_order->{$individual}->{$gene_id}->{$ar}->{$transcript_stable_id}; 
+          foreach my $zygosity (keys %$zyg2vf) {
+            foreach my $vf_name (@{$zyg2vf->{$zygosity}}) {
+              my $tva_data = $tva_annotation_data->{$vf_name}->{$transcript_stable_id};
+              my $vf_data = $vf_annotation_data->{$vf_name}; 
+              if (!$vf_data) {
+                print STDERR "No vf_data for: $vf_name\n";
+                $vf_data = '';
+              }
+              my $hash = {};
+              foreach my $pair (split/;/, "$tva_data;$vf_data") {
+                my ($key, $value) = split('=', $pair, 2);
+                $value ||= '';
+                $hash->{$key} = $value;
+              }
+              my $vf_location = $hash->{vf_location};
+              my $existing_name = $hash->{existing_name};
+              if ($existing_name ne 'NA') {
+                $existing_name = "<a href=\"http://$assembly.ensembl.org/Homo_sapiens/Variation/Explore?v=$existing_name\">$existing_name</a>";
+              }
+              my $is_on_variant_include_list = $hash->{is_on_variant_include_list};
+              my $refseq = $hash->{refseq};
+              my $failed = $hash->{failed};
+              my $clin_sign = $hash->{clin_sig};
+              my $novel = $hash->{novel};
+              my $hgvs_t = $hash->{hgvs_t};
+              my $hgvs_p = $hash->{hgvs_p};
+              my $consequence_types = $hash->{consequence_types};
+              my $sift_score = $hash->{sift_score} || '0.0';
+              my $sift_prediction = $hash->{sift_prediction};
+              my $sift = 'NA';
+              my $sift_class = '';
+              if ($sift_prediction ne 'NA') {
+                $sift = "$sift_prediction(" . "$sift_score)";
+                $sift_class = $prediction2bgcolor->{$sift_prediction};
+              }
+              my $polyphen_score = $hash->{polyphen_score} || '0.0';
+              my $polyphen_prediction = $hash->{polyphen_prediction};
+              my $polyphen = 'NA';
+              my $polyphen_class = '';
+              if ($polyphen_prediction ne 'NA') {
+                $polyphen = "$polyphen_prediction($polyphen_score)";
+                $polyphen_class =  $prediction2bgcolor->{$polyphen_prediction};
+              }
+              
+              $hash->{frequencies} = join(',', keys %{$frequency_data->{$vf_name}}) || 'NA';
+              my %frequencies_hash = ();
+              if ($hash->{frequencies} ne 'NA') {
+                %frequencies_hash = split /[,=]/, $hash->{frequencies};
+              }
+              my @frequencies = ();
+              my @txt_output_frequencies = ();
+              foreach my $population (@frequencies_header) {
+                my $frequency = $frequencies_hash{$population} || '';
+                push @frequencies, ["$frequency"];
+                if ($frequency) {
+                  push @txt_output_frequencies, "$population=$frequency";
+                }
+              }
+              my $is_canonical = ($canonical_transcripts->{$transcript_stable_id}) ? 1 : 0;
+              my ($location, $alleles) = split(' ', $vf_location);
+              $location =~ s/\-/:/;
+              $alleles =~ s/\//:/;
+              $vf_name .= "*" if ($is_on_variant_include_list);
+              push @{$chart_data->{$individual}->{$gene_id}->{$ar}->{$transcript_stable_id}}, [[
+                [$vf_location], 
+                [$vf_name], 
+                [$existing_name], 
+                [$zygosity], 
+                [$observed_allelic_requirement],
+                [$consequence_types], 
+                [$clin_sign], 
+                [$sift, $sift_class], 
+                [$polyphen, $polyphen_class], 
+                [$novel], 
+                [$failed], 
+                @frequencies,
+                [$hgvs_t], 
+                [$hgvs_p], 
+                [$refseq] 
+              ], $is_canonical];
+
+              my $txt_output_variant = "$location:$alleles:$zygosity:$consequence_types:SIFT=$sift:PolyPhen=$polyphen";
+              if (@txt_output_frequencies) {
+                $txt_output_variant .= ':' . join(',', @txt_output_frequencies);
+              }
+              $txt_output_data->{$individual}->{$gene_id}->{$ar}->{$transcript_stable_id}->{is_canonical} = $is_canonical;
+              $txt_output_data->{$individual}->{$gene_id}->{$ar}->{$transcript_stable_id}->{REQ} = $observed_allelic_requirement;
+              push @{$txt_output_data->{$individual}->{$gene_id}->{$ar}->{$transcript_stable_id}->{variants}}, $txt_output_variant;
+            }
+          }
+        }
+      }
+    }
+  }
+  return {txt_data => $txt_output_data, chart_data => $chart_data};
+}
+
+sub parse_log_files {
+  my $self = shift;
+
+  my $log_dir = $self->{user_params}->{log_dir}; 
+  my @files = <$log_dir/*>;
+  my $individual_data = {};
+  my $frequency_data = {};
+  my $vf_annotation_data = {};
+  my $tva_annotation_data = {};
+  my $canonical_transcripts = {};
+  my $all_g2p_genes = {};
+  my $vcf_g2p_genes = {};
+  my $complete_genes = {};
+  my $ar_data = {};
+  my $g2p_transcripts = {};
+  my $gene_xrefs = {};
+
+  foreach my $file (@files) {
+    my $fh = FileHandle->new($file, 'r');
+    while (<$fh>) {
+     chomp;
+      next if /^log/;
+      if (/^G2P_list/) {
+        my ($flag, $gene_id, $DDD_category) = split/\t/;
+        $all_g2p_genes->{$gene_id} = 1;
+      }
+      #G2P_individual_annotations  ENSG00000091140 ENST00000450038 7_107545113_T/C HOM P10
+      elsif (/^G2P_individual_annotations/) {
+        my ($flag, $gene_stable_id, $transcript_stable_id, $vf_cache_name, $zyg, $individual) = split/\t/;
+        $individual_data->{$individual}->{$gene_stable_id}->{$transcript_stable_id}->{$zyg}->{$vf_cache_name} = 1;
+      }
+
+      elsif (/^G2P_frequencies/) {
+        my ($flag, $vf_cache_name, $frequencies) = split/\t/;
+        $frequency_data->{$vf_cache_name}->{$frequencies} = 1;
+        $self->store_population_names($frequencies);
+        my $highest_frequency = get_highest_frequency($frequencies);
+        if (!defined  $self->{highest_frequencies}->{$vf_cache_name} || $self->{highest_frequencies}->{$vf_cache_name} <= $highest_frequency) {
+          $self->{highest_frequencies}->{$vf_cache_name} = $highest_frequency;
+        }
+      }
+
+      elsif (/^G2P_tva_annotations/) {
+        my ($flag, $vf_cache_name, $transcript_stable_id, $annotations) = split/\t/;
+        $tva_annotation_data->{$vf_cache_name}->{$transcript_stable_id} = $annotations;
+      }
+
+      elsif (/^G2P_existing_vf_annotations/) {
+        my ($flag, $vf_cache_name, $annotations) = split/\t/;
+        $vf_annotation_data->{$vf_cache_name} = $annotations;
+      }
+
+      elsif (/^G2P_gene_data/) {
+        my ($flag, $gene_id, $ars, $xrefs) = split/\t/;
+        foreach my $ar (split(',', $ars)) {
+          $ar_data->{$gene_id}->{$ar} = 1;
+        }
+        $gene_xrefs->{$gene_id} = $xrefs;
+      }
+
+      elsif (/^G2P_in_vcf/) {
+        my ($flag, $gene_id) = split/\t/;
+        $vcf_g2p_genes->{$gene_id} = 1;
+      }
+
+      elsif (/^G2P_transcript_data/) {
+        my ($flag, $gene_id, $transcript_id, $is_canonical) = split/\t/;
+        $canonical_transcripts->{$transcript_id} = 1;
+      }
+      elsif (/^is_on_variant_include_list/) {
+        my ($flag, $vf_cache_name) =  split/\t/;
+        $self->{g2p_vf_cache}->{$vf_cache_name}->{is_on_variant_include_list} = 1;
+      }
+    }
+    $fh->close;
+  }
+  my $new_order = {};
+  foreach my $individual (keys %$individual_data) {
+    foreach my $gene_id (keys %{$individual_data->{$individual}}) {
+      foreach my $transcript_id (keys %{$individual_data->{$individual}->{$gene_id}}) {
+        foreach my $ar (keys %{$ar_data->{$gene_id}}) {
+          my $zyg2var = $individual_data->{$individual}->{$gene_id}->{$transcript_id};
+          my $fulfils_ar = $self->obeys_rule($ar, $zyg2var);
+          if (scalar keys %$fulfils_ar > 0) {
+            $complete_genes->{$gene_id} = 1;
+            $new_order->{$individual}->{$gene_id}->{$ar}->{$transcript_id} = $fulfils_ar;
+          }
+        }
+      }
+    }
+  }
+
+  return {
+    frequency_data => $frequency_data,
+    vf_annotation_data => $vf_annotation_data,
+    tva_annotation_data => $tva_annotation_data,
+    canonical_transcripts => $canonical_transcripts,
+    new_order => $new_order,
+    gene2ar => $ar_data,
+    gene_xrefs => $gene_xrefs,
+    in_vcf_file => $vcf_g2p_genes,
+    complete_genes => $complete_genes,
+    g2p_list => $all_g2p_genes,
+  };
+}
+
+sub get_highest_frequency {
+  my $frequencies = shift;
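+  # $frequencies is a comma-separated list of population=frequency pairs; return the largest value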
+  my $highest_frequency = 0;
+  foreach my $frequency_annotation (split(',', $frequencies)) {
+    my $frequency = (split('=', $frequency_annotation))[-1];
+    if ($frequency > $highest_frequency) {
+      $highest_frequency = $frequency;
+    }
+  }
+  return $highest_frequency;
+}
+
+sub store_population_names {
+  my $self = shift;
+  my $frequencies = shift;
+  foreach my $frequency_annotation (split(',', $frequencies)) {
+    my $population_name = (split('=', $frequency_annotation))[0];
+    $self->{population_names}->{$population_name} = 1;
+  }
+}
+
+sub get_start {
+  return $_[1]->{start};
+}
+
+sub get_end {
+  return $_[1]->{end};
+}
+
+sub parse_data {
+  my ($self, $line) = @_;
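+  # Use VEP's parse_line to convert a raw VCF line into a variation feature object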
+  my ($vf) = @{parse_line({format => 'vcf', minimal => 1}, $line)};
+  return $vf;
+}
+
+sub stats_html_head {
+    my $charts = shift;
+
+    my $html =<<SHTML;
+<html>
+<head>
+  <title>VEP summary</title>
+  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
+  <style>
+    a.inactive {
+      color: grey;
+      pointer-events:none;
+    }
+  </style>
+</head>
+<body>
+SHTML
+  return $html;
+}
+
+sub stats_html_tail {
+  my $script =<<SHTML;
+  <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
+  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
+  <script type="text/javascript" src="http://www.google.com/jsapi"></script>
+  <script>
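+    // When the checkbox is ticked, hide entries for non-canonical transcripts and grey out their links; otherwise restore them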
+    \$( "input[type=checkbox]" ).on( "click", function(){
+      if (\$('.target').is(':checked')) {
+        \$( "div.not_canonical" ).hide();
+        \$("a.not_canonical").addClass("inactive");
+      } else {
+        \$( "div.not_canonical" ).show();
+        \$("a.not_canonical").removeClass("inactive");
+      }
+    } );
+  \$(document).ready(function(){
+    \$('[data-toggle="tooltip"]').tooltip(); 
+  });
+  </script>
+SHTML
+  return "\n</div>\n$script\n</body>\n</html>\n";
+}
+
+1;
diff --git a/README.md b/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f84f2a8a10c1161e2160bf1abac60148ef63d683 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,118 @@
+# Trio whole exome service scripts and documentation
+
+## Resources and set up
+
+* [Software](docs/Software_installation.md)
+* [Resources](docs/Resources.md)
+* [Variant prioritization specific resources](docs/Setup_variant_prioritization.md)
+
+## SOPs
+
+* [Sample acquisition](docs/SOP_sample_transfer_from_EdGe_to_EPCC.md)
+* [Alignment and variant calling](docs/SOP_alignment_variant_annotation.md)
+* [Variant prioritization](docs/SOP_variant_prioritization.md)
+* [Archiving and cleanup](docs/SOP_archiving.md)
+
+## Current script list by category
+
+### Resource generation & acquisition
+
+```
+bcbio_gnomad_install.sh
+```
+
+### Sample acquisition
+
+```
+submit_trio_wes_aspera_download.sh
+submit_trio_wes_lftp_download.sh
+```
+
+### Alignment & variant calling
+
+#### Preparation & config file generation
+
+```
+trio_wes_prepare_bcbio_config_crf.sh
+trio_wes_prepare_bcbio_config.sh
+trio_whole_exome_create_parameter_files.pl
+```
+
+#### Alignment & variant calling
+
+```
+submit_trio_wes_bcbio.sh
+```
+
+#### Quality control
+
+```
+trio_whole_exome_parse_peddy_ped_csv.pl
+```
+
+### Prioritization
+
+```
+compare_indi_vars_by_version.py
+convert_DEC_to_v10.py
+decipher_NHS_WES_trio.sh
+downstream_setup.sh
+extract_BED_CCDS_DDG2P.py
+extract_trio_FAM_PRO_ID.py
+filter_LQ_GT.py
+full_process_NHS_WES_trio.sh
+gather_NHS_WES_aff_probands_results.sh
+gather_NHS_WES_quad_results.sh
+gather_NHS_WES_trio_results.sh
+generate_coverage_result_file.py
+generate_DEC_IGV.py
+generate_G2P_out_VCF.py
+NHS_WES_check_PED_aff_probands.py
+NHS_WES_check_PED_quad.py
+NHS_WES_extract_shared_vars.py
+NHS_WES_extract_trio_FAM_PRO_ID.py
+NHS_WES_filter_LQ_GT.py
+NHS_WES_generate_aff_sib_ped.py
+NHS_WES_generate_coverage_result_file.py
+NHS_WES_generate_DEC_IGV_aff_probands.py
+NHS_WES_generate_DEC_IGV.py
+NHS_WES_generate_DEC_IGV.py.v1
+NHS_WES_generate_DEC_IGV.py_wrong_gene_trans
+NHS_WES_generate_DEC_IGV_sib_from_quad.py
+NHS_WES_generate_DEC_IGV_trio_from_quad.py
+NHS_WES_generate_trio_ped.py
+NHS_WES_generate_trio_VCF.py
+NHS_WES_trio_cram_setup.sh
+NHS_WES_trio_delete_BAM.sh
+NHS_WES_trio_setup.sh
+old_downstream_setup.sh
+old_submit_downstream.sh
+old_submit_trio_wes_aspera_download.sh
+processing_setup.sh
+process_NHS_WES_aff_probands.sh
+process_NHS_WES_quad_full.sh
+process_NHS_WES_quad.sh
+process_NHS_WES_trio_before_BAMOUT.sh
+process_NHS_WES_trio.sh
+run_processing.sh
+submit_depth_of_coverage_MQ20_BQ20.sh
+submit_downstream.sh
+test_process_NHS_WES_trio.sh
+test_run_processing.sh
+```
+
+### Archiving & cleanup
+
+```
+submit_trio_wes_cram_compression.sh
+submit_trio_wes_family_checksums.sh
+submit_trio_wes_project_checksums.sh
+```
+
+### Configuration files
+
+```
+trio_whole_exome_bcbio_template.yaml
+trio_whole_exome_config.sh
+vcf_config.json.backup
+```
diff --git a/add_family_id_to_santosh_ped.pl b/add_family_id_to_santosh_ped.pl
deleted file mode 100644
index 8a4d7a24a3dc67b5d7ed68099212a750c8e88fe4..0000000000000000000000000000000000000000
--- a/add_family_id_to_santosh_ped.pl
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-while (my $line = <>)
-{
-	chomp $line;
-	my ($family, $sample, $father, $mother, $sex, $affected) = split(/\t/, $line);
-	$sample = sprintf("%s_%s", $sample, $family);
-	if ($father ne "0") { $father = sprintf("%s_%s", $father, $family); }
-	if ($mother ne "0") { $mother = sprintf("%s_%s", $mother, $family); }
-	$family = "99999_" . $family;
-
-     	printf "$family\t$sample\t$father\t$mother\t$sex\t$affected\n";
-}
-
diff --git a/add_samples_from_previous_CRF_runs.sh b/add_samples_from_previous_CRF_runs.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3f7ddc865a77aacaa2541178b5fd25eb5242d122
--- /dev/null
+++ b/add_samples_from_previous_CRF_runs.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# add_samples_from_previous_CRF_runs.sh <config.sh> <project_id> <version> <samples>
+# 
+#
+
+CONFIG_SH=$1
+PROJECT_ID=$2
+VERSION=$3
+SAMPLES=$4
+
+source $CONFIG_SH
+
+cd $PARAMS_DIR
+
+# create reads directory for project and symlink directory underneath
+mkdir -p $READS_DIR/$PROJECT_ID/symlinks
+
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+N=`wc -l $SAMPLES | awk '{ print $1 }'`
+
+for ((i = 1; i <= $N; i = i + 1))
+do
+  FAMILY_ID=`head -n $i $SAMPLES | tail -n 1 | cut -f 1`
+  SAMPLE=`head -n $i $SAMPLES | tail -n 1 | cut -f 2`
+  SEX=`head -n $i $SAMPLES | tail -n 1 | cut -f 3`
+  PHENOTYPE=`head -n $i $SAMPLES | tail -n 1 | cut -f 4`
+  ORIGINAL_PROJECT_ID=`head -n $i $SAMPLES | tail -n 1 | cut -f 5`
+  ORIGINAL_SAMPLE_ID=`head -n $i $SAMPLES | tail -n 1 | cut -f 6`
+  
+  PREFIX=${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
+  
+  for FILE in `ls $DOWNLOAD_DIR/$ORIGINAL_PROJECT_ID/*${ORIGINAL_SAMPLE_ID}*.gz`
+  do
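+    # append one row per FASTQ (file,sample,family,sex,phenotype,target) to the CSV used for bcbio config generation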
+    echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv
+  done
+
+done
diff --git a/add_samples_from_previous_EdGe_runs.sh b/add_samples_from_previous_EdGe_runs.sh
new file mode 100755
index 0000000000000000000000000000000000000000..745e20ba13cdb244e0b81ffcbc1fc572a49f6741
--- /dev/null
+++ b/add_samples_from_previous_EdGe_runs.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# add_samples_from_previous_EdGe_runs.sh <config.sh> <project_id> <version> <samples>
+# 
+#
+
+CONFIG_SH=$1
+PROJECT_ID=$2
+VERSION=$3
+SAMPLES=$4
+
+source $CONFIG_SH
+
+cd $PARAMS_DIR
+
+# create reads directory for project and symlink directory underneath
+mkdir -p $READS_DIR/$PROJECT_ID/symlinks
+
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+N=`wc -l $SAMPLES | awk '{ print $1 }'`
+
+for ((i = 1; i <= $N; i = i + 1))
+do
+  FAMILY_ID=`head -n $i $SAMPLES | tail -n 1 | cut -f 1`
+  SAMPLE=`head -n $i $SAMPLES | tail -n 1 | cut -f 2`
+  SEX=`head -n $i $SAMPLES | tail -n 1 | cut -f 3`
+  PHENOTYPE=`head -n $i $SAMPLES | tail -n 1 | cut -f 4`
+  ORIGINAL_PROJECT_ID=`head -n $i $SAMPLES | tail -n 1 | cut -f 5`
+  ORIGINAL_SAMPLE_ID=`head -n $i $SAMPLES | tail -n 1 | cut -f 6`
+  
+  PREFIX=${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
+  
+  # create symlinks for problematic filenames
+  mkdir -p $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE
+  for FILE in `ls $DOWNLOAD_DIR/$ORIGINAL_PROJECT_ID/*/*/*$ORIGINAL_SAMPLE_ID*/*_1_*_1.fastq.gz`
+  do
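+    # rewrite the lane token _1_ as _one_ so the trailing _1/_2 read-pair suffix stays unambiguous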
+    newname=`basename $FILE | sed -e 's/_1_/_one_/'`
+    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
+  done
+  for FILE in `ls $DOWNLOAD_DIR/$ORIGINAL_PROJECT_ID/*/*/*$ORIGINAL_SAMPLE_ID*/*_1_*_2.fastq.gz`
+  do
+    newname=`basename $FILE | sed -e 's/_1_/_one_/'`
+    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
+  done
+  for FILE in `ls $DOWNLOAD_DIR/$ORIGINAL_PROJECT_ID/*/*/*$ORIGINAL_SAMPLE_ID*/*_2_*_1.fastq.gz`
+  do
+    newname=`basename $FILE | sed -e 's/_2_/_two_/'`
+    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
+  done
+  for FILE in `ls $DOWNLOAD_DIR/$ORIGINAL_PROJECT_ID/*/*/*$ORIGINAL_SAMPLE_ID*/*_2_*_2.fastq.gz`
+  do
+    newname=`basename $FILE | sed -e 's/_2_/_two_/'`
+    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
+  done
+
+#  for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz`
+#  do
+#    echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv
+#  done
+
+done
diff --git a/check_quad_PED.py b/check_quad_PED.py
new file mode 100755
index 0000000000000000000000000000000000000000..2f81656bfd4158954c11e5630e98dacfcdd0abbb
--- /dev/null
+++ b/check_quad_PED.py
@@ -0,0 +1,74 @@
+#	input:	a PED file for a quad family - 2 affected kids and 2 unaffected parents
+#
+#	checks that there are exactly two kids in the PED file and both are affected
+#	checks that there are exactly two parents in the PED file and both are unaffected
+#	if any problems SystemExit(1) - the value of $? to be checked by the bash script - if 0: all is well, if 1: the PED file failed the checks
+#
+#	output: for other family types, maybe write out a file with a list of all affected probands and a list of all unaffected parents ?
+#
+#       Author: MH
+#       last modified: SEPT 15, 2020
+
+
+
+
+import sys
+import os
+
+
+def go(in_file):
+
+    AFF_PROBANDS = []
+    UNAFF_PARENTS = []
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        pro_fam_id = data[1]
+        par_1 = data[2]
+        par_2 = data[3]
+        aff = int(data[5])
+
+
+        if (par_1 != "0") and (par_2 != "0"):	# a proband
+            if aff != 2:			# not affected proband
+                print "ERROR: Found unaffected proband"
+                print line
+                raise SystemExit(1)
+            else:
+                AFF_PROBANDS.append(pro_fam_id)
+
+        if (par_1 == "0") and (par_2 == "0"):	# a parent
+            if aff != 1:                        # affected parent
+                print "ERROR: Found affected parent"
+                print line
+                raise SystemExit(1)
+            else:
+                UNAFF_PARENTS.append(pro_fam_id)
+
+
+    if len(AFF_PROBANDS) != 2:
+        print "ERROR: Could not find exactly 2 affected probands"
+        raise SystemExit(1)
+
+    if len(UNAFF_PARENTS) != 2:
+        print "ERROR: Could not find exactly 2 unaffected parents"
+        raise SystemExit(1)
+
+    in_han.close()
+    print "PED file checks: success"
+    print "Found %s affected probands with %s unaffected parents in %s" % (len(AFF_PROBANDS),len(UNAFF_PARENTS),in_file)
+    sys.stdout.flush()
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 2:
+        go(sys.argv[1])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_check_PED_quad.py a_ped_file"
+        raise SystemExit
+
diff --git a/check_shared_PED.py b/check_shared_PED.py
new file mode 100755
index 0000000000000000000000000000000000000000..faf36b91ea3b80e08dde888f95e5839d35b8d032
--- /dev/null
+++ b/check_shared_PED.py
@@ -0,0 +1,64 @@
+#	input:	a PED file for a family with affected and related probands only
+#
+#	checks that all individuals in the PED file are affected
+#	checks that the parents for all individuals are missing (i.e., ID = 0)
+#	if any problems SystemExit(1) - the value of $? to be checked by the bash script - if 0: all is well, if 1: the PED file failed the checks
+#
+#	output: for other family types, maybe write out a file with a list of all affected probands and a list of all unaffected parents ?
+#
+#       Author: MH
+#       last modified: FEB 25, 2020
+
+
+
+
+import sys
+import os
+
+
+def go(in_file):
+
+    AFF_PROBANDS = []
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        pro_fam_id = data[1]
+        par_1 = data[2]
+        par_2 = data[3]
+        aff = int(data[5])
+
+        if (par_1 != "0") or (par_2 != "0"):
+            print "ERROR: Found a proband with a parent"
+            print line
+            raise SystemExit(1)
+
+        if aff != 2:
+            print "ERROR: Found unaffected proband"
+            print line
+            raise SystemExit(1)
+
+        if pro_fam_id not in AFF_PROBANDS:
+            AFF_PROBANDS.append(pro_fam_id)
+        else:
+            print "ERROR: Found duplicate proband"
+            print line
+            raise SystemExit(1)
+
+    in_han.close()
+    print "PED file checks: success"
+    print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file)
+    sys.stdout.flush()
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 2:
+        go(sys.argv[1])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_check_PED_aff_probands.py a_ped_file"
+        raise SystemExit
+
diff --git a/convert_DEC_to_v10.py b/convert_DEC_to_v10.py
index 061f8b90ef71fb75e7fd2da85903cf59cdb07b05..f01d316337f5a2748840aa6ca4b35fc539078b34 100644
--- a/convert_DEC_to_v10.py
+++ b/convert_DEC_to_v10.py
@@ -21,7 +21,7 @@ def go(inout_dir,id):		# the folder where the bulk upload files are strored; id
 
     # create the workbook
     workbook = xlsxwriter.Workbook(out_file)
-    
+
     # create the worksheet
     worksheet = workbook.add_worksheet('Sequence Variants')
 
@@ -40,7 +40,7 @@ def go(inout_dir,id):		# the folder where the bulk upload files are strored; id
             if row[0] == 'Internal reference number or ID':      # ignore the header line
                 continue
 
-            cntr += 1 
+            cntr += 1
             id = str(row[0])
             shared = 'NHS-SCE'
             assembly = 'GRCh38'
@@ -62,7 +62,7 @@ def go(inout_dir,id):		# the folder where the bulk upload files are strored; id
             evid = ''
             cont = ''
             gt_groups = ''
-            data = (id,shared,assembly,HGVS,chr,start,ref,alt,gene,trans,inter,genotype,inher,patho,evid,cont,gt_groups)   
+            data = (id,shared,assembly,HGVS,chr,start,ref,alt,gene,trans,inter,genotype,inher,patho,evid,cont,gt_groups)
 
             # write it
             worksheet.write_row(cntr,0,data)
diff --git a/delete_BAM.sh b/delete_BAM.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c765ff4a338c9a576e5afca7397f4faf8b7d8800
--- /dev/null
+++ b/delete_BAM.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=01:00:00
+#SBATCH --job-name=delete_bam
+#SBATCH --output=delete_bam.%A_%a.out
+#SBATCH --error=delete_bam.%A_%a.err
+
+
+### Setup the folder structure for the downstream analysis ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source VCF, BAM and PED files                 i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID                                    e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+
+
+
+
+
+
+##############################################################
+###   Delete individual BAMs (and indexes) iff CRAM found  ###
+##############################################################
+
+# make sure we are reading the data from the exact version, batch & plate ID
+SOURCE_VCF_DIRS=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_*
+
+
+for S_VCF_DIR in ${SOURCE_VCF_DIRS}
+do
+  VCF_DIR_NAME="${S_VCF_DIR##*/}"
+  IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
+  FAM_ID=${my_arr[-1]}
+  echo "  FAM_ID = ${FAM_ID}"
+
+  # identify all folders (one for each individual) for this family containing cram/bam files (format: <INDI_ID>_<FAM_ID>)
+  cd ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAM_ID}
+  for ITEM in `ls -1`
+  do
+    if test -d $ITEM && [[ "$ITEM" == *"_"* ]]
+    then
+      echo "    $ITEM is a CRAM/BAM folder..."
+      BAM=${ITEM}/${ITEM}-ready.bam
+      CRAM=${ITEM}/${ITEM}-ready.cram
+
+      # check that the CRAM file exists; only then delete the BAM file and its index
+      if [[ -f "$CRAM" ]]
+      then
+        echo "      Found ${CRAM}"
+        echo "      Removing ${BAM}"
+        rm ${BAM}
+        echo "      Removing ${BAM}.bai"
+        rm ${BAM}.bai  
+      else
+        echo "      ERROR: CRAM file ${CRAM} not found - have not deleted BAM ${BAM}!"
+      fi
+    fi
+  done
+done
+
+
+
+
+echo ""
+echo ""
+echo "OK: Deletion of BAM files and their indexes for PROJECT_ID = $PROJECT_ID successful"
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/Resources_ultra2.md b/docs/Resources.md
similarity index 87%
rename from docs/Resources_ultra2.md
rename to docs/Resources.md
index ccac6d6c194970309f0c9e7d30479a59d9959ce9..10fa72c27290f36819983b08271541cc6f7d37ec 100644
--- a/docs/Resources_ultra2.md
+++ b/docs/Resources.md
@@ -20,4 +20,4 @@ Retained for reference in `/home/u035/u035/shared/resources/exome_targets/archiv
 
 ## Resources for variant prioritization
 
-See: https://git.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome/edit/master/docs/Setup_variant_prioritization.md
\ No newline at end of file
+See [Setup for variant prioritization](Setup_variant_prioritization.md).
diff --git a/docs/SOP_alignment_variant_annotation.md b/docs/SOP_alignment_variant_annotation.md
index fbfa22416be2c997d32a435813fbcc80c7ca24b4..c7046b312c9a1844c2d27df625df9e0b98407d5b 100644
--- a/docs/SOP_alignment_variant_annotation.md
+++ b/docs/SOP_alignment_variant_annotation.md
@@ -1,73 +1,62 @@
-# Standard operating procedure - Alignment, variant calling, and annotation of trio whole exome samples at the Edinburgh Parallel Computing Centre
+# Standard operating procedure - Alignment, variant calling, annotation, compression, and storage of trio whole exome samples at the Edinburgh Parallel Computing Centre
 
-This SOP applies to batches of family/trio samples where trio whole exome sequencing has been performed by Edinburgh Genomics (EdGE) or the Edinburgh Clinical Research Facility (ECRF). It assumes that data has been successfully transferred to the Edinburgh Parallel Computing Centre (EPCC) (see SOP: Transfer of whole exome sequencing samples from Edinburgh Genomics to Edinburgh Parallel Computing Centre). Scripts are version controlled on the University of Edinburgh gitlab server gitlab.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome. Request access by e-mail: alison.meynert@igmm.ed.ac.uk.
+This SOP applies to batches of family/trio samples where trio whole exome sequencing has been performed by Edinburgh Genomics (EdGE) or the Edinburgh Clinical Research Facility (ECRF). It assumes that data has been successfully transferred to the Edinburgh Parallel Computing Centre (EPCC) (see SOP: Transfer of whole exome sequencing samples from Edinburgh Genomics to Edinburgh Parallel Computing Centre). Scripts are version controlled on the University of Edinburgh gitlab server `gitlab.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome`. Request access by e-mail: alison.meynert@igmm.ed.ac.uk.
 
 ## Definitions
 
 In this document, N is the total number of samples in the project, and X is the number of families.
 
-Text in angle brackets, e.g. <project> indicates variable parameters. A variable parameter such as <family1-X> indicates that there are X instances of the parameter, each with their own unique value.
+Text in angle brackets, e.g. `<project>` indicates variable parameters. A variable parameter such as `<family1-X>` indicates that there are X instances of the parameter, each with their own unique value.
 
 ## Software and data requirements
 
-The analysis is run with the bcbio pipeline (version 1.2.3) located at /home/u035/project/software/bcbio. All genome reference and annotation data resources are contained within the genomes/Hsapiens/hg38 subfolder.
+The analysis is run with the bcbio pipeline (version 1.2.8) located at `/home/u035/u035/shared/software/bcbio`. All genome reference and annotation data resources are contained within the `genomes/Hsapiens/hg38` subfolder.
 
-The TWIST target BED file is at: /home/u035/project/resources/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
+The TWIST target BED file is at: `/home/u035/u035/shared/resources/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed`. See [resources](Resources.md).
 
-To generate the target BED file, first copy the file Twist_Exome_RefSeq_targets_hg38.bed from NHS Clinical Genetics Services to /home/u035/project/resources on ultra, then pad it by 15bp each side.
-
-```
-cd /home/u035/project/resources
-source ../scripts/trio_whole_exome_config.sh
-
-bedtools slop -g $REFERENCE_GENOME.fai -i Twist_Exome_RefSeq_targets_hg38.bed -b 15 | \
-  bedtools merge > Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
-```
+The tracking file is maintained on the IGC datastore at `/exports/igmm/datastore/IGMM-VariantAnalysis/trio_whole_exome/Batch_status.xlsx`.
 
 ## Input
 
 ### PED file
 
-A 6-column tab-delimited PED/FAM format file (https://www.cog-genomics.org/plink2/formats#fam) is required for each batch, describing the relationships between the sampled individuals, their sex, and their affected/unaffected status.
+A 6-column tab-delimited [PED/FAM format file](https://www.cog-genomics.org/plink2/formats#fam) is required for each batch, describing the relationships between the sampled individuals, their sex, and their affected/unaffected status.
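+
+A minimal illustrative trio (hypothetical IDs; column 5 is sex: 1=male, 2=female; column 6 is status: 1=unaffected, 2=affected):
+
+```
+FAM1	PROBAND1_FAM1	FATHER1_FAM1	MOTHER1_FAM1	1	2
+FAM1	FATHER1_FAM1	0	0	1	1
+FAM1	MOTHER1_FAM1	0	0	2	1
+```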
 
 
 ### Sample id format
 
-The sequencing reads for the samples delivered from EdGE are identified by folder name and as the 8th column in the tab-delimited text file file_list.tsv inside the dated batch folder. The identifiers are in the format:
+The sequencing reads for the samples delivered from EdGE are identified by folder name and as the 8th column in the tab-delimited text file `file_list.tsv` inside the dated batch directory. The identifiers are in the format:
 
 ```
 <pcr_plate_id>_<indiv_id>_<family_id><suffix>
 ```
 
-The suffix identifies the exome kit, e.g. "_IDT-A". These identifiers are referenced below in the output file structure.
+The suffix identifies the exome kit, e.g. `_WESTwist_IDT-A`. These identifiers are referenced below in the output file structure.
 
 ### Reads - Edinburgh Genomics
 
-A set of paired end FASTQ files (designated by R1 or R2 suffixes), possibly more than one pair per sample. Each sample's files are in its own folder. The input files will be in the folder /scratch/u035/project/trio_whole_exome/data and follow the structure in *Figure 1*.
+A set of paired end FASTQ files (designated by R1 or R2 suffixes), possibly more than one pair per sample. Each sample's files are in its own folder. The input files will be in the folder `/home/u035/u035/shared/data` and follow the structure in *Figure 1*. Older deliveries contained the `<dated_batch>` folder within a `raw_data` folder.
 
 ```
 <EdGE_project_id>/
+  +---<dated_batch>/
+  |   +---<sample_id>/
+  |   |   +---*.fastq.count
+  |   |   +---*.fastq.gz
+  |   +---file_list.tsv
+  |   +---md5sums.txt
+  +---<dated_batch>_tree.txt
+  +---Information.txt
   +---md5_check.txt
-  +---raw_data/
-  |   +---<dated_batch>/
-  |   |   +---<EdGE_sample_id>/
-  |   |   |   +---<fastq_id>_R1.fastq.count
-  |   |   |   +---<fastq_id>_R1.fastq.gz
-  |   |   |   +---<fastq_id>_R2.fastq.count
-  |   |   |   +---<fastq_id>_R2.fastq.gz
-  |   |   +---file_list.tsv
-  |   |   +---md5sums.txt
-  |   +---<dated_batch>_tree.txt
-  |   +---Information.txt
-```
-*Figure 1.* File name and directory structure for a batch of sequencing from Edinburgh Genomics. The EdGE project id takes the format XXXXX_Lastname_Firstname, identifying the NHS staff member who submitted the samples for sequencing. The dated batch is in the format yyyymmdd – in general we expect there to be only one of these per EdGE project id. The FASTQ file id relates to the sequencing run information and does not contain any information about the sample itself.
+```
+*Figure 1.* File name and directory structure for a batch of sequencing from Edinburgh Genomics. The EdGE project id takes the format `XXXXX_Lastname_Firstname`, identifying the NHS staff member who submitted the samples for sequencing. The dated batch is in the format `yyyymmdd` – in general we expect there to be only one of these per EdGE project id. The FASTQ file names relate to the sequencing run information and do not contain any information about the sample itself.
 
 ### Reads - Edinburgh Clinical Research Facility
 
-A set of paired end FASTQ files (designated by R1 or R2 suffixes), generally one pair per sample. The input files will be in the folder /scratch/u035/project/trio_whole_exome/data and follow the structure in *Figure 2*.
+A set of paired end FASTQ files (designated by R1 or R2 suffixes), generally one pair per sample. The input files will be in the folder `/home/u035/u035/shared/data` and follow the structure in *Figure 2*.
 
 ```
-<EdGE_project_id>/
+<ECRF_project_id>/
  +---<internal_id>-md5.txt
   +---<pcr_plate_id>_<indiv_id>_<family_id><suffix>_S<i>_L001_R1_001.fastq.gz
   +---<pcr_plate_id>_<indiv_id>_<family_id><suffix>_S<i>_L001_R2_001.fastq.gz
@@ -78,81 +67,55 @@ A set of paired end FASTQ files (designated by R1 or R2 suffixes), generally one
 
 ## Working directories
 
-The project working directories will be in the folder /scratch/u035/project/trio_whole_exome/analysis and follow the structure in *Figure 3*.
+The project working directories will be in the folder `/home/u035/u035/shared/analysis` and follow the structure in *Figure 3*.
 
 ```
     config – bcbio configuration files in YAML format
     logs – PBS job submission log files
-    output – output to be passed to variant prioritization and archiving
     params – parameters for PBS job submission
-    reads – symlinks to input FASTQ files
+    reads – symlinks/merged versions of input FASTQ files
     work – bcbio working folder
 ```
 *Figure 3.* Project working directories.
 
 ## Project configuration
 
-A configuration script sets environment variables common to scripts used in this SOP. This is stored at /home/u035/project/scripts/trio_whole_exome_config.sh.
-
-```
-#!/usr/bin/bash
-#
-# Basic configuration options for trio WES pipeline
-#
-
-SCRIPTS=/home/u035/project/scripts
-BCBIO_TEMPLATE=$SCRIPTS/trio_whole_exome_bcbio_template.yaml
-TARGET=/home/u035/project/resources/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
-DOWNLOAD_DIR=/scratch/u035/project/trio_whole_exome/data
-REFERENCE_GENOME=/home/u035/project/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
-
-BASE=/scratch/u035/project/trio_whole_exome/analysis
-PARAMS_DIR=$BASE/params
-READS_DIR=$BASE/reads
-CONFIG_DIR=$BASE/config
-WORK_DIR=$BASE/work
-OUTPUT_DIR=$BASE/output
-
-ARCHIVE_DIR=/archive/u035/trio_whole_exome
-
-export PATH=/home/u035/project/software/bcbio/tools/bin:$PATH
-````
+A [configuration script](../trio_whole_exome_config.sh) sets environment variables common to scripts used in this SOP.
 
 ## Template for bcbio configuration
 
-Bcbio requires a template file in YAML format to define the procedures run in the pipeline. The template for this project is stored at /home/u035/project/scripts/trio_whole_exome_bcbio_template.yaml.
-
-```
-details:
-- algorithm:
-    platform: illumina
-    quality_format: standard
-    aligner: bwa
-    mark_duplicates: true
-    realign: false
-    recalibrate: true
-    effects: vep
-    effects_transcripts: all
-    variantcaller: gatk-haplotype
-    indelcaller: false
-    remove_lcr: true
-    tools_on:
-    - vep_splicesite_annotations
-  analysis: variant2
-  genome_build: hg38
-upload:
-  dir: /scratch/u035/project/trio_whole_exome/analysis/output
-```
+Bcbio requires a [template file in YAML format](../trio_whole_exome_bcbio_template.yaml) to define the procedures run in the pipeline.
 
 ## Output
 
 Per sample: BAM file of aligned reads against the hg38 genome assembly
 Per family: Annotated VCF file and QC report
 
-Output will be in the folder /scratch/u035/project/trio_whole_exome/analysis/output and follow the structure in *Figure 4* (with multiple instances of the indiv_id sub directories, one per sequenced family member.). The qc sub-directories are not enumerated, and automatically generated index files are not listed for brevity. An additional directory at the root of the output folder called “qc” will contain the MultiQC reports generated for an entire batch.
+Output will be in the folder `/home/u035/u035/shared/results/<short_project_id>_<version>`, where `<short_project_id>` is the numeric prefix of `<project_id>`, and follows the structure in *Figure 4* and *Figure 5* (with multiple instances of the `indiv_id` subdirectories, one per sequenced family member). The qc sub-directories are not enumerated, and automatically generated index files are not listed for brevity.
+
+```
+<short_project_id>_<version>/
++---config
+|   +---<short_project_id>_<version>_<pcr_plate_id>_<family_id>.yaml
++---families
+|   +---<analysis_date>_<short_project_id>_<pcr_plate_id>_<family_id>
++---params
+|   +---<ped_file>
+|   +---<project_id>_<pcr_plate_id>_<family_id>.ped
+|   +---<project_id>.ped
+|   +---<project_id>_<version>_<date>.log
+|   +---<short_project_id>_<pcr_plate_id>_<family_id>.csv
++---prioritization
+|   +---<priority_dirs>
++---qc
+|   +---<short_project_id>_<version>.ped_check.txt
+|   +---<short_project_id>_<version>_qc_report_data
+|   +---<short_project_id>_<version>_qc_report.html
+```
+*Figure 4.* File name and output directory structure for a batch of sequencing.
 
 ```
-<analysis_date>_<EdGE_project_id>_<pcr_plate_id>_<family_id>/
+<analysis_date>_<short_project_id>_<pcr_plate_id>_<family_id>/
   +---<indiv_id>_<family_id>/
   |   +---<indiv_id>_<family_id>-callable.bed
   |   +---<indiv_id>_<family_id>-ready.bam
@@ -171,113 +134,147 @@ Output will be in the folder /scratch/u035/project/trio_whole_exome/analysis/out
   +---programs.txt
   +---project-summary.yaml
 ```
-*Figure 4.* File name and output directory structure for each family in a batch of sequencing.
+*Figure 5.* File name and output directory structure for each family in a batch of sequencing.
 
 ## Procedure
 
-1. Set environment variable project_id and general configuration variables.
+1. Set the environment variable `project_id` and general configuration variables. All steps below assume that these have been set. Version should be "v1" by default for the first analysis run of a batch, "v2" etc. for subsequent runs.
 
 ```
-project_id=<EdGE_project_id>
-source /home/u035/project/scripts/trio_whole_exome_config.sh
+project_id=<project_id>
+short_project_id=`echo $project_id | cut -f 1 -d '_'`
+version=<version>
+source /home/u035/u035/shared/scripts/trio_whole_exome_config.sh
 ```
 
-2. Copy the PED file for the batch to the params folder in the working area. It should be named <EdGE_project_id>.ped, relating it to the input directory for the FASTQ files. If the PED file given was not named in this way, don’t rename it, create a symlink with the correct name.
+2. Copy the PED file for the batch to the params folder in the working area. It should be named `<project_id>.ped`, relating it to the input directory for the FASTQ files. If the PED file given was not named in this way, don’t rename it, copy it instead.
 
 ```
 cd $PARAMS_DIR
 ped_file=<input_ped_file>
-ln -s $ped_file $project_id.ped
+cp $ped_file $project_id.ped
 ```
 
-3. In the params folder, create the symlinks to the reads and the bcbio configuration files. If specifying a common sample suffix, ensure it includes any joining characters, e.g. “-“ or “_”, so that the family identifier can be cleanly separated from the suffix. Get the number of families from the batch. Version should be "v1" by default for the first analysis run of a batch, "v2" etc for subsequent runs.
-
+3. In the params folder, create the symlinks to the reads and the bcbio configuration files. If specifying a common sample suffix, ensure it includes any joining characters, e.g. “-“ or “_”, so that the family identifier can be cleanly separated from the suffix. Get the number of families from the batch.
 
-### Edinburgh Genomics data
+*Edinburgh Genomics data*
 
 ```
 cd $PARAMS_DIR
-version=<version>
 sample_suffix=<sample_suffix>
-/home/u035/project/scripts/prepare_bcbio_config.sh \
-  /home/u035/project/scripts/trio_whole_exome_config.sh \
-  $project_id $version $sample_suffix &> ${version}_${project_id}.log
+$SCRIPTS/trio_wes_prepare_bcbio_config.sh \
+  $SCRIPTS/trio_whole_exome_config.sh \
+  $project_id $version $sample_suffix &> ${project_id}_${version}_`date +%Y%m%d%H%M`.log
 X=`wc -l $PARAMS_DIR/$project_id.family_ids.txt | awk '{print $1}'`
 ```
 
-### Edinburgh Clinical Research Facility data
+*Edinburgh Clinical Research Facility data*
 
 ```
 cd $PARAMS_DIR
-version=<version>
 sample_suffix=<sample_suffix>
-/home/u035/project/scripts/prepare_bcbio_config_crf.sh \
-  /home/u035/project/scripts/trio_whole_exome_crf_config.sh \
-  $project_id $version $sample_suffix &> ${version}_${project_id}.log
+$SCRIPTS/trio_wes_prepare_bcbio_config_crf.sh \
+  $SCRIPTS/trio_whole_exome_config.sh \
+  $project_id $version $sample_suffix &> ${project_id}_${version}_`date +%Y%m%d%H%M`.log
 X=`wc -l $PARAMS_DIR/$project_id.family_ids.txt | awk '{print $1}'`
 ```
 
 4. Submit the bcbio jobs from the logs folder. See above for version.
 
 ```
-cd /home/u035/project/trio_whole_exome/analysis/logs
-qsub -v PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=/home/u035/project/scripts/trio_whole_exome_config.sh \
-  -J 1-$X -N trio_whole_exome_bcbio.$project_id \
-  /home/u035/project/scripts/submit_bcbio_trio_wes.sh
+cd $LOGS_DIR
+sbatch --export=PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=$SCRIPTS/trio_whole_exome_config.sh \
+  --array=1-$X $SCRIPTS/submit_trio_wes_bcbio.sh
 ```
 
 If all log files end in ‘Finished’ or ‘Storing in local filesystem’ for a metadata file (occasionally the job completes without quite outputting all of the ‘Storing’ messages), the batch is complete. If this is not the case, resubmit the incomplete jobs – they will resume where they left off.
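+
+To list logs that have not reached one of these end states (a sketch, assuming per-task `.out` log files are written to the logs directory by the submission script):
+
+```
+cd $LOGS_DIR
+# grep -L prints the names of files that match neither pattern
+grep -L -e 'Finished' -e 'Storing in local filesystem' *.out
+```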
 
-5. Generate a MultiQC report for all files in the batch.
+5. Check the output directory to make sure all family output folders were moved into the `families` subdirectory. This should happen automatically at the end of the `submit_trio_wes_bcbio.sh` script but occasionally fails.
+
+```
+cd $OUTPUT_DIR/${short_project_id}_${version}
+mv *${short_project_id}* families/
+```
+
+6. Generate a MultiQC report for all files in the batch.
 
 ```
-source /home/u035/project/scripts/trio_whole_exome_config.sh
-cd /scratch/u035/project/trio_whole_exome/analysis/output
-/home/u035/project/software/bcbio/anaconda/bin/multiqc --title "Trio whole exome QC report: $project_id" \
-  --outdir qc \
-  --filename ${version}_${project_id}_qc_report.html \
-  *$version*$project_id*
+cd $OUTPUT_DIR/${short_project_id}_${version}/families
+mkdir -p ../qc
+multiqc --title "Trio whole exome QC report: $short_project_id $version" \
+  --outdir ../qc \
+  --filename ${short_project_id}_${version}_qc_report.html .
 ```
 
-6. Check the parent-child relationships predicted by peddy match the pedigree information. There should be no entries in the <EdGE_project_id>.ped_check.txt file that do not end in ‘True’. If there are, report these back to the NHS Clinical Scientist who generated the PED file for this batch. The batch id is the 5 digit number that prefixes all the family ids in the output.
+7. Check that the parent-child relationships predicted by peddy match the pedigree information. There should be no entries in the `<short_project_id>_<version>.ped_check.txt` file that do not end in ‘True’. If there are, report these back to the NHS Clinical Scientist who generated the PED file for this batch. The `<batch_id>` is the 5 digit number that prefixes all the family ids in the output. Move to the [variant prioritization SOP](SOP_variant_prioritization.md).
 
 ```
-cd /scratch/u035/project/trio_whole_exome/analysis/output
-perl /home/u035/project/scripts/trio_whole_exome_parse_peddy_ped_csv.pl \
-  --output /scratch/u035/project/trio_whole_exome/analysis/output \
-  --project $project_id \
+cd $OUTPUT_DIR/${short_project_id}_${version}/families
+batch_id=<batch_id>
+
+perl $SCRIPTS/trio_whole_exome_parse_peddy_ped_csv.pl \
+  --output ../qc/${short_project_id}_${version}.ped_check.txt \
+  --project $short_project_id \
   --batch $batch_id \
   --version $version \
-  --ped /scratch/u035/project/trio_whole_exome/analysis/params/$project_id.ped
-grep -v False$ qc/${version}_$project_id.ped_check.txt
+  --ped ../params/$project_id.ped \
+  --families .
+grep -v False$ ../qc/${short_project_id}_${version}.ped_check.txt
 ```
 
-7. Clean up the output directory.
+8. Compress BAM files to CRAM and compare the two files. The output log files should be empty and the files `<sample>.cram`, `<sample>.cram.crai`, and `<sample>.cram.flagstat.txt` should be present for each sample.
 
 ```
-cd /home/u035/project/trio_whole_exome/
-mkdir ${version}_${project_id}
-mv *${version}_${project_id}* ${version}_${project_id}/
+cd $LOGS_DIR
+sbatch --export=PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=$SCRIPTS/trio_whole_exome_config.sh \
+  --array=1-$X $SCRIPTS/submit_trio_wes_cram_compression.sh
 ```
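+
+To spot-check the compression output (a sketch; the BAM-deletion script in this repository expects CRAMs named `<sample>-ready.cram` next to the original BAMs):
+
+```
+cd $OUTPUT_DIR/${short_project_id}_${version}/families
+find . -name '*-ready.cram' | wc -l          # expect one per sample
+find . -name '*.cram.flagstat.txt' -size 0   # empty flagstat files indicate failures
+```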
 
-8. Clear the work directory and move the log files to the complete sub-directory.
+9. Calculate md5 checksums on the per-family files, excluding the BAM files. Creates the file `md5sum.txt` at the root of each family’s output directory. Check the files with the calculated md5sums. They should total 30 lines per sample plus 26 lines per family. The log files should be empty. When complete, move the family ids text file into the results folder for the project.
 
 ```
-cd /scratch/u035/project/trio_whole_exome/work
-rm -r *
-cd /home/u035/project/trio_whole_exome/logs
-mv trio_whole_exome_bcbio.$project_id* complete/
+cd $LOGS_DIR
+sbatch --export=PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=$SCRIPTS/trio_whole_exome_config.sh \
+  --array=1-$X $SCRIPTS/submit_trio_wes_family_checksums.sh
+cd $OUTPUT_DIR/${short_project_id}_${version}/families
+wc -l */md5sum.txt
+
+cd $PARAMS_DIR
+mv $project_id.family_ids.txt $OUTPUT_DIR/${short_project_id}_${version}/params/
+```
+
+10. Wait for prioritization to be completed. Calculate md5 checksums on the remaining project files, excluding the `families` sub-directory. Creates the file `md5sum.txt` at the root of the project output directory.
+
+```
+sbatch --export=PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=$SCRIPTS/trio_whole_exome_config.sh \
+  $SCRIPTS/submit_trio_wes_project_checksums.sh
+```
+
+11. Remove the BAM files from the results.
+
+```
+cd $OUTPUT_DIR/${short_project_id}_${version}
+rm families/*/*/*.bam*
 ```
 
-9. Copy the MultiQC report to the IGMM-VariantAnalysis area on the IGMM datastore.
+12. Clean up. Clear the work and logs directories. Move the bcbio YAML configuration files into the results folder for the project. Retain reads for samples in families where one sample has failed QC, using a list `retain_for_rerun.txt`. These will likely be required for later runs, and it is simpler to regenerate config YAML files if it is not necessary to re-do symlinks/read merging.
 
 ```
-ssh eddie3.ecdf.ed.ac.uk
-qlogin -q staging
-cd /exports/igmm/datastore/IGMM-VariantAnalysis/documentation/trio_whole_exome/qc
+cd $WORK_DIR
+rm -r *
+
+cd $LOGS_DIR
+rm -r *
 
-user=<ultra_user_id>
-project_id=<EdGE_project_id>
+cd $PARAMS_DIR
+rm -r *
 
-scp $user@ultra.epcc.ed.ac.uk:/scratch/u035/project/trio_whole_exome/analysis/output/qc/${version}_${project_id}_qc_report.html ./
+mkdir -p $OUTPUT_DIR/${short_project_id}_${version}/config/
+mv $CONFIG_DIR/${short_project_id}_${version}*.yaml $OUTPUT_DIR/${short_project_id}_${version}/config/
+
+cd /home/u035/u035/shared/analysis/reads/${project_id}
+rm `ls | grep -v -f retain_for_rerun.txt`
 ```
+
+13. Update the batch status spreadsheet. 
diff --git a/docs/SOP_alignment_variant_annotation_ultra2.md b/docs/SOP_alignment_variant_annotation_ultra2.md
deleted file mode 100644
index 48009421640d96350133131faced8d24674b87a8..0000000000000000000000000000000000000000
--- a/docs/SOP_alignment_variant_annotation_ultra2.md
+++ /dev/null
@@ -1,279 +0,0 @@
-# Standard operating procedure - Alignment, variant calling, and annotation of trio whole exome samples at the Edinburgh Parallel Computing Centre
-
-This SOP applies to batches of family/trio samples where trio whole exome sequencing has been performed by Edinburgh Genomics (EdGE) or the Edinburgh Clinical Research Facility (ECRF). It assumes that data has been successfully transferred to the Edinburgh Parallel Computing Centre (EPCC) (see SOP: Transfer of whole exome sequencing samples from Edinburgh Genomics to Edinburgh Parallel Computing Centre). Scripts are version controlled on the University of Edinburgh gitlab server `gitlab.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome`. Request access by e-mail: alison.meynert@igmm.ed.ac.uk.
-
-## Definitions
-
-In this document, N is the total number of samples in the project, and X is the number of families.
-
-Text in angle brackets, e.g. `<project>` indicates variable parameters. A variable parameter such as `<family1-X>` indicates that there are X instances of the parameter, each with their own unique value.
-
-## Software and data requirements
-
-The analysis is run with the bcbio pipeline (version 1.2.8) located at `/home/u035/u035/shared/software/bcbio`. All genome reference and annotation data resources are contained within the `genomes/Hsapiens/hg38` subfolder.
-
-The TWIST target BED file is at: `/home/u035/u035/shared/resources/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed`. See [resources](https://git.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome/blob/master/docs/Resources_ultra2.md).
-
-## Input
-
-### PED file
-
-A 6-column tab-delimited [PED/FAM format file](https://www.cog-genomics.org/plink2/formats#fam) is required for each batch, describing the relationships between the sampled individuals, their sex, and their affected/unaffected status.
-
-
-### Sample id format
-
-The sequencing reads for the samples delivered from EdGE are identified by folder name and as the 8th column in the tab-delimited text file file_list.tsv inside the dated batch folder. The identifiers are in the format:
-
-```
-<pcr_plate_id>_<indiv_id>_<family_id><suffix>
-```
-
-The suffix identifies the exome kit, e.g. `_WESTwist_IDT-A`. These identifiers are referenced below in the output file structure.
-
-### Reads - Edinburgh Genomics
-
-A set of paired end FASTQ files (designated by R1 or R2 suffixes), possibly more than one pair per sample. Each sample's files are in its own folder. The input files will be in the folder `/home/u035/u035/shared/data` and follow the structure in *Figure 1*. Older deliveries contained the `<dated_batch>` folder within a `raw_data` folder.
-
-```
-<EdGE_project_id>/
-  +---<dated_batch>/
-  |   +---<sample_id>/
-  |   |   +---*.fastq.count
-  |   |   +---*.fastq.gz
-  |   +---file_list.tsv
-  |   +---md5sums.txt
-  +---<dated_batch>_tree.txt
-  +---Information.txt
-  +---md5_check.txt
-```
-*Figure 1.* File name and directory structure for a batch of sequencing from Edinburgh Genomics. The EdGE project id takes the format `XXXXX\_Lastname\_Firstname`, identifying the NHS staff member who submitted the samples for sequencing. The dated batch is in the format `yyyymmdd` – in general we expect there to be only one of these per EdGE project id. The FASTQ file names relate to the sequencing run information and do not contain any information about the sample itself.
-
-### Reads - Edinburgh Clinical Research Facility
-
-A set of paired end FASTQ files (designated by R1 or R2 suffixes), generally one pair per sample. The input files will be in the folder `/home/u035/u035/shared/data` and follow the structure in *Figure 2*.
-
-```
-<ECRF_project_id>/
-  +---<internal_id_-md5.txt
-  +---<pcr_plate_id>_<indiv_id>_<family_id><suffix>_S<i>_L001_R1_001.fastq.gz
-  +---<pcr_plate_id>_<indiv_id>_<family_id><suffix>_S<i>_L001_R2_001.fastq.gz
-  +...
-```
-
-*Figure 2.* File name and directory structure for a batch of sequencing from the ECRF.
-
-## Working directories
-
-The project working directories will be in the folder `/home/u035/u035/shared/analysis` and follow the structure in *Figure 3*.
-
-```
-    config – bcbio configuration files in YAML format
-    logs – PBS job submission log files
-    params – parameters for PBS job submission
-    reads – symlinks/merged versions of input FASTQ files
-    work – bcbio working folder
-```
-*Figure 3.* Project working directories.
-
-## Project configuration
-
-A configuration script sets environment variables common to scripts used in this SOP. This is stored at `/home/u035/u035/shared/scripts/trio_whole_exome_config.sh`.
-
-```
-#!/usr/bin/bash
-#
-# Basic configuration options for trio WES pipeline
-#
-
-BASE=/home/u035/u035/shared
-SCRIPTS=$BASE/scripts
-BCBIO_TEMPLATE=$SCRIPTS/trio_whole_exome_bcbio_template.yaml
-TARGET=$BASE/resources/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
-DOWNLOAD_DIR=$BASE/data
-REFERENCE_GENOME=$BASE/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
-
-PARAMS_DIR=$BASE/analysis/params
-READS_DIR=$BASE/analysis/reads
-CONFIG_DIR=$BASE/analysis/config
-WORK_DIR=$BASE/analysis/work
-OUTPUT_DIR=$BASE/analysis/results
-
-export PATH=$BASE/software/bcbio/tools/bin:$PATH
-````
-
-## Template for bcbio configuration
-
-Bcbio requires a template file in YAML format to define the procedures run in the pipeline. The template for this project is stored at `/home/u035/u035/shared/scripts/trio_whole_exome_bcbio_template.yaml`.
-
-```
-details:
-- algorithm:
-    platform: illumina
-    quality_format: standard
-    aligner: bwa
-    mark_duplicates: true
-    realign: false
-    recalibrate: true
-    effects: vep
-    effects_transcripts: all
-    variantcaller: gatk-haplotype
-    indelcaller: false
-    remove_lcr: true
-    tools_on:
-    - vep_splicesite_annotations
-  analysis: variant2
-  genome_build: hg38
-upload:
-  dir: /home/u035/u035/shared/results
-```
-
-## Output
-
-Per sample: BAM file of aligned reads against the hg38 genome assembly
-Per family: Annotated VCF file and QC report
-
-Output will be in the folder `/home/u035/u035/shared/results/<version>_<project_id>` and follow the structure in *Figure 4* (with multiple instances of the indiv_id sub directories, one per sequenced family member.). The qc sub-directories are not enumerated, and automatically generated index files are not listed for brevity. An additional directory at the root of the output folder called “qc” will contain the MultiQC reports generated for an entire batch.
-
-```
-<analysis_date>_<project_id>_<pcr_plate_id>_<family_id>/
-  +---<indiv_id>_<family_id>/
-  |   +---<indiv_id>_<family_id>-callable.bed
-  |   +---<indiv_id>_<family_id>-ready.bam
-  |   +---qc/
-  +---<pcr_plate>_<family_id>-gatk-haplotype-annotated.vcf.gz
-  +---bcbio-nextgen-commands.log
-  +---bcbio-nextgen.log
-  +---data_versions.csv
-  +---metadata.csv
-  +---multiqc/
-  |   +---list_files_final.txt
-  |   +---multiqc_config.yaml
-  |   +---multiqc_data/
-  |   +---multiqc_report.html
-  |   +---report/
-  +---programs.txt
-  +---project-summary.yaml
-```
-*Figure 4.* File name and output directory structure for each family in a batch of sequencing.
-
-## Procedure
-
-1. Set environment variable project_id and general configuration variables.
-
-```
-project_id=<project_id>
-source /home/u035/u035/shared/scripts/trio_whole_exome_config.sh
-```
-
-2. Copy the PED file for the batch to the params folder in the working area. It should be named <project_id>.ped, relating it to the input directory for the FASTQ files. If the PED file given was not named in this way, don’t rename it, create a symlink with the correct name.
-
-```
-cd $PARAMS_DIR
-ped_file=<input_ped_file>
-ln -s $ped_file $project_id.ped
-```
-
-3. In the params folder, create the symlinks to the reads and the bcbio configuration files. If specifying a common sample suffix, ensure it includes any joining characters, e.g. “-“ or “_”, so that the family identifier can be cleanly separated from the suffix. Get the number of families from the batch. Version should be "v1" by default for the first analysis run of a batch, "v2" etc for subsequent runs.
-
-
-### Edinburgh Genomics data
-
-```
-cd $PARAMS_DIR
-version=<version>
-sample_suffix=<sample_suffix>
-/home/u035/u035/shared/scripts/prepare_bcbio_config.sh \
-  /home/u035/u035/shared/scripts/trio_whole_exome_config.sh \
-  $project_id $version $sample_suffix &> ${version}_${project_id}.log
-X=`wc -l $PARAMS_DIR/$project_id.family_ids.txt | awk '{print $1}'`
-```
-
-### Edinburgh Clinical Research Facility data
-
-```
-cd $PARAMS_DIR
-version=<version>
-sample_suffix=<sample_suffix>
-/home/u035/u035/shared/scripts/prepare_bcbio_config_crf.sh \
-  /home/u035/u035/shared/scripts/trio_whole_exome_crf_config.sh \
-  $project_id $version $sample_suffix &> ${version}_${project_id}.log
-X=`wc -l $PARAMS_DIR/$project_id.family_ids.txt | awk '{print $1}'`
-```
-
-4. Submit the bcbio jobs from the logs folder. See above for version.
-
-```
-cd /home/u035/u035/shared/trio_whole_exome/analysis/logs
-qsub -v PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=/home/u035/u035/shared/scripts/trio_whole_exome_config.sh \
-  -J 1-$X -N trio_whole_exome_bcbio.$project_id \
-  /home/u035/u035/shared/scripts/submit_bcbio_trio_wes.sh
-```
-
-If all log files end in ‘Finished’ or ‘Storing in local filesystem’ for a metadata file (occasionally the job completes without quite outputting all of the ‘Storing’ messages), the batch is complete. If this is not the case, resubmit the incomplete jobs – they will resume where they left off.
-
-5. Clean up the output directory.
-
-```
-cd /home/u035/u035/shared/results
-short_project_id=`echo $project_id | cut -f 1 -d '_'`
-mkdir ${version}_${short_project_id}
-mv *${version}_${project_id}* ${version}_${short_project_id}/
-```
-
-6. Generate a MultiQC report for all files in the batch.
-
-```
-source /home/u035/u035/shared/scripts/trio_whole_exome_config.sh
-short_project_id=`echo $project_id | cut -f 1 -d '_'`
-
-cd /home/u035/u035/shared/results
-/home/u035/u035/shared/software/bcbio/anaconda/bin/multiqc --title "Trio whole exome QC report: $project_id" \
-  --outdir ${short_version}_${project_id}/qc \
-  --filename ${version}_${project_id}_qc_report.html \
-  ${version}_${short_project_id}
-```
-
-7. Check the parent-child relationships predicted by peddy match the pedigree information. There should be no entries in the <EdGE_project_id>.ped_check.txt file that do not end in ‘True’. If there are, report these back to the NHS Clinical Scientist who generated the PED file for this batch. The batch id is the 5 digit number that prefixes all the family ids in the output.
-
-```
-cd /home/u035/u035/shared/results
-short_project_id=`echo $project_id | cut -f 1 -d '_'`
-
-perl /home/u035/u035/shared/scripts/trio_whole_exome_parse_peddy_ped_csv.pl \
-  --output /home/u035/u035/shared/results/${version}_${short_project_id}/qc \
-  --project $project_id \
-  --batch $batch_id \
-  --version $version \
-  --ped /home/u035/u035/shared/analysis/params/$project_id.ped
-grep -v False$ ${version}_${short_project_id}/qc/${version}_${project_id}.ped_check.txt
-```
-
-8. Clear the work directory and move the log files to the complete sub-directory.
-
-```
-cd /home/u035/u035/shared/analysis/work
-rm -r *
-cd /home/u035/u035/shared/analysis/logs
-mv trio_whole_exome_bcbio.$project_id* complete/
-```
-
-9. Clean up the reads directory. Retain reads for samples in families where one sample has failed QC, using a list `retain\_for\_rerun.txt`. These will likely be required for later runs, and it is simpler to regenerate config YAML files if it is not necessary to re-do symlinks/read merging.
-
-```
-cd /home/u035/u035/shared/analysis/reads/${project_id}
-rm `ls | grep -v -f retain_for_rerun.txt`
-```
-
-10. Copy the MultiQC report to the IGMM-VariantAnalysis area on the IGMM datastore.
-
-```
-ssh eddie3.ecdf.ed.ac.uk
-qlogin -q staging
-cd /exports/igmm/datastore/IGMM-VariantAnalysis/documentation/trio_whole_exome/qc
-
-user=<ultra_user_id>
-project_id=<EdGE_project_id>
-
-scp $user@sdf-cs1.epcc.ed.ac.uk:/home/u035/u035/shared/results/${version}_${project_id}/qc/${version}_${project_id}_qc_report.html ./
-```
diff --git a/docs/SOP_archiving.md b/docs/SOP_archiving.md
deleted file mode 100644
index 8ed7673731ebcc7e310454e13b6a150fc0b38fee..0000000000000000000000000000000000000000
--- a/docs/SOP_archiving.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# Standard operating procedure - Archiving of trio whole exome samples at Edinburgh Parallel Computing Centre
-
-This SOP applies to the archiving of files generated by the alignment, variant calling, and variant prioritization analysis pipeline for trio whole exome samples at the Edinburgh Parallel Computing Centre (EPCC). Scripts are version controlled on the University of Edinburgh gitlab server gitlab.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome. Request access by e-mail: alison.meynert@igmm.ed.ac.uk.
-
-## User requirements
-
-The user will need an account on the EPCC Ultra system. Contact Donald Scobbie (d.scobbie@eppc.ed.ac.uk) for any issues.
-
-## Software requirements
-
-Htslib 1.10.2 and Samtools 1.10 are installed at /home/u035/project/software/bcbio/tools/bin.
-
-## Data requirements
-
-A copy of the human reference genome hg38 is at /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa.
-
-## Definitions
-
-In this document, N is the total number of samples in the batch, and X is the number of families.
-
-Text in angle brackets, e.g. <batch> indicates variable parameters. A variable parameter such as <family1-X> indicates that there are X instances of the parameter, each with their own unique value.
-
-## Notes
-
-1. In all steps below, when log files are completed and no longer used, move them into the sub-directory ‘logs/complete’. This “clean desk” policy makes it easier to see what current jobs are running and which log files require examination.
-
-2. For all jobs with an array option, if there is only one task to be run, omit the ‘-J’ parameter.
-
-3. The tracking file is maintained on the IGMM datastore at /exports/igmm/datastore/IGMM-VariantAnalysis/trio_whole_exome/Batch_status.xlsx.
-
-## Procedure
-
-1. Log in to the EPCC Ultra system. Set the project id environment variable and other general configuration environment variables, and calculate the number of families in the batch. Change to the logs directory – all jobs are submitted from here.
-
-```
-ssh user@ultra.epcc.ed.ac.uk
-project_id=<EdGE_project_id>
-source /home/u035/project/scripts/trio_whole_exome_config.sh
-X=`wc -l $PARAMS_DIR/$project_id.family_ids.txt | awk '{print $1}'`
-cd /home/u035/project/trio_whole_exome/analysis/logs
-```
-
-2. Compress BAM files to CRAM and compare the two files. The output log files should be empty and the files <sample>.cram, <sample>.cram.crai, and <sample>.cram.flagstat.txt should be present for each sample.
-
-```
-qsub -v \
-  PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=/home/u035/project/scripts/trio_whole_exome_config.sh \
-  -N trio_whole_exome_cram_compression.$project_id -J 1-${X} \
-  /home/u035/project/scripts/submit_trio_wes_cram_compression.sh
-```
-
-3. Calculate md5 checksums on the project files, excluding the BAM files. Creates the file md5sum.txt at the root of each family’s output directory. Check the files with the calculated md5sums. They should total 30 lines per sample plus 26 lines per family. The log files should be empty.
-
-```
-qsub -v \
-  PROJECT_ID=$project_id,VERSION=$version,CONFIG_SH=/home/u035/project/scripts/trio_whole_exome_config.sh \
-  -N trio_whole_exome_checksums.$project_id -J 1-${X} \
-  /home/u035/project/scripts/submit_trio_wes_checksums.sh
-cd ../output
-wc -l *$project_id*/md5sum.txt
-```
-
-4. Calculate md5 checksums on the prioritization sub-directory(ies) for this project and on the qc files for this project. If there is more than one prioritization sub-directory, use a colon-delimited list.
-
-```
-priority_dirs=ddmmyyyy #:ddmmyyyy
-qsub -v \
-  PROJECT_ID=$project_id,VERSION=$version,PRIORITY_DIRS=$priority_dirs,CONFIG_SH=/home/u035/project/scripts/trio_whole_exome_config.sh \
-  -N trio_whole_exome_priority_and_qc_checksums.$project_id \
-  /home/u035/project/scripts/submit_trio_wes_priority_and_qc_checksums.sh
-```
-
-5. Archive the output to /archive/u035/trio_whole_exome, excluding the BAM files. Confirm md5 checksums on the archived files. There should be no lines ending in ‘FAIL’. If there are any, investigate which file(s) and manually copy these, then re-check the md5 of the archived copy.
-
-```
-qsub -v \
-  PROJECT_ID=$project_id,VERSION=$version,PRIORITY_DIRS=$priority_dirs,CONFIG_SH=/home/u035/project/scripts/trio_whole_exome_config.sh \
-  -N trio_whole_exome_archive_project.$project_id \
-  /home/u035/project/scripts/submit_trio_wes_archive_project.sh
-grep -c FAIL trio_whole_exome_archive_project.$project_id*
-```
-
-6. Clean up the logs directory.
-
-```
-cd /home/u035/project/trio_whole_exome/logs 
-mv *.project_id.* complete/
-```
-
-7. Confirmation of copy to tape. EPCC runs a cron job on Monday mornings at 8am to check the DMF file state for all files in /archive/u035/trio_whole_exome. This produces a text report at /archive/u035/confirmation using the format yyyy-mm-dd. 
-
-Check for ‘DUL’ entries for files in the confirmation report. The grep command below excludes directories, which are generally marked as ‘REG’, blank lines, and total file size lines. If not all entries are DUL, check the non-DUL ones for error states as below. The expectation is that some files may be in the process of migration; in which case, check again in the following week’s report.
-
-```
-cd /archive/u035/confirmation
-grep -v ^/archive <yyyy-mm-dd> \
-  | grep -v total | grep -v ^d | sed '/^$/d' | grep -v DUL
-```
-
-In the tracking file, enter the confirmation report date in the ‘Archived’ field for the project.
-
-DMF has several possible states for files. The first three shown below are the most likely to appear:
-
-* REG Regular. The file exists only online, on active disk
-* OFL Offline. The file's directory entry remains on disk, but its data blocks are located offline only (on tape).
-* DUL Dual-state. Identical copies of the file exist online (on disk) and offline (on tape). The online copy will persist if there is no demand for free space in its filesystem. When free space is needed, the online copy of the file is removed, leaving just the offline copy; the file becomes "offline." If you make any change to a dual-state file, the offline copy becomes out of date and invalid, and the file is once again a "regular" file. 
-* MIG Migrating. The file is in process of migrating from disk to tape.
-* UNM Unmigrating. The file has been recalled and is in process of moving back from tape to disk.
-* NMG Nonmigratable. The file cannot be migrated.
-* INV Invalid. DMF cannot determine the file's state. The most likely reason is that it is in a filesystem that does not use DMF
\ No newline at end of file
diff --git a/docs/Software_installation.md b/docs/Software_installation.md
index 502a63f3bc73f478b87e2e2418e2b7338b9df2c5..1221db819a07d4be137d795db424cae1c9e2d08d 100644
--- a/docs/Software_installation.md
+++ b/docs/Software_installation.md
@@ -1,71 +1,62 @@
-# Installation of software for trio whole exome project
+# Ultra2 - Installation of software for trio whole exome project
 
 ## Aspera
 
-Downloaded Aspera Connect version 3.7.4.147727 from https://downloads.asperasoft.com and extracted to /home/u035/project/software.
+Downloaded the IBM Aspera CLI version 3.9.6.1467 installer script from https://downloads.asperasoft.com to /home/u035/project/software/install and ran it. The script installs the software in ~/.aspera, so it needs to be moved to the shared folder.
+
+```
+bash ibm-aspera-cli-3.9.6.1467.159c5b1-linux-64-release.sh
+mv ~/.aspera ../aspera
+```
 
 ## bcbio
 
-Version 1.2.3 with some bugfixes from the dev branch as of 26 August 2020.
+Version 1.2.8 (14 April 2021).
 
 Start with installing the base software, and add datatargets.
 
-This will take a long time, and may require multiple runs if it fails on a step. It will resume if needed. Run on a screen session and log each attempt. It's important to set the limit on the number of concurrently open files to as high as possible (4096 on ultra).
+This will take a long time, and may require multiple runs if it fails on a step. It will resume if needed. Run it in a screen session and log each attempt. It's important to set the limit on the number of concurrently open files as high as possible (4096).
 
 ```
-cd /home/u035/project/software/install
+cd /home/u035/u035/shared/software/install
+mkdir bcbio_install_logs
+
 wget https://raw.github.com/bcbio/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py
 
 ulimit -n 4096
 
 DATE=`date +%Y%m%d%H%M`
-python bcbio_nextgen_install.py /home/u035/project/software/bcbio \
-  --tooldir /home/u035/project/software/bcbio/tools \
+python3 bcbio_nextgen_install.py /home/u035/u035/shared/software/bcbio \
+  --tooldir /home/u035/u035/shared/software/bcbio/tools \
   --genomes hg38 --aligners bwa \
-  --cores 32 &> bcbio_install_base_${DATE}.log
-```
-
-Fix an issue with bcbio & vt/samtools/htslib. See https://github.com/bcbio/bcbio-nextgen/issues/3327 and https://github.com/bcbio/bcbio-nextgen/issues/3328.
-
-```
-DATE=`date +%Y%m%d%H%M`
-/home/u035/project/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u development --tools &> bcbio_install_upgrade_tools_${DATE}.log
+  --cores 128 &> bcbio_install_logs/bcbio_install_base_${DATE}.log
 ```
 
 Install datatarget vep
 
 ```
 DATE=`date +%Y%m%d%H%M`
-/home/u035/project/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u skip --datatarget vep &> bcbio_install_datatarget_vep_${DATE}.log
-```
-
-We already had gnomAD 3.0 compiled and downloaded from another bcbio installation, so this gets copied to /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/variation. However, if needed, re-generate it like this. It will take about 6 days.
-
-```
-DATE=`date +%Y%m%d%H%M`
-/home/u035/project/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u skip --datatarget gnomad &> bcbio_install_datatarget_gnomad_${DATE}.log
+/home/u035/u035/shared/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u skip --datatarget vep &> bcbio_install_logs/bcbio_install_datatarget_vep_${DATE}.log
 ```
 
 Increase JVM memory for GATK in galaxy/bcbio_system.yaml
 
 ```
-  gatk:
-    jvm_opts: ["-Xms500m", "-Xmx5g"]
+  gatk:
+    jvm_opts: ["-Xms500m", "-Xmx5g"]
 ```
 
 ### Patch Ensembl VEP 100.4
 
 See https://github.com/Ensembl/ensembl-variation/pull/621/files
 
-Edit /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0/Bio/EnsEMBL/Variation/BaseAnnotation.pm accordingly.
+Edit /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0/Bio/EnsEMBL/Variation/BaseAnnotation.pm accordingly.
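+
+Rather than editing by hand, one way to apply the fix is to download the PR diff and patch the installed module tree. This is a sketch, not the recorded procedure - it assumes the PR diff still applies cleanly to the 100.4 release and only needs the a/modules/ path prefix stripped:
+
+```
+cd /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0
+wget https://github.com/Ensembl/ensembl-variation/pull/621.diff
+# -p2 strips the leading a/modules/ so the paths resolve relative to Bio/
+patch -p2 --dry-run < 621.diff && patch -p2 < 621.diff
+```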
 
 ### Verifybamid custom panel for exomes
 
 ```
-source /home/u035/project/scripts/trio_whole_exome_config.sh
-
-mkdir /home/u035/project/software/install/1000G_phase3_hg38
-cd /home/u035/project/software/install/1000G_phase3_hg38
+mkdir /home/u035/u035/shared/software/install/1000G_phase3_hg38
+cd /home/u035/u035/shared/software/install/1000G_phase3_hg38
 
 # download the 1000 Genomes autosomes + X site VCFs
 for ((i = 1; i <= 22; i = i + 1))
@@ -75,83 +66,90 @@ do
 done
 wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
 wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
-cd ..
 
 # create bare to prefixed chromosome map
 for ((i = 1; i <= 22; i = i + 1))
 do
   echo $i "chr"$i >> chr_prefix_map.txt
 done
-echo chrX >> chr_prefix_map.txt
+echo X chrX >> chr_prefix_map.txt
+
+# add bcbio tools to path
+PATH=/home/u035/u035/shared/software/bcbio/tools/bin:/home/u035/u035/shared/software/bcbio/anaconda/share/verifybamid2-1.0.6-0:$PATH
 
 # use the TWIST kit to subset the variants and add the chr prefix at the same time
-for file in 1000G_phase3_hg38/*vcf.gz
+sed -e 's/chr//' ../../../resources/Twist_Exome_Target_hg38.bed > targets.bed
+for file in *phased.vcf.gz
 do
   bname=`basename $file`
-  bcftools view -R /home/u035/project/resources/Twist_Exome_Target_hg38.bed -m2 -M2 -v snps -i 'AF >= 0.01' $file | bcftools annotate --rename-chrs chr_prefix_map.txt | bgzip -c > ${bname%.vcf.gz}.biallelic.snps.m\
-inAF0.01.vcf.gz
+  bcftools view -R targets.bed -m2 -M2 -v snps -i 'AF >= 0.01' $file | bcftools annotate --rename-chrs chr_prefix_map.txt | bgzip -c > ${bname%.vcf.gz}.biallelic.snps.minAF0.01.vcf.gz
   tabix ${bname%.vcf.gz}.biallelic.snps.minAF0.01.vcf.gz
 done
 
 # concatenate all the files in the correct order
-bcftools concat -o ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz -O z \
-  ALL.chr[1-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
-  ALL.chr[12][0-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
-  ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
-tabix ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
+bcftools concat -o ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz -O z \
+  ALL.chr[1-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz \
+  ALL.chr[12][0-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz \
+  ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz
+tabix ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz
 
 # use VerifyBamID to create the new panel
-/home/u035/project/software/bcbio/anaconda/share/verifybamid2-1.0.6-0/VerifyBamID \
-  --RefVCF ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
-  --Reference bcbio-1.1.5/genomes/Hsapiens/hg38/seq/hg38.fa
+VerifyBamID \
+  --RefVCF ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz \
+  --Reference ../../bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
 
 # rename the files to the correct format
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.bed 1000g.phase3.100k.b38.vcf.gz.dat.bed
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.mu 1000g.phase3.100k.b38.vcf.gz.dat.mu
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.PC 1000g.phase3.100k.b38.vcf.gz.dat.V
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.UD 1000g.phase3.100k.b38.vcf.gz.dat.UD
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.bed 1000g.phase3.100k.b38.vcf.gz.dat.bed
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.mu 1000g.phase3.100k.b38.vcf.gz.dat.mu
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.PC 1000g.phase3.100k.b38.vcf.gz.dat.V
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.UD 1000g.phase3.100k.b38.vcf.gz.dat.UD
 
 # move them into the correct location, backing up the original resource folder
-cd /home/u035/project/software/bcbio/anaconda/share/verifybamid2-1.0.6-0
+cd /home/u035/u035/shared/software/bcbio/anaconda/share/verifybamid2-1.0.6-0
 mv resource resource.bak
 mkdir resource
-mv /home/u035/project/software/install/1000G_phase3_hg38/1000g.phase3.100k.b38* resource/
+mv /home/u035/u035/shared/software/install/1000G_phase3_hg38/1000g.phase3.100k.b38* resource/
+
+# clean up intermediate files
+cd /home/u035/u035/shared/software/install
+rm -r 1000G_phase3_hg38
 ```
 
 ## Python modules
 
 ### VASE
 
-VASE v0.4 was installed 28 August 2020.
+VASE v0.4.2 was installed 18 August 2021.
 
 ```
-cd /home/u035/project/software
-./bcbio/anaconda/bin/pip3 install git+git://github.com/david-a-parry/vase.git#egg=project[BGZIP,REPORTER,MYGENE]
+cd /home/u035/u035/shared/software
+./bcbio/anaconda/bin/pip3 install git+git://github.com/david-a-parry/vase.git#egg=vase[BGZIP,REPORTER,MYGENE]
 ```
 
 ### XlsxWriter
 
-XlsxWriter 1.3.3 was installed 28 August 2020.
+XlsxWriter 3.0.1 was installed 18 August 2021.
 
 ```
-cd /home/u035/project/software
+cd /home/u035/u035/shared/software
 ./bcbio/anaconda/bin/pip3 install XlsxWriter
 ```
 
 ## GATK 3.8
 
 ```
-cd /home/u035/project/software/install
+cd /home/u035/u035/shared/software/install
 wget https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2
 bzip2 -d GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 
 tar -xf GenomeAnalysisTK-3.8-0-ge9d806836.tar
 mv GenomeAnalysisTK-3.8-0-ge9d806836 ../GenomeAnalysisTK-3.8
+rm GenomeAnalysisTK-3.8-0-ge9d806836.tar
 ```
 
 ## RTG tools
 
 ```
-cd /home/u035/project/software
+cd /home/u035/u035/shared/software
 wget https://github.com/RealTimeGenomics/rtg-tools/releases/download/3.11/rtg-tools-3.11-linux-x64.zip
 unzip rtg-tools-3.11-linux-x64.zip
 rm rtg-tools-3.11-linux-x64.zip
@@ -160,9 +158,16 @@ rm rtg-tools-3.11-linux-x64.zip
 ## IGV
 
 ```
-cd /home/u035/project/software
+cd /home/u035/u035/shared/software
 wget https://data.broadinstitute.org/igv/projects/downloads/2.8/IGV_Linux_2.8.9.zip
 unzip IGV_Linux_2.8.9.zip
 rm IGV_Linux_2.8.9.zip
 ```
 
+## Emacs
+
+```
+cd /home/u035/u035/shared/software
+./bcbio/anaconda/bin/conda install emacs
+```
+
diff --git a/docs/Software_installation_ultra2.md b/docs/Software_installation_ultra2.md
deleted file mode 100644
index 1221db819a07d4be137d795db424cae1c9e2d08d..0000000000000000000000000000000000000000
--- a/docs/Software_installation_ultra2.md
+++ /dev/null
@@ -1,173 +0,0 @@
-# Ultra2 - Installation of software for trio whole exome project
-
-## Aspera
-
-Downloaded Aspera Connect version 3.9.6.1467 installer script from https://downloads.asperasoft.com to /home/u035/project/software/install and run it. This installs the software in ~/.aspera, so it needs to be moved to the shared folder.
-
-```
-bash ibm-aspera-cli-3.9.6.1467.159c5b1-linux-64-release.sh
-mv ~/.aspera ../aspera
-```
-
-## bcbio
-
-Version 1.2.8 (14 April 2021).
-
-Start with installing the base software, and add datatargets.
-
-This will take a long time, and may require multiple runs if it fails on a step. It will resume if needed. Run on a screen session and log each attempt. It's important to set the limit on the number of concurrently open files to as high as possible (4096).
-
-```
-cd /home/u035/u035/shared/software/install
-mkdir bcbio_install_logs
-
-wget https://raw.github.com/bcbio/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py
-
-ulimit -n 4096
-
-DATE=`date +%Y%m%d%H%M`
-python3 bcbio_nextgen_install.py /home/u035/u035/shared/software/bcbio \
-  --tooldir /home/u035/u035/shared/software/bcbio/tools \
-  --genomes hg38 --aligners bwa \
-  --cores 128 &> bcbio_install_logs/bcbio_install_base_${DATE}.log
-```
-
-Install datatarget vep
-
-```
-DATE=`date +%Y%m%d%H%M`
-/home/u035/u035/shared/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u skip --datatarget vep &> bcbio_install_logs/bcbio_install_datatarget_vep_${DATE}.log
-```
-
-Increase JVM memory for GATK in galaxy/bcbio_system.yaml
-
-```
-  gatk:
-    jvm_opts: ["-Xms500m", "-Xmx5g"]
-```
-
-### Patch Ensembl VEP 100.4
-
-See https://github.com/Ensembl/ensembl-variation/pull/621/files
-
-Edit /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0/Bio/EnsEMBL/Variation/BaseAnnotation.pm accordingly.
-
-### Verifybamid custom panel for exomes
-
-```
-mkdir /home/u035/u035/shared/software/install/1000G_phase3_hg38
-cd /home/u035/u035/shared/software/install/1000G_phase3_hg38
-
-# download the 1000 Genomes autosomes + X site VCFs
-for ((i = 1; i <= 22; i = i + 1))
-do
-  wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz;
-  wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
-done
-wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
-wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
-
-# create bare to prefixed chromosome map
-for ((i = 1; i <= 22; i = i + 1))
-do
-  echo $i "chr"$i >> chr_prefix_map.txt
-done
-echo X chrX >> chr_prefix_map.txt
-
-# add bcbio tools to path
-PATH=/home/u035/u035/shared/software/bcbio/tools/bin:/home/u035/u035/shared/software/bcbio/anaconda/share/verifybamid2-1.0.6-0:$PATH
-
-# use the TWIST kit to subset the variants and add the chr prefix at the same time
-sed -e 's/chr//' ../../../resources/Twist_Exome_Target_hg38.bed > targets.bed
-for file in *phased.vcf.gz
-do
-  bname=`basename $file`
-  bcftools view -R targets.bed -m2 -M2 -v snps -i 'AF >= 0.01' $file | bcftools annotate --rename-chrs chr_prefix_map.txt | bgzip -c > ${bname%.vcf.gz}.biallelic.snps.minAF0.01.vcf.gz
-  tabix ${bname%.vcf.gz}.biallelic.snps.minAF0.01.vcf.gz
-done
-
-# concatenate all the files in the correct order
-bcftools concat -o ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz -O z \
-  ALL.chr[1-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz \
-  ALL.chr[12][0-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz \
-  ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz
-tabix ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz
-
-# use VerifyBamID to create the new panel
-VerifyBamID \
-  --RefVCF ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz \
-  --Reference ../../bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
-
-# rename the files to the correct format
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.bed 1000g.phase3.100k.b38.vcf.gz.dat.bed
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.mu 1000g.phase3.100k.b38.vcf.gz.dat.mu
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.PC 1000g.phase3.100k.b38.vcf.gz.dat.V
-mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.biallelic.snps.minAF0.01.vcf.gz.UD 1000g.phase3.100k.b38.vcf.gz.dat.UD
-
-# move them into the correct location, backing up the original resource folder
-cd /home/u035/u035/shared/software/bcbio/anaconda/share/verifybamid2-1.0.6-0
-mv resource resource.bak
-mkdir resource
-mv /home/u035/u035/shared/software/install/1000G_phase3_hg38/1000g.phase3.100k.b38* resource/
-
-# clean up intermediate files
-cd /home/u035/u035/shared/software/install
-rm -r 1000G_phase3_hg38
-```
-
-## Python modules
-
-### VASE
-
-VASE v0.4.2 was installed 18 August 2021.
-
-```
-cd /home/u035/u035/shared/software
-./bcbio/anaconda/bin/pip3 install git+git://github.com/david-a-parry/vase.git#egg=vase[BGZIP,REPORTER,MYGENE]
-```
-
-### XlsxWriter
-
-XlsxWriter 3.0.1 was installed 18 August 2021.
-
-```
-cd /home/u035/u035/shared/software
-./bcbio/anaconda/bin/pip3 install XlsxWriter
-```
-
-## GATK 3.8
-
-```
-cd /home/u035/u035/shared/software/install
-wget https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2
-bzip2 -d GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 
-tar -xf GenomeAnalysisTK-3.8-0-ge9d806836.tar
-mv GenomeAnalysisTK-3.8-0-ge9d806836 ../GenomeAnalysisTK-3.8
-rm GenomeAnalysisTK-3.8-0-ge9d806836.tar
-```
-
-## RTG tools
-
-```
-cd /home/u035/u035/shared/software
-wget https://github.com/RealTimeGenomics/rtg-tools/releases/download/3.11/rtg-tools-3.11-linux-x64.zip
-unzip rtg-tools-3.11-linux-x64.zip
-rm rtg-tools-3.11-linux-x64.zip
-```
-
-## IGV
-
-```
-cd /home/u035/u035/shared/software
-wget https://data.broadinstitute.org/igv/projects/downloads/2.8/IGV_Linux_2.8.9.zip
-unzip IGV_Linux_2.8.9.zip
-rm IGV_Linux_2.8.9.zip
-```
-
-## Emacs
-
-```
-cd /home/u035/u035/shared/software
-./bcbio/anaconda/bin/conda install emacs
-```
-
diff --git a/docs/run-notes/19258_run_notes.md b/docs/run-notes/19258_run_notes.md
index b54413b5460832b5be0e035e193f086d9a0e7cf9..fc4814e976a7e318d247e3835d6bd86a19608e37 100644
--- a/docs/run-notes/19258_run_notes.md
+++ b/docs/run-notes/19258_run_notes.md
@@ -19,3 +19,7 @@ Duos/singleton – for Congenica only
 * Contamination check ok
 * Sample 129421 was not delivered - don't analyze family 436427, parents QC'd ok
 * Sequencing QC check ok
+
+# Additional run with remove_lcr: false
+
+Get an idea of how many additional candidate variants are returned if LCR filtering is turned off.
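+
+For reference, a minimal sketch of where this knob sits in a bcbio sample YAML (all other keys omitted; the standard runs evidently enable it, since this run turns it off):
+
+```
+details:
+  - algorithm:
+      remove_lcr: false
+```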
diff --git a/docs/run-notes/19502_run_notes.md b/docs/run-notes/19502_run_notes.md
new file mode 100644
index 0000000000000000000000000000000000000000..79d4d655085b533132f0f7afd7bdfbada37c09c6
--- /dev/null
+++ b/docs/run-notes/19502_run_notes.md
@@ -0,0 +1,7 @@
+* Families 451528, 452965 and 453088 are all straightforward trios.
+* Family 423278 is the complicated one – we previously sequenced samples 123747 (proband), 123745 (dad), 123746 (mum) on run 18422. We have now received a similarly affected sibling, sample 131699, which is going on this run. Would it be possible to perform a shared analysis between 123747 and 131699 please? As I’m writing this, I realise that run 18422 was performed in December 2020, so I’m not sure how easy it’s going to be to access this data?
+* Family 438810 is a duo for Congenica only (we just put this on to fill the run).
+
+Initial run - 3 trios + duo (for QC only)
+
+Quad retrieved from archive and run.
diff --git a/docs/run-notes/19573_run_notes.md b/docs/run-notes/19573_run_notes.md
new file mode 100644
index 0000000000000000000000000000000000000000..7ad40f3765db5dcc509c8af30547f3e80c14e9e2
--- /dev/null
+++ b/docs/run-notes/19573_run_notes.md
@@ -0,0 +1,19 @@
+# 19573 EdGe
+
+## Issues
+
+* 447226 - sample 131873 (father) is very low depth (6X); as a result it comes up as non-paternity for 114891, but this can be ignored
+* 453955 - sample 132058 (proband) comes up as incorrect sex due to het ratio; annotated as female in the PED file, with no evidence of Y coverage - will run as female.
+
+## Notes
+
+* no contamination issues
+* low depth but still usable samples
+  * 438938 - sample 131858 (proband) 33X
+  * 433285 - sample 132221 (proband) 36X
+* XYY observable in X/Y coverage
+  * 452542 - sample 131311 (proband) 
+  * 438235 - sample 131376 (father)
+* Possible amplification of a region on chr4 - increased coverage observed in QC
+  * 438371 samples 131201 (proband), 131202 (mother)
+  * 454213 samples 132222 (proband), 132223 (father)
diff --git a/docs/run-notes/plots/runs_by_sequencer_fisher_strand_test_histogram.png b/docs/run-notes/plots/runs_by_sequencer_fisher_strand_test_histogram.png
new file mode 100644
index 0000000000000000000000000000000000000000..bec4f6e6cb3cf4f4b4a97a497039202abc01d255
Binary files /dev/null and b/docs/run-notes/plots/runs_by_sequencer_fisher_strand_test_histogram.png differ
diff --git a/docs/run-notes/plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png b/docs/run-notes/plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f7339f7a861a8c57829cffd26444827db3ebd49
Binary files /dev/null and b/docs/run-notes/plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png differ
diff --git a/docs/run-notes/plots/runs_by_sequencer_strand_odds_ratio_histogram.png b/docs/run-notes/plots/runs_by_sequencer_strand_odds_ratio_histogram.png
new file mode 100644
index 0000000000000000000000000000000000000000..0328a9d00f8eb6468796c267405902f7b47f87da
Binary files /dev/null and b/docs/run-notes/plots/runs_by_sequencer_strand_odds_ratio_histogram.png differ
diff --git a/docs/run-notes/strand_bias.md b/docs/run-notes/strand_bias.md
new file mode 100644
index 0000000000000000000000000000000000000000..1450860e960383475e39e43d3dde0772bc578b1e
--- /dev/null
+++ b/docs/run-notes/strand_bias.md
@@ -0,0 +1,126 @@
+# Strand bias metrics in samples sequenced to 2021-11-01
+
+## Parsing out data
+
+Working in `/home/u035/u035/ameynert`.
+
+`extract_strand_bias_metrics.sh`
+
+```
+#!/usr/bin/bash
+
+FOLDER=$1
+OUTPUT=$2
+
+cd $FOLDER/DECIPHER
+NAME=`basename $FOLDER`
+
+grep -v 'Internal' *.csv | cut -f 1-3 -d ',' | sed -e 's/_DEC_FLT.csv:/,/' | cut -f 1,3,4 -d ',' > $OUTPUT/$NAME.sites.csv
+cd $OUTPUT
+
+cut -f 1 -d ',' $NAME.sites.csv | sort -u > $NAME.indivs.txt
+
+cd $FOLDER/VCF
+
+for indiv in `cat $OUTPUT/$NAME.indivs.txt`
+do
+    if [ ! -e *.ready.$indiv.vcf.gz.tbi ]
+    then
+	tabix *.ready.$indiv.vcf.gz
+    fi
+done
+
+count=`wc -l $OUTPUT/$NAME.sites.csv | awk '{ print $1 }'`
+
+for ((i = 1; i <= $count; i = i + 1))
+do
+    indiv=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 1`
+    chr=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 2`
+    pos=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 3`
+
+    res=`bcftools view *.ready.$indiv.vcf.gz chr$chr:$pos | bcftools query -f "%CHROM,%POS,%REF,%ALT,%INFO/FS,%INFO/SOR\n"`
+    echo $indiv,$res >> $OUTPUT/$NAME.strandbias.csv
+done
+```
+
+```
+ls /home/u035/u035/shared/results/*/prioritization/* | grep ':' | sed -e 's/\://' > folders.txt
+for folder in `cat folders.txt`
+do
+  ./extract_strand_bias_metrics.sh $folder /home/u035/u035/ameynert
+done
+
+cat *.strandbias.csv | sort -u | awk '{ print $1 }' > strandbias.csv
+```
+
+Clean up `strandbias.csv` manually with emacs.
+
+```
+cut -f 1 -d ',' strandbias.csv | sort -u > indivs.txt
+
+cd ../shared/results
+cat 1*/params/*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/novaseq_families.txt
+cat 200*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/nextseq_families.txt
+cat 201[02]*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt 
+cat 21*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt 
+
+cd
+grep -f nextseq_families.txt strandbias.csv > strandbias_nextseq.csv
+grep -f novaseq_families.txt strandbias.csv > strandbias_novaseq.csv
+```
+
+## Plots 
+
+```
+library(lattice)
+
+x = read.table("strandbias_nextseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",")
+x$sequencer = "NextSeq 2000"
+
+y = read.table("strandbias_novaseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",")
+y$sequencer = "NovaSeq 6000"
+
+x = rbind(x, y)
+x$sequencer = as.factor(x$sequencer)
+
+png("plots/runs_by_sequencer_strand_odds_ratio_histogram.png", width=600, height=800)
+histogram(~log10(sor) | sequencer, x, breaks=seq(-2.5,1.1,0.02), xlab="log10(Strand odds ratio)",  main="Trio/family whole exome candidate variants", layout=c(1,2))
+dev.off()
+
+png("plots/runs_by_sequencer_fisher_strand_test_histogram.png", width=600, height=800)
+histogram(~log10(fs)  | sequencer, x, breaks=seq(-1,3,0.02),     xlab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", layout=c(1,2))
+dev.off()
+
+my.key=list(points=list(pch=c(1,2), col=c("blue", "orange")), text=list(labels=levels(x$sequencer)))
+my.panel = function(x, y, ...) { panel.xyplot(x, y, ...); panel.abline(v=log10(3)); panel.abline(h = log10(60)) }
+
+png("plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png", width=600, height=600)
+xyplot(log10(fs) ~ log10(sor), x, groups=c(sequencer), xlab="log10(Strand odds ratio)", ylab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", key=my.key, pch=c(1,2), col=c("blue", "orange"), panel=my.panel)
+dev.off()
+
+y = subset(x, x$sor >= 3 | x$fs >= 60)
+length(x$sor)
+[1] 2313
+length(y$sor)
+[1] 888
+
+length(y$sor) / length(x$sor)
+[1] 0.383917
+
+x$count = 1
+aggregate(x$count, by = list(x$sequencer), sum)
+       Group.1    x
+1 NovaSeq 6000 1868
+2 NextSeq 2000  445
+
+aggregate(y$count, by = list(y$sequencer), sum)
+       Group.1   x
+1 NovaSeq 6000 524
+2 NextSeq 2000 364
+```
+
+![Strand odds ratio histogram by sequencer](plots/runs_by_sequencer_strand_odds_ratio_histogram.png)
+
+![Fisher strand test histogram by sequencer](plots/runs_by_sequencer_fisher_strand_test_histogram.png)
+
+![Strand odds ratio vs Fisher strand test by sequencer](plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png)
diff --git a/docs/tool_and_data_change_tracking.md b/docs/tool_and_data_change_tracking.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3503d4505e4311ef15428363799b72f01738dc2
--- /dev/null
+++ b/docs/tool_and_data_change_tracking.md
@@ -0,0 +1,44 @@
+Software | bcbio-1.1.5 (ultra) | bcbio-1.2.3 (ultra) | bcbio-1.2.8 (ultra2) | bcbio-1.2.8 (ultra2 tools upgrade) | Notes
+---------|-------------|-------------|-------------|------|-------
+bcbio-nextgen | 1.1.5-b | **1.2.3** | **1.2.8** | 1.2.8 | Pipeline
+bcftools | 1.9 | **1.10.2** | **1.9** | 1.9 | VCF manipulation suite
+gatk4 | 4.1.2.0 | **4.1.8.1** | **4.2.1.0** | **4.2.5.0** | Alignment post-processing and variant calling
+picard | 2.20.5 | **2.23.3** | **2.25.7** | **2.26.10** | VCF/BAM manipulation suite - only one VCF tool used
+samtools | 1.9 | 1.9 | 1.9 | 1.9 | BAM manipulation suite 
+variant-effect-predictor | 97.3 | **100.4** | 100.4 | 100.4 | VCF annotations
+bamtools | 2.4.0 | 2.4.0 | 2.4.0 | 2.5.1 | BAM manipulation suite
+bcbio-variation | 0.2.6 | 0.2.6 | 0.2.6 | 0.2.6 
+bedtools | 2.27.1 | 2.27.1 | **2.30.0** | 2.30.0 | BED file manipulation suite
+biobambam | 2.0.87 | 2.0.87 | 2.0.87 | 2.0.87 | BAM manipulation suite
+bwa | 0.7.17 | 0.7.17 | 0.7.17 | 0.7.17 
+fastqc | 0.11.8 | 0.11.8 | 0.11.8 | 0.11.8 | QC checks on FASTQ files
+grabix | 0.1.8 | 0.1.8 | 0.1.8 | 0.1.8 |
+vt | 2015.11.10 | 2015.11.10 | 2015.11.10 | 2015.11.10 |
+vase | 0.2.4 | **0.4** | **0.4.2** | 0.4.2 | Identification of de novo variants
+rtg-tools | 3.10.1 | **3.11** | 3.11 | 3.11 | Used for GIAB concordance analysis
+
+Resource | bcbio-1.1.5 (ultra) | bcbio-1.2.3 (ultra) | bcbio-1.2.8 (ultra2) | Notes
+---------|-------------|-------------|-------------|------
+seq | 1000g-20150219_1 | 1000g-20150219_1 | 1000g-20150219_1 | 
+bwa | 1000g-20150219 | 1000g-20150219 | 1000g-20150219 | 
+ccds | r20 | r20 | r20 | 
+capture_regions | 20161202 | 20161202 | 20161202 | 
+coverage | 2018-10-16 | 2018-10-16 | 2018-10-16 | 
+prioritize | 20181227 | 20181227 | 20181227 | 
+dbsnp | 151-20180418 | **153-20180725** | **154-20210112** | VCF annotations
+hapmap_snps | 20160105 | 20160105 | 20160105 | 
+1000g_omni_snps | 20160105 | 20160105 | 20160105 | 
+ACMG56_genes | 20160726 | 20160726 | 20160726 | 
+1000g_snps | 20160105 | 20160105 | 20160105 | 
+mills_indels | 20160105 | 20160105 | 20160105 | 
+1000g_indels | 2.8_hg38_20150522 | 2.8_hg38_20150522 | 2.8_hg38_20150522 | 
+clinvar | 20190513 | 20190513 | **20210110** | VCF annotations, not used
+qsignature | 20160526 | 20160526 | 20160526 | 
+genesplicer | 2004.04.03 | 2004.04.03 | 2004.04.03 | 
+effects_transcript | 2017-03-16 | 2017-03-16 | 2017-03-16 | 
+varpon | 20181105 | 20181105 | 20181105 | 
+vcfanno | 20190119 | 20190119 | **20210204** | VCF annotations, not used
+viral | 2017.02.04 | 2017.02.04 | 2017.02.04 | 
+gnomad genomes | 2.1 | **3.0** | **3.1.1** | VCF annotations, used by G2P for MAF thresholding
+gnomad exomes | 2.1 | 2.1 | **2.1.1** | VCF annotations, used by G2P for MAF thresholding
+dbnsfp | 3.5a | not installed | not installed | VCF annotations, not used
diff --git a/extract_solo_FAM_PRO_ID.py b/extract_solo_FAM_PRO_ID.py
new file mode 100755
index 0000000000000000000000000000000000000000..903ab671b0166cb623ed023aeedb6b5410f5015b
--- /dev/null
+++ b/extract_solo_FAM_PRO_ID.py
@@ -0,0 +1,132 @@
+#	input:	the work folder which contains a PED subfolder where all family PED files were copied
+#	output:	only for singletons
+#		solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt
+#
+#       Author: MH
+#       last modified: MARCH 04, 2022
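+#
+#       example of a PED file line that qualifies as a singleton (tab-delimited;
+#       all IDs hypothetical - family_id is plateID_familyID, indi_id is indiID_familyID):
+#       16862_385295    107060_385295   0       0       2       2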
+
+
+
+import sys
+import os
+
+
+
+def go(work_dir):
+
+    out_fam_file = work_dir + '/solo_FAM_IDs.txt'
+    out_pro_file = work_dir + '/solo_PRO_IDs.txt'
+    out_f_p_file = work_dir + '/solo_FAM_PRO.txt'
+
+    out_fam_han = open(out_fam_file,'w')
+    out_pro_han = open(out_pro_file,'w')
+    out_f_p_han = open(out_f_p_file,'w')
+
+    cntr_fam = 0
+    cntr_pro = 0
+    cntr_f_p = 0
+
+
+    # go over the PED folder in the working dir and process each PED file
+    ped_dir = work_dir + '/PED'
+    print ""
+    print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)
+
+    for file in os.listdir(ped_dir):					# per each PED file
+        if file.endswith(".ped"):
+
+            print "  %s" % (file)
+            in_file = os.path.join(ped_dir, file)
+
+            # check how many lines are in the PED file - if more than one, it cannot be a singleton; ignore
+            num_lines = 0
+            in_han = open(in_file,'r')
+            for line in in_han:
+                num_lines += 1
+            in_han.close()
+
+            if num_lines > 1:
+                continue
+            if num_lines == 0:
+                print "ERROR: empty PED file %s" % (file)
+                raise SystemExit
+
+            # if here, the PED file contains exactly one line
+            # check if all fine: parents IDs = 0, kid with known sex and is affected
+            CHILD_ID = 0
+            FAM_ID = 0
+
+            in_han = open(in_file,'r')
+            for line in in_han:
+                data = [x.strip() for x in line.strip().split('\t')]
+
+                x_plate,x_fam = data[0].split('_')			# in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID
+                y_indi,y_fam = data[1].split('_')			# in the internal PED file, indi_id is indiID_familyID, split
+
+                if x_fam != y_fam:
+                    print "ERROR: FAMILY_ID mismatch in %s" % (file)
+                    print line
+                    raise SystemExit
+
+                FAM_ID = x_fam
+                CHILD_ID = y_indi
+
+                # check both parent IDs == 0
+                if (data[2] != '0') or (data[3] != '0'):
+                    print "ERROR: found parent ID for a singleton child in %s" % (file)
+                    print line
+                    raise SystemExit
+
+                # make sure the sex of the child is known
+                CHILD_SEX = int(data[4])
+                if (CHILD_SEX == 1) or (CHILD_SEX == 2):
+                    pass
+                else:
+                    print "ERROR: proband sex unknown in %s" % (file)
+                    print line
+                    raise SystemExit
+
+                # check kid is affected
+                if int(data[5]) != 2:
+                    print "ERROR: singleton child not affected"
+                    print line
+                    raise SystemExit
+
+            if FAM_ID == 0:
+                print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
+                raise SystemExit
+            if CHILD_ID == 0:
+                print "ERROR: Cannot find CHILD_ID in %s" % (file)
+                raise SystemExit
+            else:
+                out_fam_han.write('%s\n' % (FAM_ID))
+                cntr_fam += 1
+                out_pro_han.write('%s\n' % (CHILD_ID))
+                cntr_pro += 1
+                out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID))
+                cntr_f_p += 1
+
+    out_fam_han.close()
+    out_pro_han.close()
+    out_f_p_han.close()
+
+
+
+    print ""
+    print "Singleton Families:"
+    print "   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
+    print "   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
+    print "   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
+    print ""
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 2:
+        go(sys.argv[1])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>"
+        raise SystemExit
+
diff --git a/extract_trio_FAM_PRO_ID.py b/extract_trio_FAM_PRO_ID.py
index e66d8f4cef2867cc6ffabc94881ea97abcc58d60..5e2ae5348704133dadd32d236a75fe6262f61d9e 100755
--- a/extract_trio_FAM_PRO_ID.py
+++ b/extract_trio_FAM_PRO_ID.py
@@ -3,7 +3,7 @@
 #		FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt
 #
 #       Author: MH
-#       last modified: JUNE 06, 2019
+#       last modified: NOV 02, 2021
 
 
 
@@ -27,8 +27,6 @@ def go(work_dir):
     cntr_f_p = 0
 
 
-
-
     # go over the PED folder in the working dir and process each PED file
     ped_dir = work_dir + '/PED'
     print ""
@@ -37,30 +35,36 @@ def go(work_dir):
     for file in os.listdir(ped_dir):					# per each PED file
         if file.endswith(".ped"):
 #            print(os.path.join(ped_dir, file))
-                        
-            print "  %s" % (file) 
-            in_file = os.path.join(ped_dir, file)  
+
+            print "  %s" % (file)
+            in_file = os.path.join(ped_dir, file)
 
             CHILD_ID = 0
-            FAM_ID = 0            
+            FAM_ID = 0
 
             in_han = open(in_file,'r')
             for line in in_han:
                 data = [x.strip() for x in line.strip().split('\t')]
 
+                x_plate,x_fam = data[0].split('_')			# in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID
+                y_indi,y_fam = data[1].split('_')			# in the internal PED file, indi_id is indiID_familyID, split
+
+
+#                print "data[0]=%s,data[1]=%s,x_plate=%s,x_fam=%s,y_indi=%s,y_fam=%s" % (data[0],data[1],x_plate,x_fam,y_indi,y_fam)
+
                 if FAM_ID == 0:
-                    FAM_ID = data[0]
-                elif FAM_ID != data[0]:
+                    FAM_ID = x_fam
+                elif FAM_ID != x_fam:
                     print "ERROR: more than one FAMILY_ID in %s" % (file)
-                    raise SystemExit 
-                     
+                    raise SystemExit
+
                 if data[2] != '0' and data[3] != '0':			# this is the child in the trio
                     if CHILD_ID == 0:
-                        CHILD_ID = data[1]
+                        CHILD_ID = y_indi
                     else:						# seen another child
                         print "WARNING: already have seen a child (possibly a quad) in %s" % (file)
                         CHILD_ID = 0
-                        break 
+                        break
 
                     CHILD_SEX = int(data[4])
                     if (CHILD_SEX == 1) or (CHILD_SEX == 2):
@@ -73,28 +77,26 @@ def go(work_dir):
                     if int(data[5]) != 2:
                         print "ERROR: child in a trio not affected"
                         print line
-                        raise SystemExit  
+                        raise SystemExit
 
             if FAM_ID == 0:
                 print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
                 raise SystemExit
             if CHILD_ID == 0:
-                print "WARNING: Cannot find exactly one CHILD_ID in %s : not a trio --> will not be analyzed" % (file)  
+                print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file)
             else:
                 out_fam_han.write('%s\n' % (FAM_ID))
                 cntr_fam += 1
                 out_pro_han.write('%s\n' % (CHILD_ID))
                 cntr_pro += 1
                 out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID))
-                cntr_f_p += 1 
-                  
+                cntr_f_p += 1
+
     out_fam_han.close()
     out_pro_han.close()
     out_f_p_han.close()
 
-    out_fam_file = work_dir + '/FAM_IDs.txt'
-    out_pro_file = work_dir + '/PRO_IDs.txt'
-    out_f_p_file = work_dir + '/FAM_PRO.txt'
+
 
     print ""
     print "Trio Families:"
@@ -111,6 +113,6 @@ if __name__ == '__main__':
     if len(sys.argv) == 2:
         go(sys.argv[1])
     else:
-        print "Suggested use: time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py /scratch/u035/u035/shared/analysis/wes_pilot/03062019"
+        print "Suggested use: time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>"
         raise SystemExit
 
diff --git a/filter_LQ_GT.py b/filter_LQ_GT.py
index c94305de259de1056c6ba278d7fe13ab006b717f..16169e07e2f96287f7e75a633cb5f89dc73b378f 100644
--- a/filter_LQ_GT.py
+++ b/filter_LQ_GT.py
@@ -3,7 +3,7 @@
 #
 #
 #       Author: MH
-#       last modified: SEPT 27, 2019
+#       last modified: JUNE 06, 2019
 
 
 
@@ -15,7 +15,8 @@ import gzip
 
 num_ALT_THERSH = int(3)
 VAF_THRESH = float(0.2)
-BLACKLIST = {}                  # key: 'chr:pos:ref:alt'; value: irrelevant
+BLACKLIST = {}			# key: 'chr:pos:ref:alt'; value: irrelevant
+
 
 
 def go(black_file,in_file,out_file):
@@ -60,7 +61,7 @@ def go(black_file,in_file,out_file):
             if GT_IDX != 0:
                 print "ERROR: GT at weird place"
                 print line
-                raise SystemExit 
+                raise SystemExit
         except:
             print "ERROR: Cannot find the GT label in the FORMAT field"
             print line
@@ -84,11 +85,11 @@ def go(black_file,in_file,out_file):
 
         # check if GT needs reseting
         for y in xrange(9,len(data)):
-            cntr_vars += 1 
+            cntr_vars += 1
             needs_reset = False
             this_var = [z.strip() for z in data[y].split(':')]
             this_VAR_GT = this_var[GT_IDX]
-            
+
             if this_VAR_GT == './.':				# no need to reset, it is already a no-call
                 new_line = new_line + '%s\t' % (data[y])
                 continue
@@ -100,7 +101,7 @@ def go(black_file,in_file,out_file):
                 num_alt = int(0)
             else:
                 num_ref,num_alt = this_VAR_AD.split(',')
-                if num_ref == '.':				
+                if num_ref == '.':
                     num_ref = int(0)
                 else:
                     num_ref = int(num_ref)
@@ -112,7 +113,7 @@ def go(black_file,in_file,out_file):
             if (num_alt + num_ref) == 0:
                 needs_reset = True
             else:
-                VAF = float(num_alt)/float(num_alt+num_ref) 
+                VAF = float(num_alt)/float(num_alt+num_ref)
 
             if num_alt < num_ALT_THERSH:
                 needs_reset = True
@@ -133,7 +134,7 @@ def go(black_file,in_file,out_file):
 
         # write the new line
         new_line = new_line[:-1] + '\n'
-        out_han.write(new_line)    
+        out_han.write(new_line)
 
     in_han.close()
     out_han.close()
@@ -141,7 +142,7 @@ def go(black_file,in_file,out_file):
     perc_reset = (float(cntr_reset)/float(cntr_vars))*100.0
     print ""
     print ""
-    print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" 
+    print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
     print "Read a total of %s sites and %s individual GT; %s of them (%.2f%%) were LQ non-ref and were reset to no-call (./.)" % (cntr_sites,cntr_vars,cntr_reset,perc_reset) 
     print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
     print ""
@@ -149,6 +150,9 @@ def go(black_file,in_file,out_file):
 
 
 
+
+
+
 def read_blacklist(in_file):
     in_han = open(in_file,'r')
     for line in in_han:
@@ -168,10 +172,14 @@ def read_blacklist(in_file):
 
 
 
+
+
+
+
 if __name__ == '__main__':
     if len(sys.argv) == 4:
         go(sys.argv[1],sys.argv[2],sys.argv[3])
     else:
-        print "Suggested use: time $PYTHON2 ${BLACKLIST} ${VCF_DIR}/${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${FAMILY_ID}.clean.vcf"
+        print "Suggested use: time $PYTHON2 ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf"
         raise SystemExit
 
diff --git a/gather_quad_results.sh b/gather_quad_results.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c20a8b6b58929b0a9210a5a317079c3e379c6055
--- /dev/null
+++ b/gather_quad_results.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=01:00:00
+#SBATCH --job-name=gather_quad_results
+#SBATCH --output=gather_quad_results.%A_%a.out
+#SBATCH --error=gather_quad_results.%A_%a.err
+
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=${BASE}/${PROJECT_ID}
+NHS_DIR=${WORK_DIR}/${BATCH_NUM}_${VERSION_N}_results
+
+
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,                      e.g. 16862
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+echo "FAMILY_ID = ${FAMILY_ID}"         # the family ID of this family with affected probands
+echo "JOB_ID = ${JOB_ID}"               # the numerical part of the Job id generated by EPCC when running process_NHS_WES_aff_probands.sh for this family
+echo "KID_1_ID = ${KID_1_ID}"
+echo "KID_2_ID = ${KID_2_ID}"
+
+
+# check that ${NHS_DIR} exists - if not, exit and ask for it to be created
+if [ ! -d "${NHS_DIR}" ]; then
+  echo "${NHS_DIR} does not exist - need to create it before running this script!!!!"
+  exit
+fi
+
+
+
+# create the family folder for the results
+FAM_DIR=${NHS_DIR}/${PLATE_ID}_${FAMILY_ID}
+if [ -d "${FAM_DIR}" ]; then
+  echo "${FAM_DIR} already exists - delete if you want to overwrite!!!!"
+  exit
+fi
+mkdir ${FAM_DIR}
+
+
+# copy the LOG files
+cp ${WORK_DIR}/LOG/process_quad.${JOB_ID}_*.err ${FAM_DIR}
+cp ${WORK_DIR}/LOG/process_quad.${JOB_ID}_*.out ${FAM_DIR}
+
+
+# copy the G2P html reports for the two trios and the affected sib-pair
+cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_${KID_1_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_1_ID}.report.html ${FAM_DIR}
+cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_${KID_2_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_2_ID}.report.html ${FAM_DIR}
+cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.html ${FAM_DIR}
+
+
+# copy (VASE) de novo variants in each proband VCF file
+cp ${WORK_DIR}/VASE/${PLATE_ID}_${FAMILY_ID}_${KID_1_ID}.ready.denovo.vcf ${FAM_DIR}
+cp ${WORK_DIR}/VASE/${PLATE_ID}_${FAMILY_ID}_${KID_2_ID}.ready.denovo.vcf ${FAM_DIR}
+
+
+# copy the DECIPHER files for bulk upload
+cp ${WORK_DIR}/DECIPHER/${KID_1_ID}_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_1_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_2_ID}_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_2_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_1_ID}_${FAMILY_ID}_shared_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_1_ID}_${FAMILY_ID}_shared_DECIPHER_v10.xlsx ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_2_ID}_${FAMILY_ID}_shared_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${KID_2_ID}_${FAMILY_ID}_shared_DECIPHER_v10.xlsx ${FAM_DIR}
+
+
+# copy the variant snapshots
+IGV_SNAP_DIR=${FAM_DIR}/IGV_snapshots
+mkdir ${IGV_SNAP_DIR}
+cp -r ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_${KID_1_ID} ${IGV_SNAP_DIR}
+cp -r ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_${KID_2_ID} ${IGV_SNAP_DIR}
+cp -r ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_shared ${IGV_SNAP_DIR}
+
+
+# copy proband coverage files
+cp ${WORK_DIR}/COV/${KID_1_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR}
+cp ${WORK_DIR}/COV/${KID_2_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR}
+cp ${WORK_DIR}/COV/${KID_1_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR}
+cp ${WORK_DIR}/COV/${KID_2_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR}
+
+
+
+echo "OK: Results for ${FAMILY_ID} are stored in ${FAM_DIR}"
+
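+# Example submission (values hypothetical) - the script takes no arguments and reads
+# its parameters from the environment, so they are passed to sbatch via --export, e.g.:
+#
+#   sbatch --export=ALL,PROJECT_ID=<project>,BATCH_NUM=<batch>,PLATE_ID=<plate>,VERSION_N=<version>,FAMILY_ID=<family>,JOB_ID=<job>,KID_1_ID=<kid1>,KID_2_ID=<kid2> \
+#     /home/u035/u035/shared/scripts/gather_quad_results.sh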
diff --git a/gather_shared_results.sh b/gather_shared_results.sh
new file mode 100755
index 0000000000000000000000000000000000000000..084b66bfb9febd6a2a103bc57ffa1e1a0fba4a01
--- /dev/null
+++ b/gather_shared_results.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=01:00:00
+#SBATCH --job-name=gather_shared_results
+#SBATCH --output=gather_shared_results.%A_%a.out
+#SBATCH --error=gather_shared_results.%A_%a.err
+
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=${BASE}/${PROJECT_ID}
+NHS_DIR=${WORK_DIR}/${BATCH_NUM}_${VERSION_N}_results
+
+
+
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID						e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "FAMILY_ID = ${FAMILY_ID}"         # the family ID of this family with affected probands
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+echo "JOB_ID = ${JOB_ID}"		# the numerical part of the Job id generated by EPCC when running process_NHS_WES_aff_probands.sh for this family
+
+
+# check that ${NHS_DIR} exists - if not, exit and ask to run the standard trio-based analysis first
+if [ ! -d "${NHS_DIR}" ]; then
+  echo "${NHS_DIR} does not exist - need to run standard trio-based analysis first!!!!"
+  exit
+fi
+
+
+# create the family folder for the shared results for this family
+FAM_DIR=${NHS_DIR}/${PLATE_ID}_${FAMILY_ID}_shared
+if [ -d "${FAM_DIR}" ]; then
+  echo "${FAM_DIR} already exists - delete if you want to overwrite!!!!"
+  exit
+fi
+mkdir ${FAM_DIR}
+
+
+# copy the LOG files
+cp ${WORK_DIR}/LOG/process_shared.${JOB_ID}_*.err ${FAM_DIR}
+cp ${WORK_DIR}/LOG/process_shared.${JOB_ID}_*.out ${FAM_DIR}
+
+
+# copy the G2P family html report
+cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.html ${FAM_DIR}
+
+
+# copy all the DECIPHER files for bulk upload
+cp ${WORK_DIR}/DECIPHER/*_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/*_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
+
+
+# copy the variant snapshots
+cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR}
+
+
+# copy the coverage files
+cp ${WORK_DIR}/COV/*_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR}
+cp ${WORK_DIR}/COV/*_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR}
+
+
+echo ""
+echo ""
+echo "OK: Results for ${FAMILY_ID} are stored in ${FAM_DIR}"
+
diff --git a/gather_solo_results.sh b/gather_solo_results.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ab8d6865c22f7230bac425a1ef78e1d644d66ef4
--- /dev/null
+++ b/gather_solo_results.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=01:00:00
+#SBATCH --job-name=gather_results
+#SBATCH --output=gather_solo_results.%A_%a.out
+#SBATCH --error=gather_solo_results.%A_%a.err
+
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh ###
+
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=${BASE}/${PROJECT_ID}
+NHS_DIR=${WORK_DIR}/${BATCH_NUM}_${VERSION_N}_results
+
+
+# other files to be used
+FAMILY_IDS=${WORK_DIR}/solo_FAM_IDs.txt                                                      # created by trio_setup.sh
+CHILD_IDS=${WORK_DIR}/solo_PRO_IDs.txt                                                       # created by trio_setup.sh
+
+
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 16862
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+
+# check that ${NHS_DIR} exists - if not, exit and ask for it to be created
+if [ ! -d "${NHS_DIR}" ]; then
+  echo "${NHS_DIR} does not exist - need to create it before running this script!!!!"
+  exit
+fi
+
+
+#~## enable running singletons
+#~#if [ -z $PBS_ARRAY_INDEX ]
+#~#then
+#~#  if [ -z $INDEX ]
+#~#  then
+#~#    export PBS_ARRAY_INDEX=1
+#~#  else
+#~#    export PBS_ARRAY_INDEX=$INDEX
+#~#  fi
+#~#fi
+
+
+
+FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1`				# contains only the family IDs (e.g.385295)
+PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1`				# contains only the proband IDs (e.g. 107060)
+
+
+# create the family folder for the results
+FAM_DIR=${NHS_DIR}/${PLATE_ID}_${FAMILY_ID}
+mkdir ${FAM_DIR}
+
+
+# copy the DECIPHER-to-INTERNAL ID mapping
+cp ${WORK_DIR}/solo_DECIPHER_INTERNAL_IDs.txt ${FAM_DIR}
+
+
+# copy the LOG files
+cp ${WORK_DIR}/LOG/process_solo.*_${SLURM_ARRAY_TASK_ID}.err ${FAM_DIR}
+cp ${WORK_DIR}/LOG/process_solo.*_${SLURM_ARRAY_TASK_ID}.out ${FAM_DIR}
+
+
+# copy the G2P family html report
+cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.html ${FAM_DIR}
+
+
+# copy the DECIPHER file for bulk upload
+cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
+
+
+# copy the variant snapshots
+cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR}
+
+
+# copy proband coverage files
+cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR}
+cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR}
+
+echo "OK: Results for ${FAMILY_ID} are stored in ${FAM_DIR}"
+
diff --git a/gather_trio_results.sh b/gather_trio_results.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7325bffbcb4d76771d147504145f2a6fdf30adbe
--- /dev/null
+++ b/gather_trio_results.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=01:00:00
+#SBATCH --job-name=gather_results
+#SBATCH --output=gather_results.%A_%a.out
+#SBATCH --error=gather_results.%A_%a.err
+
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh ###
+
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=${BASE}/${PROJECT_ID}
+NHS_DIR=${WORK_DIR}/${BATCH_NUM}_${VERSION_N}_results
+
+
+# other files to be used
+FAMILY_IDS=${WORK_DIR}/FAM_IDs.txt                                                      # created by trio_setup.sh
+CHILD_IDS=${WORK_DIR}/PRO_IDs.txt                                                       # created by trio_setup.sh
+
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 16862
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+
+# check if ${NHS_DIR} already exists - if not, exit and ask to be created
+if [ ! -d "${NHS_DIR}" ]; then
+  echo "ERROR: ${NHS_DIR} does not exist - create it before running this script"
+  exit 1
+fi
+
+
+#~## enable running singletons
+#~#if [ -z $PBS_ARRAY_INDEX ]
+#~#then
+#~#  if [ -z $INDEX ]
+#~#  then
+#~#    export PBS_ARRAY_INDEX=1
+#~#  else
+#~#    export PBS_ARRAY_INDEX=$INDEX
+#~#  fi
+#~#fi
+
+
+
+FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1`				# contains only the family IDs (e.g. 385295)
+PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1`				# contains only the proband IDs (e.g. 107060)
+
+
+# create the family folder for the results
+FAM_DIR=${NHS_DIR}/${PLATE_ID}_${FAMILY_ID}
+mkdir -p ${FAM_DIR}
+
+
+# copy the VASE de novo variants in the proband VCF file
+cp ${WORK_DIR}/VASE/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf ${FAM_DIR}
+
+
+# copy the DECIPHER-to-INTERNAL ID mapping
+cp ${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt ${FAM_DIR}
+
+
+# copy the LOG files
+cp ${WORK_DIR}/LOG/process_trio.*_${SLURM_ARRAY_TASK_ID}.err ${FAM_DIR}
+cp ${WORK_DIR}/LOG/process_trio.*_${SLURM_ARRAY_TASK_ID}.out ${FAM_DIR}
+
+
+# copy the G2P family html report
+cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.html ${FAM_DIR}
+
+
+# copy the DECIPHER file for bulk upload
+cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
+
+
+# copy the variant snapshots
+cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR}
+
+
+# copy proband coverage files
+cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR}
+cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR}
+
+echo "OK: Results for ${FAMILY_ID} are stored in ${FAM_DIR}"
+
diff --git a/generate_DEC_IGV_aff_sib_scripts_from_quad.py b/generate_DEC_IGV_aff_sib_scripts_from_quad.py
new file mode 100755
index 0000000000000000000000000000000000000000..29bfaa22a5ddaf9e6a049a9b6793d62945c0894d
--- /dev/null
+++ b/generate_DEC_IGV_aff_sib_scripts_from_quad.py
@@ -0,0 +1,573 @@
+#	input:
+#		the family PED file
+#		G2P text output for the family		[${FAMILY_ID}.report.txt]
+#		the joint and individual VCFs
+#
+#
+#	output (per affected proband):
+#		DECIPHER formatted file (all shared G2P variants)
+#		IGV snapshot script file
+#
+#	checks:
+#		all G2P variants found in the individual VCF
+#
+#       Author: MH
+#       last modified: JAN 21, 2022
+
+
+
+
+import sys
+import os
+import csv
+import gzip
+from collections import defaultdict
+
+
+ASSEMBLY = 'GRCh38'
+INTERGENIC = 'No'
+ACCESS = 'No'
+
+
+
+TRANS_DICT = {}				# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
+KIDS_SEX_DICT = {}			# key: <indi_fam_id>; value: sex (in the format 46XX/46XY)
+KIDS_G2P_DICT = defaultdict(dict)	# 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+KIDS_VCF_DICT = defaultdict(dict)	# 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR)
+SHARED_DICT = {}			# key: chr:start:ref:alt; value: (ZYG,gene,trans)
+NUM_SHARED_G2P_VARS = 0
+SNAP_FLANK = 25
+
+FS_THRESH = float(60)
+SOR_THRESH = float(3)
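+# NOTE: these match GATK's recommended strand-bias hard-filter cut-offs for SNPs
+# (FisherStrand FS > 60.0, StrandOddsRatio SOR > 3.0); variants at or above both
+# thresholds are only flagged in the IGV snapshot name below, not excluded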
+
+
+
+
+## create the names of the needed files
+#PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped
+#IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
+#FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_shared
+#FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}
+
+### call the python script
+#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_sib_from_quad.py \
+#${DECIPHER_ID} \
+#${TRANS_MAP} \
+#${PED_FILE} \
+#${IN_G2P_FILE} \
+#${FAM_IGV_DIR} \
+#${VCF_DIR} \
+#${PLATE_ID} \
+#${FAMILY_ID} \
+#${DEC_DIR} \
+#${FAM_BAM_DIR}
+
+
+
+
+
+def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
+
+    # read the transcript mapping file
+    read_trans_map(trans_map_file)
+
+    # read the ped file and establish KID_ID + KID_SEX
+    read_ped(ped_file)
+
+    # read the G2P output for this family
+    read_G2P(in_g2p_file)
+
+    # now read the individual VCFs and record all the variants
+    # list of all ids
+    proband_ids = KIDS_G2P_DICT.keys()
+    for pro_id in proband_ids:
+        vcf_file = '%s/%s_%s_shared.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,pro_id)
+        read_all_VCF_vars(vcf_file,KIDS_VCF_DICT,pro_id)
+    print ""
+    for k,v in KIDS_VCF_DICT.iteritems():
+        print "Found %s unique VCF variants for affected proband (%s)" % (len(v),k)
+    print ""
+    sys.stdout.flush()
+
+
+    print "Going over the varaints in each affected proband and checking each if it is shared G2P variant (to keep)"
+    # setup the DECIPHER and IGV snapshot output files - per each affected proband
+    proband_ids = KIDS_G2P_DICT.keys()
+    for pro_id in proband_ids:
+
+        num_out_vars = 0	# must be == NUM_SHARED_G2P_VARS
+
+        out_dec_file = '%s/%s_shared_DEC_FLT.csv' % (dec_dir,pro_id)
+        out_han = open(out_dec_file,'w')
+        out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
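+        # illustrative example of one output row (made-up values):
+        # 1234,1,1000,GRCh38,A,G,ENST00000238647,OTOGL,No,46XX,,No,,,,"Unknown",,,,Heterozygous,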
+
+        # setup the IGV snapshot file
+        out_igv_file = '%s/IGV/%s.shared.snapshot.FLT.txt' % (dec_dir,pro_id)
+        out_igv_han = open(out_igv_file,'w')
+        out_igv_han.write('new\n')
+        out_igv_han.write('genome hg38\n')
+        out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
+        out_igv_han.write('new\n')
+
+        child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,pro_id,pro_id)
+        out_igv_han.write('load %s\n' % (child_bam))
+        out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
+        out_igv_han.write('\n')
+
+        # go over the individual's VCF variants, check if found in the shared G2P variants, if yes - output it (with VCF's coordinates)
+        # KIDS_VCF_DICT = defaultdict(dict)       # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR)
+        # SHARED_DICT = {}                        # key: chr:start:ref:alt; value: (ZYG,gene,trans)
+
+        pro_vcf_vars = KIDS_VCF_DICT[pro_id]
+        for pro_vcf_var,fs_sor in pro_vcf_vars.iteritems():
+            chr,pos,ref,alt = pro_vcf_var.split(':')
+            pos = int(pos)
+            FS = fs_sor[0]
+            SOR = fs_sor[1]
+
+            # adjust pro_vcf_var for indels to match G2P style of recording
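+            # worked example of the mapping (illustrative coordinates):
+            #   SNP  chr1:1000 A>G    -> 'chr1:1000:A:G'
+            #   DEL  chr1:1000 ACT>A  -> 'chr1:1001:CT:-'  (drop the shared leading base, shift start by +1)
+            #   INS  chr1:1000 A>ACT  -> 'chr1:1001:-:CT'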
+            if len(ref) == len(alt):							# SNP
+                if len(ref) != 1:
+                    print "ERROR: MNPs are not supported!"
+                    print pro_vcf_var
+                    raise SystemExit
+                G2P_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            elif len(ref) > len(alt):							# DEL
+                if len(alt) != 1:
+                    print "ERROR with a deletion"
+                    print pro_vcf_var
+                    raise SystemExit
+                G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
+            elif len(ref) < len(alt):							# INS
+                if len(ref) != 1:
+                    print "ERROR with an insertion"
+                    print pro_vcf_var
+                    raise SystemExit
+                G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
+            else:
+                print "Cannot establish the type of this VCF variant"
+                print pro_vcf_var
+                raise SystemExit
+
+            if G2P_key_to_match not in SHARED_DICT:					# an individual variant which is not in the shared G2P output
+                continue
+
+            # if here, this variant is in the shared G2P output, write it out
+            print "\t%s:\tfound %s (VCF) -> %s (shared G2P)" % (pro_id,pro_vcf_var,G2P_key_to_match)
+
+            GT = SHARED_DICT[G2P_key_to_match][0]
+            gene = SHARED_DICT[G2P_key_to_match][1]
+            trans = SHARED_DICT[G2P_key_to_match][2]
+
+            inher_stat = 'Unknown'
+
+            if (chr != 'chrX') and (chr != 'chrY'):
+                if GT == 'HET':
+                    genotype = 'Heterozygous'
+                elif GT == 'HOM':
+                    genotype = 'Homozygous'
+                else:
+                    print "ERROR: Cannot understand GT = %s" % (GT)
+                    raise SystemExit
+
+            elif (chr == 'chrX') or (chr == 'chrY'):
+                if KIDS_SEX_DICT[pro_id] == '46XX':                 # a girl
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif KIDS_SEX_DICT[pro_id] == '46XY':               # a boy
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                        print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (pro_id,chr,pos,ref,alt,pro_vcf_var)
+                    elif GT == 'HOM':
+                        genotype = 'Hemizygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                    raise SystemExit
+            else:
+                print "ERROR: unknown chr"
+                print pro_vcf_var
+                raise SystemExit
+
+            # write to the DECIPHER file
+            gene_id_idx = gene.find('(')
+            if gene_id_idx == -1:
+                gene_id_idx = len(gene)
+            gene_id = gene[0:gene_id_idx]
+
+            if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                safe_trans = TRANS_DICT[trans]
+            else:
+                safe_trans = trans
+
+            to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (dec_id,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,KIDS_SEX_DICT[pro_id],ACCESS,inher_stat,genotype)
+            out_han.write(to_write)
+
+            # write to the IGV file
+            i_s = pos - SNAP_FLANK
+            i_e = pos + SNAP_FLANK
+
+            # check if above FS/SOR_THRESH to include in the snapshot name
+            if (FS == '') or (SOR == ''):
+                flag = 'NA'
+            elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+            else:
+                flag = 'OK'
+            i_name = '%s_%s_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt,flag)
+
+            out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+            out_igv_han.write('sort strand\n')
+            out_igv_han.write('squish\n')
+            out_igv_han.write('snapshot %s\n' % (i_name))
+            out_igv_han.write('\n')
+
+            num_out_vars += 1
+
+        out_han.close()
+        out_igv_han.close()
+        if num_out_vars == NUM_SHARED_G2P_VARS:
+            print "\t%s:\tNumber of output variants matches the number of shared variants: OK" % (pro_id)
+        else:
+            print "\t%s:\tERROR: number of output variants does NOT match the number of shared variants" % (pro_id)
+        print "\t%s:\tdecipher file = %s" % (pro_id,out_dec_file)
+        print "\t%s:\tigv snapshot file for %s" % (pro_id,out_igv_file)
+        print "\t--------------------------------"
+
+
+
+
+
+
+def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id):
+
+    cntr = 0
+    in_han = gzip.open(in_vcf_file,'r')
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        cntr += 1
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+        # extract FS and SOR
+        FS = ''
+        SOR = ''
+        infos = [y.strip() for y in data[7].strip().split(';')]
+        for info in infos:
+            if info.startswith('FS='):
+                tag,FS = info.split('=')
+                FS = float(FS)
+            elif info.startswith('SOR='):
+                tag,SOR = info.split('=')
+                SOR = float(SOR)
+
+        # the VCF was split and normalized upstream - there should be no multiallelic variants
+        if alt.find(',') != -1:
+            print "ERROR: found multiallelic variant"
+            print line
+            raise SystemExit
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+
+        if pro_id not in THIS_DICT:
+            THIS_DICT[pro_id][key] = (FS,SOR)
+        elif key not in THIS_DICT[pro_id]:
+            THIS_DICT[pro_id][key] = (FS,SOR)
+        else:
+            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
+            raise SystemExit
+
+    in_han.close()
+
+
+
+
+
+
+
+
+def read_G2P(in_file):
+
+    global NUM_SHARED_G2P_VARS
+
+#.#    known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']
+    known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het']
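+    # (these are G2P allelic-requirement categories - roughly: autosomal dominant,
+    #  autosomal recessive, X-linked hemizygous and X-linked heterozygous, respectively)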
+
+    # to make sure no duplicate vars per indi
+    CHECK_DICT = defaultdict(dict)       # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant
+
+    # first, read the G2P variants on canonical transcripts for each of the affected probands
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+
+        # get the individual_id
+        sam_id = data[0]
+
+        # if, in addition to the affected siblings, there is an unaffected parent, they would be in the family VCF
+        # and would have been G2P-ed - they must have been excluded by this point
+        if sam_id not in KIDS_SEX_DICT:
+            print "ERROR: In the G2P file found a sample which is not an affected kid = %s !!!" % (sam_id)
+            raise SystemExit
+
+        # ignore variants not on canonical transcripts
+        is_canon = data[3]
+        if is_canon != 'is_canonical':
+            continue
+
+        # split the variants based on the gene's OBS model of inheritance
+        inher_model = data[4]
+        aaa,OBS_state = inher_model.split('=')
+
+        if OBS_state not in known_OBS_states:
+            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
+            raise SystemExit
+
+        # get the gene name in format ENSG00000165899(C12orf64,OTOGL)
+        gene_name = data[1]
+
+        # get the transcript name in format ENST00000238647
+        transcript = data[2]
+
+        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
+        var_list = [y.strip() for y in data[6].split(';')]
+        for v in var_list:
+            v_details = [z.strip() for z in v.split(':')]
+            chr = v_details[0]
+            start = int(v_details[1])
+            end = int(v_details[2])
+            ref = v_details[3]
+            alt = v_details[4]
+            GT = v_details[5]
+            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
+
+##########################################################################
+#            check_key = '%s:%s' % (sam_id,second_key)
+#            if check_key not in CHECK_DICT:
+#                CHECK_DICT[check_key][OBS_state] = 1
+#            elif OBS_state not in CHECK_DICT[check_key].keys():
+#                CHECK_DICT[check_key][OBS_state] = 1
+#            else:
+#                print "ERROR: a duplicate variant %s in %s gene for CHILD = %s, OBS_state = %s" % (check_key,gene_name,sam_id,OBS_state)
+#                raise SystemExit
+#
+#            if sam_id not in KIDS_G2P_DICT:
+#                KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+#            elif second_key not in KIDS_G2P_DICT[sam_id]:
+#                KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+#            else:
+##                print "ERROR: a duplicate variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+##                raise SystemExit
+#                pass	# the same variant in diff OBS_state   - see above !
+############################################################################
+
+
+            ##########################################
+            ### to deal with the new output of G2P ###
+            ##########################################
+
+            check_key = '%s:%s' % (sam_id,second_key)
+            if check_key not in CHECK_DICT:						# first time we see this var in this sample, any OBS_state
+                CHECK_DICT[check_key][OBS_state] = 1
+                if sam_id not in KIDS_G2P_DICT:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif second_key not in KIDS_G2P_DICT[sam_id]:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                else:									# sanity check
+                    print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                    raise SystemExit
+
+            elif OBS_state not in CHECK_DICT[check_key].keys():				# first time we see this var in this sample with this OBS_state
+                CHECK_DICT[check_key][OBS_state] = 1
+                if sam_id not in KIDS_G2P_DICT:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif second_key not in KIDS_G2P_DICT[sam_id]:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript):	# diff OBS_state, but must have same (GT,gene_name,transcript)
+                    pass
+                else:
+                    print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                    raise SystemExit
+
+            else:       # same individual, same variant, known OBS_state
+                        # due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq
+                        # check the gene name in KIDS_G2P_DICT[sam_id][second_key]
+                if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'):             # recorded is refseq
+                    if gene_name.startswith('ENSG'):                                        # this is ensembl
+                        KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)       # replace
+                    else:                                                                   # this is refseq again, ignore
+                        pass
+                else:                                                                       # recorded is ensembl, ignore
+                    pass
+
+
+    in_han.close()
+    print ""
+    print ""
+    print "Found the following variants on canonical transcripts in the G2P output for these affected probands"
+
+    for id,val in KIDS_G2P_DICT.iteritems():
+        print "--------------------------"
+        for k,v in val.iteritems():
+            print "    %s\t%s\t%s" % (id,k,v)
+    print ""
+    print ""
+
+
+
+
+    ###################################################################################
+    ####    SHARED variant filtering                                               ####
+    ####    select only variants seen in all affected probands with the same GT    ####
+    ###################################################################################
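+    # illustrative example: a variant HET in every affected proband is kept;
+    # a variant HET in one proband but HOM in a sibling is excluded (GT mismatch)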
+
+    print ""
+    print "===   SHARED variant filtering   ==="
+
+    # list of all ids
+    proband_ids = KIDS_G2P_DICT.keys()
+    print "All affected probands = %s" % (proband_ids)
+
+    # for each proband, go through all of their variants and check if each is seen in all other probands; if yes, record it in SHARED_DICT
+    for pro_id in proband_ids:
+        other_pro_ids = []
+        for aaa in proband_ids:
+            other_pro_ids.append(aaa)
+        other_pro_ids.remove(pro_id)
+        print "Analyzing variants in %s, to be compared against the variants in all other affected probands %s" % (pro_id,other_pro_ids)
+
+        # go through all of their variants
+        pro_vars = KIDS_G2P_DICT[pro_id]            # a dict with keys: chr,start,end,ref,alt and values: (GT,gene_name,transcript)
+        for var_loc,var_info in pro_vars.iteritems():
+            found_in_all = True
+
+            # check if seen in all probands excl this one
+            for o_id in other_pro_ids:
+                if var_loc not in KIDS_G2P_DICT[o_id]:
+                    print "  Excluding variant %s in %s, since not seen in %s" % (var_loc,pro_id,o_id)
+                    found_in_all = False
+                    break
+
+                # if variant found, check if GT matches
+                else:
+                    o_info = KIDS_G2P_DICT[o_id][var_loc]
+                    if var_info[0] != o_info[0]:
+                        print "  Excluding variant %s in %s (GT = %s); it is seen in %s but GT does not match (ST = %s)" % (var_loc,pro_id,var_info[0],o_id,o_info[0])
+                        found_in_all = False
+                        break
+
+            if found_in_all:	# this variant has been found in all affected probands with matching GT, keep it
+                if var_loc not in SHARED_DICT:		# it has not been recorded previously when considering another proband
+
+                    # for consistency with the standard trio-based processing
+                    # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
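+                    # e.g. (illustrative): a non-normalized INS recorded as ref='AC', alt='ACTT'
+                    # becomes ref='-', alt='TT' at the same start position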
+                    chr,start,end,ref,alt = var_loc.split(":")
+                    if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+                        if len(ref) < len(alt):                                 # an INS
+                            orig_ref = ref
+                            orig_alt = alt
+                            ref = '-'
+                            alt = orig_alt[len(orig_ref):]
+                            print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,start,end,orig_ref,orig_alt,chr,start,ref,alt)
+                        else:                                                   # a DEL
+                            print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                            print line
+                            raise SystemExit
+
+                    new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+                    SHARED_DICT[new_key] = var_info
+                    print "  Keeping %s found in all affected probands, same GT" % (new_key)
+
+
+        print "---------------------"
+
+    NUM_SHARED_G2P_VARS = len(SHARED_DICT)
+    print "Found %s unique and canonical G2P variants SHARED between all %s affected probands in this family" % (NUM_SHARED_G2P_VARS,len(proband_ids))
+    sys.stdout.flush()
+    print ""
+
+
+
+
+
+
+
+
+
+
+
+def read_ped(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        kid_id = data[1]
+        kid_se = int(data[4])
+        if kid_se == 1:		# boy
+            kid_sex =  '46XY'
+        elif kid_se == 2:	# girl
+            kid_sex =  '46XX'
+        else:
+            print "ERROR: proband sex unknown"
+            print line
+            raise SystemExit
+        if kid_id not in KIDS_SEX_DICT:
+            KIDS_SEX_DICT[kid_id] = kid_sex
+        else:
+            print "ERROR: proband sex unknown"
+            print line
+            raise SystemExit
+    in_han.close()
+    print "Found the following affected probands"
+    for k,v in KIDS_SEX_DICT.iteritems():
+        print "    %s: %s" % (k,v)
+    sys.stdout.flush()
+
+
+
+
+
+def read_trans_map(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        old_trans_id = data[0]
+        new_trans_id = data[1]
+        if old_trans_id not in TRANS_DICT:
+            TRANS_DICT[old_trans_id] = new_trans_id
+        else:
+            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 11:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV_aff_probands.py \
+        dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
+        raise SystemExit
+
diff --git a/generate_DEC_IGV_scripts.py b/generate_DEC_IGV_scripts.py
new file mode 100755
index 0000000000000000000000000000000000000000..81892871a58e221e5c1928e1d8d73028bff0393b
--- /dev/null
+++ b/generate_DEC_IGV_scripts.py
@@ -0,0 +1,1404 @@
+#	input:
+#		the family PED file				[${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped]
+#		individual VCF file for the trio proband	[${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]
+#		G2P text output for the trio			[${FAMILY_ID}.report.txt]
+#		VASE output					[${SAMPLE_ID}.clean.strict.denovo.vcf]
+#
+#
+#	output:
+#		DECIPHER formatted file for the proband
+#		- all G2P variants
+#		- denovo variants marked as such
+#
+#	checks:
+#		all G2P variants found in the individual VCF
+#		all VASE denovo variants found in the individual VCF
+#
+#       Author: MH
+#       last modified: JAN 18, 2022
+
+
+
+import sys
+import os
+import csv
+import gzip
+from collections import defaultdict
+
+
+ASSEMBLY = 'GRCh38'
+INTERGENIC = 'No'
+ACCESS = 'No'
+
+
+G2P_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
+G2P_DATA = {}		# key: chr:pos:ref:alt; value: (transcript,gene,GT)
+VASE_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID
+
+
+NUM_UNIQ_G2P_VARS = 0
+NUM_UNIQ_VASE_VARS = 0
+
+
+CHILD_ID = 0
+CHILD_SEX = 0
+DEC_CHILD_SEX = 'unknown'
+
+MOM_ID = 0
+MOM_STAT = 0	# set from the PED file; compared later as 'UNAFFECTED'/'AFFECTED'
+
+DAD_ID = 0
+DAD_STAT = 0	# set from the PED file; compared later as 'UNAFFECTED'/'AFFECTED'
+
+
+ALL_CHILD_DICT = {}		# key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
+ALL_MOM_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant
+ALL_DAD_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant
+
+
+CHILD_INHER_DICT = {}		# key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown'
+
+SNAP_FLANK = 25
+
+
+MAP_DICT = {}			# key: family_id (aka decipher_id); value: internal (decipher) ID
+TRANS_DICT = {}			# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
+
+
+FS_THRESH = float(60)
+SOR_THRESH = float(3)
+
+
+
+def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
+
+    # read the decipher to internal ID mapping file
+    read_map_file(dec_map_file)
+
+
+    # read the transcript mapping file
+    read_trans_map(trans_map_file)
+
+
+    # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
+    read_ped(ped_file)
+
+    if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (DAD_STAT != 0):
+        print "======================================"
+        print "Analyzing:"
+        print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
+        print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT)
+        print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT)
+        print "======================================"
+        sys.stdout.flush()
+    else:
+        print "ERROR: problems reading the PED file = %s" % (ped_file)
+        raise SystemExit
+
+
+    # read the G2P output for this family
+    read_G2P(in_g2p_file)
+
+
+    # read the VASE output for this family
+    read_VASE(in_vase_file)
+
+
+    # now read the individual VCFs and record all the variants
+    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
+    mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID)
+    dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID)
+
+
+    read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
+    print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
+    sys.stdout.flush()
+
+    read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT)
+    print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID)
+    sys.stdout.flush()
+
+    read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT)
+    print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID)
+    sys.stdout.flush()
+
+
+    # now go over all child variants and set the inheritance
+    num_child_vars_assigned = 0
+    for key,v in ALL_CHILD_DICT.iteritems():
+        if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT):
+            CHILD_INHER_DICT[key] = 'Biparental'
+            num_child_vars_assigned += 1
+        elif key in ALL_MOM_DICT:
+            CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother'
+            num_child_vars_assigned += 1
+        elif key in ALL_DAD_DICT:
+            CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father'
+            num_child_vars_assigned += 1
+        else:
+            CHILD_INHER_DICT[key] = 'Unknown'
+
+    assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0
+
+    print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
+    sys.stdout.flush()
+
+
+
+
+
+
+    # setup the DECIPHER output file
+    out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID)		################################
+    out_han = open(out_dec_file,'w')
+    out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
+
+
+    # setup the IGV snapshot file
+    out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID)	#################################
+    out_igv_han = open(out_igv_file,'w')
+    out_igv_han.write('new\n')
+    out_igv_han.write('genome hg38\n')
+    out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
+    out_igv_han.write('new\n')
+
+    child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
+    mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID)
+    dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID)
+    out_igv_han.write('load %s\n' % (child_bam))
+    out_igv_han.write('load %s\n' % (mom_bam))
+    out_igv_han.write('load %s\n' % (dad_bam))
+
+    out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
+    out_igv_han.write('\n')
+
+
+    # now read the child VCF and check if each variant is in the G2P/VASE output; if yes:
+    # set the value in the dict to 1
+    # write it out to the output file
+
+    in_cntr = 0
+    out_cntr = 0
+
+    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
+    in_han = gzip.open(child_vcf_file,'r')
+
+
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        in_cntr += 1
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+        # extract FS and SOR
+        FS = ''
+        SOR = ''
+        infos = [y.strip() for y in data[7].strip().split(';')]
+        for info in infos:
+            if info.startswith('FS='):
+                tag,FS = info.split('=')
+                FS = float(FS)
+            elif info.startswith('SOR='):
+                tag,SOR = info.split('=')
+                SOR = float(SOR)
+
+        VCF_VAR = data[9]
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+        inher_stat = CHILD_INHER_DICT[key]
+
+
+
+        ##############################################################
+        # different processing depending on being a SNP, INS, or DEL #
+        ##############################################################
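+        # key conventions (illustrative): VASE keys keep the raw VCF coordinates
+        # (e.g. 'chr1:1000:ACT:A' for a deletion), while G2P keys are trimmed and
+        # shifted ('chr1:1001:CT:-' for the same deletion); SNP keys are identical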
+
+        if len(ref) == len(alt):			# SNP
+            if len(ref) != 1:
+                print "ERROR: MNPs are not supported!"
+                print line
+                raise SystemExit
+
+            key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            is_denovo = False
+            if key_to_match in VASE_DICT:
+                VASE_DICT[key_to_match] = 1
+                is_denovo = True
+            if key_to_match in G2P_DICT:
+                G2P_DICT[key_to_match] = 1
+                trans = G2P_DATA[key_to_match][0]
+                gene = G2P_DATA[key_to_match][1]
+                GT = G2P_DATA[key_to_match][2]
+
+                if is_denovo:
+                    if inher_stat == 'Unknown':
+                        inher_stat = 'De novo constitutive'
+                    else:
+                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
+                        raise SystemExit
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':			# a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':		# a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:				# if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+
+        elif len(ref) > len(alt):			# DEL
+            if len(alt) != 1:
+                print "ERROR with a deletion"
+                print line
+                raise SystemExit
+
+            G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
+            VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            is_denovo = False
+            if VASE_key_to_match in VASE_DICT:
+                VASE_DICT[VASE_key_to_match] = 1
+                is_denovo = True
+            if G2P_key_to_match in G2P_DICT:
+                G2P_DICT[G2P_key_to_match] = 1
+                trans = G2P_DATA[G2P_key_to_match][0]
+                gene = G2P_DATA[G2P_key_to_match][1]
+                GT = G2P_DATA[G2P_key_to_match][2]
+
+                if is_denovo:
+                    if inher_stat == 'Unknown':
+                        inher_stat = 'De novo constitutive'
+                    else:
+                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
+                        raise SystemExit
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':                 # a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':               # a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+
+        elif len(ref) < len(alt):                       # INS
+            if len(ref) != 1:
+                print "ERROR with an insertion"
+                print line
+                raise SystemExit
+
+            G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
+            VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            is_denovo = False
+            if VASE_key_to_match in VASE_DICT:
+                VASE_DICT[VASE_key_to_match] = 1
+                is_denovo = True
+            if G2P_key_to_match in G2P_DICT:
+                G2P_DICT[G2P_key_to_match] = 1
+                trans = G2P_DATA[G2P_key_to_match][0]
+                gene = G2P_DATA[G2P_key_to_match][1]
+                GT = G2P_DATA[G2P_key_to_match][2]
+
+                if is_denovo:
+                    if inher_stat == 'Unknown':
+                        inher_stat = 'De novo constitutive'
+                    else:
+                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
+                        raise SystemExit
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':                 # a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':               # a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+        else:
+            print "Cannot establish the type of this VCF variant"
+            print line
+            raise SystemExit
+
+    in_han.close()
+    out_han.close()
+    out_igv_han.close()
+
+
+
+
+
+
+
+    ### check if all G2P and VASE variants were found/matched in the proband's VCF
+    found_all_G2P = True
+    found_all_VASE = True
+
+    for k,v in G2P_DICT.iteritems():
+        if int(v) == 0:
+            print k
+            found_all_G2P = False
+            break
+
+    for k,v in VASE_DICT.iteritems():
+        if int(v) == 0:
+            print k
+            found_all_VASE = False
+            break
+
+    if found_all_G2P:
+        print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT))
+    else:
+        print "ERROR: Could not find all G2P variants in the probands VCF file"
+        raise SystemExit
+
+    if found_all_VASE:
+        print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT))
+    else:
+        print "ERROR: Could not find all VASE variants in the probands VCF file"
+        raise SystemExit
+
+    ### check if all G2P variants are written out
+    if out_cntr == NUM_UNIQ_G2P_VARS:
+        print "OK: All G2P vars are recorded in the output DECIPHER file"
+    else:
+        print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"
+
+
+    print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file)
+    print "The batch snapshot file = %s" % (out_igv_file)
+    sys.stdout.flush()
+
+
+
+
+
+
+
+
+
+
+def read_all_VCF_vars(in_vcf_file,THIS_DICT):
+
+    in_han = gzip.open(in_vcf_file,'r')
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+
+        # the VCF was split and normalized upstream - there should be no multiallelic variants
+        if alt.find(',') != -1:
+            print "ERROR: found multiallelic variant"
+            print line
+            raise SystemExit
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+        if key not in THIS_DICT:
+            THIS_DICT[key] = 1
+        else:
+            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
+            raise SystemExit
+
+    in_han.close()
+
+
+
+
+
+
+
+def read_VASE(in_file):
+
+    global NUM_UNIQ_VASE_VARS
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+        # ignore header lines
+        if line.startswith('#'):
+            continue
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = data[1]
+        ref = data[3]
+        alt = data[4]
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+
+        if key not in VASE_DICT:
+            VASE_DICT[key] = 0
+        else:
+            print "ERROR: duplicate VASE variant key = %s" % (key)
+            raise SystemExit
+
+    in_han.close()
+    NUM_UNIQ_VASE_VARS = len(VASE_DICT)
+    print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+
+
+
+
+
+
+
+
+
+
+
+
+def read_G2P(in_file):
+
+    global NUM_UNIQ_G2P_VARS
+
+#.#    known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']
+    known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het']
+
+    # first, read the G2P variants on canonical transcripts for each of the family members
+    CHILD_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+    MOM_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+    DAD_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+
+        # get the individual_id
+        sam_id = data[0]
+
+        # ignore variants not on canonical transcripts
+        is_canon = data[3]
+        if is_canon != 'is_canonical':
+            continue
+
+        # split the variants based on the gene's OBS model of inheritance
+        inher_model = data[4]
+        aaa,OBS_state = inher_model.split('=')
+
+        if OBS_state not in known_OBS_states:
+            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
+            raise SystemExit
+
+        # get the gene name in format ENSG00000165899(C12orf64,OTOGL) or gene-MYT1L(MYT1L)
+        gene_name = data[1]
+
+        # get the transcript name in format ENST00000238647 or gene-MYT1L(MYT1L)
+        transcript = data[2]
+
+
+        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
+        var_list = [y.strip() for y in data[6].split(';')]
+        for v in var_list:
+            v_details = [z.strip() for z in v.split(':')]
+            chr = v_details[0]
+            start = int(v_details[1])
+            end = int(v_details[2])
+            ref = v_details[3]
+            alt = v_details[4]
+            GT = v_details[5]
+            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
+
+
+            if sam_id == CHILD_ID:
+                # check for duplication
+                if OBS_state not in CHILD_DICT:
+                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in CHILD_DICT[OBS_state]:
+                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:		# already recorded this variant
+                     		# if we have refseq recorded and this is ensembl --> replace
+                    if not CHILD_DICT[OBS_state][second_key][1].startswith('ENSG'):		# recorded is refseq
+                        if gene_name.startswith('ENSG'):					# this is ensembl
+                            CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) 	# replace
+                        else:									# this is refseq again, ignore
+                            pass
+                    else:									# recorded is ensembl, ignore
+                        pass
+
+            elif sam_id == MOM_ID:
+                # check for duplication
+                if OBS_state not in MOM_DICT:
+                    MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in MOM_DICT[OBS_state]:
+                    MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:           # already recorded this variant
+                                # if we have refseq recorded and this is ensembl --> replace
+                    if not MOM_DICT[OBS_state][second_key][1].startswith('ENSG'):		# recorded is refseq
+                        if gene_name.startswith('ENSG'):                                        # this is ensembl
+                            MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript)		# replace
+                        else:                                                                   # this is refseq again, ignore
+                            pass
+                    else:                                                                       # recorded is ensembl, ignore
+                        pass
+
+            elif sam_id == DAD_ID:
+                # check for duplication
+                if OBS_state not in DAD_DICT:
+                    DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in DAD_DICT[OBS_state]:
+                    DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:           # already recorded this variant
+                                # if we have refseq recorded and this is ensembl --> replace
+                    if not DAD_DICT[OBS_state][second_key][1].startswith('ENSG'):               # recorded is refseq
+                        if gene_name.startswith('ENSG'):                                        # this is ensembl
+                            DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)         # replace
+                        else:                                                                   # this is refseq again, ignore
+                            pass
+                    else:                                                                       # recorded is ensembl, ignore
+                        pass
+
+            else:
+                print "ERROR: cannot identify the person for this variant"
+                print line
+                raise SystemExit
+
+    in_han.close()
+
+
+    ### print out the number of unique G2P variants in CHILD ###
+    child_mono = 0
+    child_bi = 0
+    child_hem = 0
+    child_het = 0
+
+    if 'monoallelic_autosomal' in CHILD_DICT:
+        child_mono = len(CHILD_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in CHILD_DICT:
+        child_bi = len(CHILD_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in CHILD_DICT:
+        child_hem = len(CHILD_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in CHILD_DICT:
+        child_het = len(CHILD_DICT['monoallelic_X_het'])
+
+    print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID)
+    print "    monoallelic_autosomal: %s" % (child_mono)
+    print "    biallelic_autosomal: %s" % (child_bi)
+    print "    monoallelic_X_hem: %s" % (child_hem)
+    print "    monoallelic_X_het: %s" % (child_het)
+
+
+
+
+    ### print out the number of unique G2P variants in MOM ###
+    mom_mono = 0
+    mom_bi = 0
+    mom_hem = 0
+    mom_het = 0
+
+    if 'monoallelic_autosomal' in MOM_DICT:
+        mom_mono = len(MOM_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in MOM_DICT:
+        mom_bi = len(MOM_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in MOM_DICT:
+        mom_hem = len(MOM_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in MOM_DICT:
+        mom_het = len(MOM_DICT['monoallelic_X_het'])
+
+    print "MOM (%s): number of unique G2P variants on canon transcript in the following OBS states" % (MOM_ID)
+    print "    monoallelic_autosomal: %s" % (mom_mono)
+    print "    biallelic_autosomal: %s" % (mom_bi)
+    print "    monoallelic_X_hem: %s" % (mom_hem)
+    print "    monoallelic_X_het: %s" % (mom_het)
+
+
+
+
+    ### print out the number of unique G2P variants in DAD ###
+    dad_mono = 0
+    dad_bi = 0
+    dad_hem = 0
+    dad_het = 0
+
+    if 'monoallelic_autosomal' in DAD_DICT:
+        dad_mono = len(DAD_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in DAD_DICT:
+        dad_bi = len(DAD_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in DAD_DICT:
+        dad_hem = len(DAD_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in DAD_DICT:
+        dad_het = len(DAD_DICT['monoallelic_X_het'])
+
+    print "DAD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (DAD_ID)
+    print "    monoallelic_autosomal: %s" % (dad_mono)
+    print "    biallelic_autosomal: %s" % (dad_bi)
+    print "    monoallelic_X_hem: %s" % (dad_hem)
+    print "    monoallelic_X_hem: %s" % (dad_het)
+    sys.stdout.flush()
+
+
+
+
+
+    ######################################################################################################
+    ####    Dominant filtering                                                                        ####
+    ####    if the gene has been considered under the dominant model (OBS == monoallelic_autosomal)   ####
+    ####    exclude child variants seen in UNAFFECTED mother/father, regardless of GT                 ####
+    ######################################################################################################
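+    # worked example (hypothetical coordinates): a HET child variant chr2:5000:5000:A:G
+    # in a monoallelic_autosomal gene is excluded below if the same variant is carried
+    # by an UNAFFECTED parent (whatever the parent's GT); it is kept if neither parent
+    # carries it or if the carrying parent is AFFECTED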
+
+
+    print ""
+    print "===   monoallelic autosomal (DOMINANT) filtering   ==="
+
+
+    for key in CHILD_DICT['monoallelic_autosomal']:	# this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2]
+
+        if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"):
+            MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0]
+            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT)
+            continue
+
+        if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"):
+            DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0]
+            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+            continue
+
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
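+        # worked example (hypothetical): a non-normalized INS recorded as chr1:100:100:AT:ATG
+        # is rewritten below as chr1:100:-:G, with '-' standing for the empty REF allele as in
+        # the VEP/G2P coordinate style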
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_start = start
+                orig_ref = ref
+                orig_alt = alt
+                start = orig_start
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                print line
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic)
+
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+    print ""
+
+
+
+
+
+    ##############################################################################################################
+    ####    Recessive filtering                                                                               ####
+    ####    under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene           ####
+    ####    must all be HET in CHILD, GT in parent does not matter                                            ####
+    ####    all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) ####
+    ####    and this parent must be unaffected                                                                ####
+    ####    if all these: then exclude all child variants in this gene                                        ####
+    ##############################################################################################################
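+    # worked example (hypothetical): two HET child variants in the same biallelic gene,
+    # both traced to the unaffected mother --> both excluded; one from the unaffected
+    # mother and one from the unaffected father (a potential compound het in trans) -->
+    # both kept for further processing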
+
+
+    print ""
+    print "===   biallelic autosomal (RECESSIVE) filtering   ==="
+
+
+    GENE_KEY_GT = defaultdict(dict)		# for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans)
+
+    # process all variants in biallelic genes in child
+    for key in CHILD_DICT['biallelic_autosomal']:		# this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+        b_GT = CHILD_DICT['biallelic_autosomal'][key][0]
+        b_gene = CHILD_DICT['biallelic_autosomal'][key][1]
+        b_trans = CHILD_DICT['biallelic_autosomal'][key][2]
+        GENE_KEY_GT[b_gene][key] = (b_GT,b_trans)
+
+    # iterate over genes in GENE_KEY_GT
+    for g in GENE_KEY_GT: 			# this is the biallelic gene name
+        all_HET = True
+
+        # iterate over variants in this gene
+        for kx in GENE_KEY_GT[g]:		# this is the second key: chr:start:end:ref:alt
+            if GENE_KEY_GT[g][kx][0] == 'HOM':     # there is a HOM variant in the child - NO filtering
+                all_HET = False
+                break
+
+        if all_HET:				# for this gene
+            # all variants in this gene in the CHILD are HET - check if all come from a single unaffected parent
+            # if yes, filter them out and write a message to the log file
+            # if not, they will be added to G2P_DICT and G2P_DATA for further processing
+
+            all_from_one_parent = True
+
+            # iterate again over the variants in this gene
+            VAR_SOURCE_LIST = {}		# key: chr:start:end:ref:alt in child; value: 'NONE', or 'MOM'/'DAD'/'BOTH' if the variant is seen in that UNAFFECTED parent
+
+            for ky in GENE_KEY_GT[g]:		# this is the second key: chr:start:end:ref:alt
+
+                this_var_status = 'NONE'
+
+                if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"):
+                    this_var_status = 'MOM'
+                if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"):
+                    if this_var_status == 'NONE':
+                        this_var_status = 'DAD'
+                    elif this_var_status == 'MOM':
+                        this_var_status = 'BOTH'
+
+                VAR_SOURCE_LIST[ky] = this_var_status
+
+            # have collected the parent source for all variants in this gene
+            tot_num_vars = len(VAR_SOURCE_LIST)
+            num_mom = 0
+            num_dad = 0
+            num_none = 0
+            for kt,v in VAR_SOURCE_LIST.iteritems():
+                if v == 'NONE':
+                    num_none += 1
+                elif v == 'MOM':
+                    num_mom += 1
+                elif v == 'DAD':
+                    num_dad += 1
+                elif v == 'BOTH':
+                    num_mom += 1
+                    num_dad += 1
+                else:
+                    print "ERROR: cannot understand the source parent = %s" % (v)
+                    raise SystemExit
+
+            if num_none > 0:
+                all_from_one_parent = False
+            elif num_mom < tot_num_vars and num_dad < tot_num_vars:
+                all_from_one_parent = False
+
+            # if all variants in the child in this gene are found in single unaffected parent - filter out
+            if all_from_one_parent:
+                for kz in GENE_KEY_GT[g]:
+                    print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz])
+                continue
+
+        # end of the all-HET case - if all variants in this gene came from a single
+        # unaffected parent, they have been excluded, a message written to the log,
+        # and we have moved on to the next biallelic gene in the child
+
+        # if here:
+        # - either not all CHILD variants in this gene are HET, or
+        # - not all of them can be traced to a single unaffected parent
+        # --> add them for further processing
+
+        # we are at gene level here - iterate over all variants in this gene
+        for kkk in GENE_KEY_GT[g]:                # this is the second key: chr:start:end:ref:alt
+
+            CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0]
+            CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1]
+            CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2]
+
+            # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+            chr,start,end,ref,alt = kkk.split(":")
+            if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+                if len(ref) < len(alt):                                 # an INS
+                    orig_start = start
+                    orig_ref = ref
+                    orig_alt = alt
+                    start = orig_start
+                    ref = '-'
+                    alt = orig_alt[len(orig_ref):]
+                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+                else:                                                   # a DEL
+                    print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                    print line
+                    raise SystemExit
+
+            new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+            # record the data for CHILD G2P variants (for OBS=biallelic)
+            if new_key not in G2P_DICT:
+                G2P_DICT[new_key] = 0
+            else:
+                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+                # raise SystemExit
+                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+                pass
+
+            # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+            if new_key not in G2P_DATA:
+                G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+            else:
+                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+                # raise SystemExit
+                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+                pass
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+    print ""
+
+
+
+
+
+
+
+
+
+    ####################################################################################################################
+    ####    X-linked filtering                                                                                      ####
+#.#    ####    under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance)      ####
+    ####    under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het)                                    ####
+    ####    exclude child HET variants if seen as HOM in UNAFFECTED father                                          ####
+    ####													    ####
+    ####    Note 18/01/2022    									    		    ####
+    ####    This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het  ####
+    ####    and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance           ####
+    ####    the code below treats x-linked over-dominance as the others (i.e. filters, while it should not)         ####
+    ####    Issue flagged to G2P plug-in team, awaiting their fix						    ####
+    ####    for now manually scan the output of G2P for the proband (both for boys and girls)                       ####
+    ####        to check if any variant has been called in PCDH19 and EFNB1                                         ####
+    ####    also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name    ####
+    ####################################################################################################################
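+    # worked example (hypothetical): a HET child variant on chrX is excluded below if the
+    # UNAFFECTED father carries it as HOM (i.e. hemizygous); HOM variants in the child are
+    # never filtered here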
+
+
+    print ""
+    print "===   X-linked filtering   ==="
+
+    #######################################
+    ### process monoallelic_X_hem genes ###
+    #######################################
+
+    for key in CHILD_DICT['monoallelic_X_hem']:       # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2]
+
+        if CHILD_GT == 'HOM':							# do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl)
+            pass
+        else:
+            if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"):
+                DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0]
+                if DAD_GT == 'HOM':						# i.e., hemizygous variant in unaffected father
+                    print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+                    continue
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_start = start
+                orig_ref = ref
+                orig_alt = alt
+                start = orig_start
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                print line
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem)
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+
+    #######################################
+    ### process monoallelic_X_het genes ###
+    #######################################
+
+    for key in CHILD_DICT['monoallelic_X_het']:       # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2]
+
+        if CHILD_GT == 'HOM':                                                   # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl)
+            pass
+        else:
+            if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"):
+                DAD_GT = DAD_DICT['monoallelic_X_het'][key][0]
+                if DAD_GT == 'HOM':                                             # i.e., x-linked dominant variant in unaffected father
+                    print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+                    continue
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_start = start
+                orig_ref = ref
+                orig_alt = alt
+                start = orig_start
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                print line
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic_X_het)
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+
+#.#    ########################################################################
+#.#    ### process x-linked over-dominance  genes - no filtering to be done ###
+#.#    ########################################################################
+
+#.#    for key in CHILD_DICT['x-linked over-dominance']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+#.#        CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0]
+#.#        CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1]
+#.#        CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2]
+
+#.#        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+#.#        chr,start,end,ref,alt = key.split(":")
+#.#        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+#.#            if len(ref) < len(alt):                                 # an INS
+#.#                orig_start = start
+#.#                orig_ref = ref
+#.#                orig_alt = alt
+#.#                start = orig_start
+#.#                ref = '-'
+#.#                alt = orig_alt[len(orig_ref):]
+#.#                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+#.#            else:                                                   # a DEL
+#.#                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+#.#                print line
+#.#                raise SystemExit
+
+#.#        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+#.#        # record the data for CHILD G2P variants (for OBS=x-linked over-dominance)
+#.#        if new_key not in G2P_DICT:
+#.#            G2P_DICT[new_key] = 0
+#.#        else:
+#.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+#.#            # raise SystemExit
+#.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+#.#            pass
+
+#.#        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+#.#        if new_key not in G2P_DATA:
+#.#            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+#.#        else:
+#.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+#.#            # raise SystemExit
+#.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+#.#            pass
+
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+    print ""
+    print ""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def read_ped(in_file):
+
+    global CHILD_ID
+    global CHILD_SEX
+    global DEC_CHILD_SEX
+    global MOM_ID
+    global MOM_STAT
+    global DAD_ID
+    global DAD_STAT
+
+    CHILD_ID = 0
+    CHILD_SEX = 0
+    MOM_ID = 0
+    MOM_STAT = 0
+    DAD_ID = 0
+    DAD_STAT = 0
+
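+    # expected PED columns (tab-separated): FAM_ID, INDI_ID, DAD_ID, MOM_ID,
+    # SEX (1 = male, 2 = female), AFF_STATUS (1 = unaffected, 2 = affected);
+    # a parent record carries '0' in both the DAD_ID and MOM_ID fields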
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        if data[2] != '0' and data[3] != '0':			# this is the child in the trio
+            if CHILD_ID == 0:
+                CHILD_ID = data[1]
+            else:						# seen another child
+                print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment"
+                raise SystemExit
+
+            if DAD_ID == 0:
+                DAD_ID = data[2]
+            else:
+                if data[2] != DAD_ID:
+                    print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID)
+                    raise SystemExit
+            if MOM_ID == 0:
+                MOM_ID = data[3]
+            else:
+                if data[3] != MOM_ID:
+                    print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID)
+                    raise SystemExit
+
+            CHILD_SEX = int(data[4])
+            if CHILD_SEX == 1:		# boy
+                DEC_CHILD_SEX = '46XY'
+            elif CHILD_SEX == 2:	# girl
+                DEC_CHILD_SEX = '46XX'
+            else:
+                print "ERROR: proband sex unknown"
+                print line
+                raise SystemExit
+
+            if int(data[5]) != 2:
+                print "ERROR: child not affected"
+                print line
+                raise SystemExit
+
+
+        elif int(data[2]) == 0 and int(data[3]) == 0:		# this is a parent record
+            if int(data[4]) == 1:				# this is the dad
+                if int(data[5]) == 1:
+                    DAD_STAT = "UNAFFECTED"
+                elif int(data[5]) == 2:
+                    DAD_STAT = "AFFECTED"
+                else:
+                    print "ERROR: cannot establish the dad's status"
+                    print line
+                    raise SystemExit
+
+                if DAD_ID == 0:
+                    DAD_ID = data[1]
+                else:
+                    if data[1] != DAD_ID:
+                        print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID)
+                        raise SystemExit
+
+            if int(data[4]) == 2:                               # this is the mom
+                if int(data[5]) == 1:
+                    MOM_STAT = "UNAFFECTED"
+                elif int(data[5]) == 2:
+                    MOM_STAT = "AFFECTED"
+                else:
+                    print "ERROR: cannot establish mom's status"
+                    print line
+                    raise SystemExit
+
+                if MOM_ID == 0:
+                    MOM_ID = data[1]
+                else:
+                    if data[1] != MOM_ID:
+                        print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID)
+                        raise SystemExit
+        else:
+            print "ERROR: problematic PED line"
+            print line
+            raise SystemExit
+
+
+
+
+
+
+def read_map_file(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        dec_id = data[0]
+        int_id = data[1]
+        if dec_id not in MAP_DICT:
+            MAP_DICT[dec_id] = int_id
+        else:
+            print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+def read_trans_map(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        old_trans_id = data[0]
+        new_trans_id = data[1]
+        if old_trans_id not in TRANS_DICT:
+            TRANS_DICT[old_trans_id] = new_trans_id
+        else:
+            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 12:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \
+        dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
+        raise SystemExit
+
diff --git a/generate_DEC_IGV_shared_scripts.py b/generate_DEC_IGV_shared_scripts.py
new file mode 100755
index 0000000000000000000000000000000000000000..687a3138f1685d9706e56a0c8729a80bf11310d3
--- /dev/null
+++ b/generate_DEC_IGV_shared_scripts.py
@@ -0,0 +1,568 @@
+#	input:
+#		the family PED file
+#		G2P text output for the family		[${FAMILY_ID}.report.txt]
+#		the joint and individual VCFs
+#
+#
+#	output (per affected proband):
+#		DECIPHER formatted file (all shared G2P variants)
+#		IGV snapshot script file
+#
+#	checks:
+#		all G2P variants found in the individual VCF
+#
+#       Author: MH
+#       last modified: JAN 20, 2022
+
+
+
+
+import sys
+import os
+import csv
+import gzip
+from collections import defaultdict
+
+
+ASSEMBLY = 'GRCh38'
+INTERGENIC = 'No'
+ACCESS = 'No'
+
+
+
+TRANS_DICT = {}				# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
+KIDS_SEX_DICT = {}			# key: <indi_fam_id>; value: sex (in the format 46XX/46XY)
+KIDS_G2P_DICT = defaultdict(dict)	# 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+KIDS_VCF_DICT = defaultdict(dict)	# 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR)
+SHARED_DICT = {}			# key: chr:start:ref:alt; value: (ZYG,gene,trans)
+NUM_SHARED_G2P_VARS = 0
+SNAP_FLANK = 25
+
+
+FS_THRESH = float(60)
+SOR_THRESH = float(3)
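+# NOTE (added comment): FS (Phred-scaled Fisher strand-bias p-value) and SOR (strand odds
+# ratio) are GATK strand-bias annotations; the cutoffs above match GATK's recommended SNP
+# hard-filter thresholds (FS > 60, SOR > 3) and are used below only to flag IGV snapshot
+# file names, not to exclude variants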
+
+
+
+### call the python script
+#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_aff_probands.py \
+#${DECIPHER_ID} \
+#${TRANS_MAP} \
+#${PED_FILE} \
+#${IN_G2P_FILE} \
+#${FAM_IGV_DIR} \
+#${VCF_DIR} \
+#${PLATE_ID} \
+#${FAMILY_ID} \
+#${DEC_DIR} \
+#${FAM_BAM_DIR}
+
+
+
+
+
+
+
+def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
+
+    # read the transcript mapping file
+    read_trans_map(trans_map_file)
+
+    # read the ped file and establish KID_ID + KID_SEX
+    read_ped(ped_file)
+
+    # read the G2P output for this family
+    read_G2P(in_g2p_file)
+
+    # now read the individual VCFs and record all the variants
+    # list of all ids
+    proband_ids = KIDS_G2P_DICT.keys()
+    for pro_id in proband_ids:
+        vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,pro_id)
+        read_all_VCF_vars(vcf_file,KIDS_VCF_DICT,pro_id)
+    print ""
+    for k,v in KIDS_VCF_DICT.iteritems():
+        print "Found %s unique VCF variants for affected proband (%s)" % (len(v),k)
+    print ""
+    sys.stdout.flush()
+
+
+    print "Going over the varaints in each affected proband and checking each if it is shared G2P variant (to keep)"
+    # setup the DECIPHER and IGV snapshot output files - per each affected proband
+    proband_ids = KIDS_G2P_DICT.keys()
+    for pro_id in proband_ids:
+
+        num_out_vars = 0	# must be == NUM_SHARED_G2P_VARS
+
+        out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,pro_id)
+        out_han = open(out_dec_file,'w')
+        out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
+
+        # setup the IGV snapshot file
+        out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,pro_id)
+        out_igv_han = open(out_igv_file,'w')
+        out_igv_han.write('new\n')
+        out_igv_han.write('genome hg38\n')
+        out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
+        out_igv_han.write('new\n')
+
+        child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,pro_id,pro_id)
+        out_igv_han.write('load %s\n' % (child_bam))
+        out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
+        out_igv_han.write('\n')
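+        # NOTE (added comment): the new/genome/load/goto/sort/squish/snapshot commands
+        # written to this file follow IGV's batch-script syntax (typically run with
+        # igv.sh -b <script>); the mkdir line above is presumably handled by the wrapper
+        # that invokes IGV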
+
+        # go over the individual's VCF variants, check if found in the shared G2P variants, if yes - output it (with VCF's coordinates)
+        # KIDS_VCF_DICT = defaultdict(dict)       # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR)
+        # SHARED_DICT = {}                        # key: chr:start:ref:alt; value: (ZYG,gene,trans)
+
+        pro_vcf_vars = KIDS_VCF_DICT[pro_id]
+        for pro_vcf_var,fs_sor in pro_vcf_vars.iteritems():
+            chr,pos,ref,alt = pro_vcf_var.split(':')
+            pos = int(pos)
+            FS = fs_sor[0]
+            SOR = fs_sor[1]
+
+            # adjust pro_vcf_var for indels to match G2P style of recording
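+            # worked example (hypothetical): a VCF DEL chr1:100:CT:C becomes the G2P-style
+            # key chr1:101:T:-, and a VCF INS chr1:100:C:CT becomes chr1:101:-:T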
+            if len(ref) == len(alt):							# SNP
+                if len(ref) != 1:
+                    print "ERROR: MNPs are not supported!"
+                    print pro_vcf_var
+                    raise SystemExit
+                G2P_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            elif len(ref) > len(alt):							# DEL
+                if len(alt) != 1:
+                    print "ERROR with a deletion"
+                    print pro_vcf_var
+                    raise SystemExit
+                G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
+            elif len(ref) < len(alt):							# INS
+                if len(ref) != 1:
+                    print "ERROR with an insertion"
+                    print pro_vcf_var
+                    raise SystemExit
+                G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
+            else:
+                print "Cannot establish the type of this VCF variant"
+                print line
+                raise SystemExit
+
+            if G2P_key_to_match not in SHARED_DICT:					# an individual variant which is not in the shared G2P output
+                continue
+
+            # if here, this variant is in the shared G2P output, write it out
+            print "\t%s:\tfound %s (VCF) -> %s (shared G2P)" % (pro_id,pro_vcf_var,G2P_key_to_match)
+
+            GT = SHARED_DICT[G2P_key_to_match][0]
+            gene = SHARED_DICT[G2P_key_to_match][1]
+            trans = SHARED_DICT[G2P_key_to_match][2]
+
+            inher_stat = 'Unknown'
+
+            if (chr != 'chrX') and (chr != 'chrY'):
+                if GT == 'HET':
+                    genotype = 'Heterozygous'
+                elif GT == 'HOM':
+                    genotype = 'Homozygous'
+                else:
+                    print "ERROR: Cannot understand GT = %s" % (GT)
+                    raise SystemExit
+
+            elif (chr == 'chrX') or (chr == 'chrY'):
+                if KIDS_SEX_DICT[pro_id] == '46XX':                 # a girl
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif KIDS_SEX_DICT[pro_id] == '46XY':               # a boy
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                        print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (pro_id,chr,pos,ref,alt,pro_vcf_var)
+                    elif GT == 'HOM':
+                        genotype = 'Hemizygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                    raise SystemExit
+            else:
+                print "ERROR: unknown chr"
+                print pro_vcf_var
+                raise SystemExit
+
+            # write to the DECIPHER file
+            gene_id_idx = gene.find('(')
+            if gene_id_idx == -1:
+                gene_id_idx = len(gene)
+            gene_id = gene[0:gene_id_idx]
+
+            if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                safe_trans = TRANS_DICT[trans]
+            else:
+                safe_trans = trans
+
+            to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (dec_id,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,KIDS_SEX_DICT[pro_id],ACCESS,inher_stat,genotype)
+            out_han.write(to_write)
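+            # e.g. (hypothetical IDs/values) a written row looks like:
+            # 1234,2,5000,GRCh38,A,G,ENST00000238647,OTOGL,No,46XY,,No,,,,"Unknown",,,,Heterozygous,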
+
+            # write to the IGV file
+            i_s = pos - SNAP_FLANK
+            i_e = pos + SNAP_FLANK
+
+            # check if above FS/SOR_THRESH to include in the snapshot name
+            if (FS == '') or (SOR == ''):
+                flag = 'NA'
+            elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+            else:
+                flag = 'OK'
+            i_name = '%s_%s_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt,flag)
+
+            out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+            out_igv_han.write('sort strand\n')
+            out_igv_han.write('squish\n')
+            out_igv_han.write('snapshot %s\n' % (i_name))
+            out_igv_han.write('\n')
+
+            num_out_vars += 1
+
+        out_han.close()
+        out_igv_han.close()
+        if num_out_vars == NUM_SHARED_G2P_VARS:
+            print "\t%s:\tNumber of output variants matches the number of shared variants: OK" % (pro_id)
+        else:
+            print "\t%s:\tERROR: number of output variants does NOT match the number of shared variants" % (pro_id)
+        print "\t%s:\tdecipher file = %s" % (pro_id,out_dec_file)
+        print "\t%s:\tigv snapshot file for %s" % (pro_id,out_igv_file)
+        print "\t--------------------------------"
+
+
+
+
+
+
+def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id):
+
+    cntr = 0
+    in_han = gzip.open(in_vcf_file,'r')
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        cntr += 1
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+        # extract FS and SOR
+        FS = ''
+        SOR = ''
+        infos = [y.strip() for y in data[7].strip().split(';')]
+        for info in infos:
+            if info.startswith('FS='):
+                tag,FS = info.split('=')
+                FS = float(FS)
+            elif info.startswith('SOR='):
+                tag,SOR = info.split('=')
+                SOR = float(SOR)
+
+        # did the splitting and normalizing - should not have multiallelic variants
+        if alt.find(',') != -1:
+            print "ERROR: found multiallelic variant"
+            print line
+            raise SystemExit
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+
+        if pro_id not in THIS_DICT:
+            THIS_DICT[pro_id][key] = (FS,SOR)
+        elif key not in THIS_DICT[pro_id]:
+            THIS_DICT[pro_id][key] = (FS,SOR)
+        else:
+            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
+            raise SystemExit
+
+    in_han.close()
+
+
+
+
+
+
+
+
+def read_G2P(in_file):
+
+    global NUM_SHARED_G2P_VARS
+
+#.#    known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']
+    known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het']
+
+    # to make sure no duplicate vars per indi
+    CHECK_DICT =  defaultdict(dict)       # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant
+
+    # first, read the G2P variants on canonical transcripts for each of the affected probands
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+
+        # get the individual_id
+        sam_id = data[0]
+
+        # if, in addition to the affected siblings, there is an unaffected parent, they would be
+        # in the family VCF and would have been run through G2P - such samples must have been
+        # excluded before this point!
+        if sam_id not in KIDS_SEX_DICT:
+            print "ERROR: In the G2P file found a sample which is not an affected kid = %s !!!" % (sam_id)
+            raise SystemExit
+
+        # ignore variants not on canonical transcripts
+        is_canon = data[3]
+        if is_canon != 'is_canonical':
+            continue
+
+        # split the variants based on the gene's OBS model of inheritance
+        inher_model = data[4]
+        aaa,OBS_state = inher_model.split('=')
+
+        if OBS_state not in known_OBS_states:
+            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
+            raise SystemExit
+
+        # get the gene name in format ENSG00000165899(C12orf64,OTOGL)
+        gene_name = data[1]
+
+        # get the transcript name in format ENST00000238647
+        transcript = data[2]
+
+        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
+        var_list = [y.strip() for y in data[6].split(';')]
+        for v in var_list:
+            v_details = [z.strip() for z in v.split(':')]
+            chr = v_details[0]
+            start = int(v_details[1])
+            end = int(v_details[2])
+            ref = v_details[3]
+            alt = v_details[4]
+            GT = v_details[5]
+            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
+
+###################################################################################
+#            check_key = '%s:%s' % (sam_id,second_key)
+#            if check_key not in CHECK_DICT:
+#                CHECK_DICT[check_key][OBS_state] = 1
+#            elif OBS_state not in CHECK_DICT[check_key].keys():
+#                CHECK_DICT[check_key][OBS_state] = 1
+#            else:
+#                print "ERROR: a duplicate variant %s in %s gene for CHILD = %s, OBS_state = %s" % (check_key,gene_name,sam_id,OBS_state)
+#                raise SystemExit
+#
+#            if sam_id not in KIDS_G2P_DICT:
+#                KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+#            elif second_key not in KIDS_G2P_DICT[sam_id]:
+#                KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+#            else:
+##                print "ERROR: a duplicate variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+##                raise SystemExit
+#                pass	# the same variant in diff OBS_state   - see above !
+###################################################################################
+
+
+            ##########################################
+            ### to deal with the new output of G2P ###
+            ##########################################
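+            # e.g. the same variant may now be reported twice - once with an Ensembl gene
+            # name such as ENSG00000165899(C12orf64,OTOGL) and once with a RefSeq one; the
+            # logic below keeps a single record per variant, preferring the Ensembl entry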
+
+            check_key = '%s:%s' % (sam_id,second_key)
+            if check_key not in CHECK_DICT:						# first time we see this var in this sample, any OBS_state
+                CHECK_DICT[check_key][OBS_state] = 1
+                if sam_id not in KIDS_G2P_DICT:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif second_key not in KIDS_G2P_DICT[sam_id]:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                else:									# sanity check
+                    print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                    raise SystemExit
+
+            elif OBS_state not in CHECK_DICT[check_key].keys():				# first time we see this var in this sample with this OBS_state
+                CHECK_DICT[check_key][OBS_state] = 1
+                if sam_id not in KIDS_G2P_DICT:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif second_key not in KIDS_G2P_DICT[sam_id]:
+                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript):    # diff OBS_state, but must have same (GT,gene_name,transcript)
+                    pass
+                else:
+                    print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                    raise SystemExit
+
+            else: 	# same individual, same variant, known OBS_state
+                        # due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq
+                        # check the gene name in KIDS_G2P_DICT[sam_id][second_key]
+                if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'):             # recorded is refseq
+                    if gene_name.startswith('ENSG'):                                        # this is ensembl
+                        KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)       # replace
+                    else:                                                                   # this is refseq again, ignore
+                        pass
+                else:                                                                       # recorded is ensembl, ignore
+                    pass
+
+    in_han.close()
+    print ""
+    print ""
+    print "Found the following variants on canonical transcripts in the G2P output for these affected probands"
+
+    for id,val in KIDS_G2P_DICT.iteritems():
+        print "--------------------------"
+        for k,v in val.iteritems():
+            print "    %s\t%s\t%s" % (id,k,v)
+    print ""
+    print ""
+
+
+
+
+    ###################################################################################
+    ####    SHARED variant filtering                                               ####
+    ####    select only variants seen in all affected probands with the same GT    ####
+    ###################################################################################
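+    # worked example (hypothetical): a variant chr3:1234:1234:C:T is kept only if every
+    # affected proband carries it with the same GT (e.g. all HET); if one sibling is HOM
+    # and another HET, or the variant is missing from any sibling, it is excluded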
+
+    print ""
+    print "===   SHARED variant filtering   ==="
+
+    # list of all ids
+    proband_ids = KIDS_G2P_DICT.keys()
+    print "All affected probands = %s" % (proband_ids)
+
+    # for each proband, go through all of their variants and check if each is seen in all other probands; if yes, record it in SHARED_DICT
+    for pro_id in proband_ids:
+        other_pro_ids = [o_id for o_id in proband_ids if o_id != pro_id]
+        print "Analyzing variants in %s, to be compared against the variants in all other affected probands %s" % (pro_id,other_pro_ids)
+
+        # go thru all of their variants
+        pro_vars = KIDS_G2P_DICT[pro_id]            # a dict with keys: chr,start,end,ref,alt and values: (GT,gene_name,transcript)
+        for var_loc,var_info in pro_vars.iteritems():
+            found_in_all = True
+
+            # check if seen in all probands excl this one
+            for o_id in other_pro_ids:
+                if var_loc not in KIDS_G2P_DICT[o_id]:
+                    print "  Excluding variant %s in %s, since not seen in %s" % (var_loc,pro_id,o_id)
+                    found_in_all = False
+                    break
+
+                # if variant found, check if GT matches
+                else:
+                    o_info = KIDS_G2P_DICT[o_id][var_loc]
+                    if var_info[0] != o_info[0]:
+                        print "  Excluding variant %s in %s (GT = %s); it is seen in %s but GT does not match (ST = %s)" % (var_loc,pro_id,var_info[0],o_id,o_info[0])
+                        found_in_all = False
+                        break
+
+            if found_in_all:	# this variant has been found in all affected probands with matching GT, keep it
+                if var_loc not in SHARED_DICT:		# it has not been recorded previously when considering another proband
+
+                    # for consistency with the standard trio-based processing
+                    # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+                    chr,start,end,ref,alt = var_loc.split(":")
+                    if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+                        if len(ref) < len(alt):                                 # an INS
+                            orig_start = start
+                            orig_ref = ref
+                            orig_alt = alt
+                            start = orig_start
+                            ref = '-'
+                            alt = orig_alt[len(orig_ref):]
+                            print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+                        else:                                                   # a DEL
+                            print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                            print line
+                            raise SystemExit
+
+                    new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+                    SHARED_DICT[new_key] = var_info
+                    print "  Keeping %s found in all affected probands, same GT" % (new_key)
+
+
+        print "---------------------"
+
+    NUM_SHARED_G2P_VARS = len(SHARED_DICT)
+    print "Found %s unique and canonical G2P variants SHARED between all %s affected probands in this family" % (NUM_SHARED_G2P_VARS,len(proband_ids))
+    sys.stdout.flush()
+    print ""
+
+
+
+
+
+
+
+
+
+
+
+def read_ped(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        kid_id = data[1]
+        kid_se = int(data[4])
+        if kid_se == 1:		# boy
+            kid_sex =  '46XY'
+        elif kid_se == 2:	# girl
+            kid_sex =  '46XX'
+        else:
+            print "ERROR: proband sex unknown"
+            print line
+            raise SystemExit
+        if kid_id not in KIDS_SEX_DICT:
+            KIDS_SEX_DICT[kid_id] = kid_sex
+        else:
+            print "ERROR: proband sex unknown"
+            print line
+            raise SystemExit
+    in_han.close()
+    print "Found the following affected probands"
+    for k,v in KIDS_SEX_DICT.iteritems():
+        print "    %s: %s" % (k,v)
+    sys.stdout.flush()
+
+
+
+
+
+def read_trans_map(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        old_trans_id = data[0]
+        new_trans_id = data[1]
+        if old_trans_id not in TRANS_DICT:
+            TRANS_DICT[old_trans_id] = new_trans_id
+        else:
+            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 11:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV_aff_probands.py \
+        dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
+        raise SystemExit
+
diff --git a/generate_DEC_IGV_solo_scripts.py b/generate_DEC_IGV_solo_scripts.py
new file mode 100755
index 0000000000000000000000000000000000000000..f9896d9ba0224e8263c154966ece075e007e2c88
--- /dev/null
+++ b/generate_DEC_IGV_solo_scripts.py
@@ -0,0 +1,1110 @@
+#	input:
+#		the family PED file				[${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped]
+#		the VCF folder containing the proband VCF	[${PROJECT_ID}/VCF/  ${PLATE_ID}_${FAMILY_ID}.ready.${CHILD_ID}_${FAMILY_ID}.vcf.gz]
+#		G2P text output for the proband			[${FAMILY_ID}.report.txt]
+#
+#
+#	output:
+#		DECIPHER formatted file for the proband
+#		- all (filtered) G2P variants
+#
+#	checks:
+#		all G2P variants found in the individual VCF
+#
+#       Author: MH
+#       last modified: MARCH 04, 2022
+
+
+
+import sys
+import os
+import csv
+import gzip
+from collections import defaultdict
+
+
+ASSEMBLY = 'GRCh38'
+INTERGENIC = 'No'
+ACCESS = 'No'
+
+
+G2P_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
+G2P_DATA = {}		# key: chr:pos:ref:alt; value: (transcript,gene,GT)
+
+
+NUM_UNIQ_G2P_VARS = 0
+
+
+CHILD_ID = 0
+CHILD_SEX = 0
+DEC_CHILD_SEX = 'unknown'
+
+
+ALL_CHILD_DICT = {}		# key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
+CHILD_INHER_DICT = {}           # key: chr:pos:ref:alt; value: "Unknown" - all variants in a singleton are of unknown inheritance
+
+
+SNAP_FLANK = 25
+
+
+MAP_DICT = {}			# key: family_id (aka decipher_id); value: internal (decipher) ID
+TRANS_DICT = {}			# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
+
+
+FS_THRESH = float(60)
+SOR_THRESH = float(3)
+
+
+
+def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
+
+    # read the decipher to internal ID mapping file
+    read_map_file(dec_map_file)
+
+
+    # read the transcript mapping file
+    read_trans_map(trans_map_file)
+
+    # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
+    read_ped(ped_file)
+
+    if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown'):
+        print "======================================"
+        print "Analyzing singleton CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
+        print "======================================"
+        sys.stdout.flush()
+    else:
+        print "ERROR: problems reading the PED file = %s" % (ped_file)
+        raise SystemExit
+
+
+    # read the G2P output for this family
+    read_G2P(in_g2p_file)
+
+
+    # now read the proband VCFs and record all the variants
+    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
+    read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
+    print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
+    sys.stdout.flush()
+
+    # now go over all child variants and set the inheritance to "Unknown"
+    num_child_vars_assigned = 0
+    for key,v in ALL_CHILD_DICT.iteritems():
+        CHILD_INHER_DICT[key] = 'Unknown'
+    assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0
+    print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
+    sys.stdout.flush()
+
+
+
+    # setup the DECIPHER output file
+    out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID)		################################
+    out_han = open(out_dec_file,'w')
+    out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
+
+
+    # setup the IGV snapshot file
+    out_igv_file = '%s/IGV/%s.solo.snapshot.FLT.txt' % (dec_dir,CHILD_ID)	#################################
+    out_igv_han = open(out_igv_file,'w')
+    out_igv_han.write('new\n')
+    out_igv_han.write('genome hg38\n')
+    out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
+    out_igv_han.write('new\n')
+
+    child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
+    out_igv_han.write('load %s\n' % (child_bam))
+
+    out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
+    out_igv_han.write('\n')
+
+
+    # now read the child VCF and check whether each variant is in the G2P output;
+    # if it is: set its value in the dict to 1
+    # and print it out to the output file
+
+    in_cntr = 0
+    out_cntr = 0
+
+    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
+    in_han = gzip.open(child_vcf_file,'r')
+
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        in_cntr += 1
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+        # extract FS and SOR
+        FS = ''
+        SOR = ''
+        infos = [y.strip() for y in data[7].strip().split(';')]
+        for info in infos:
+            if info.startswith('FS='):
+                tag,FS = info.split('=')
+                FS = float(FS)
+            elif info.startswith('SOR='):
+                tag,SOR = info.split('=')
+                SOR = float(SOR)
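+        # e.g. (hypothetical) INFO = 'AC=1;FS=3.125;SOR=0.527' yields FS = 3.125, SOR = 0.527;
+        # if either tag is absent the corresponding value stays '' and the snapshot is flagged 'NA'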
+
+        VCF_VAR = data[9]
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+        inher_stat = CHILD_INHER_DICT[key]
+
+
+
+
+        ##############################################################
+        # different processing depending on being a SNP, INS, or DEL #
+        ##############################################################
+
+        if len(ref) == len(alt):			# SNP
+            if len(ref) != 1:
+                print "ERROR: MNPs are not supported!"
+                print line
+                raise SystemExit
+
+            key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            if key_to_match in G2P_DICT:
+                G2P_DICT[key_to_match] = 1
+                trans = G2P_DATA[key_to_match][0]
+                gene = G2P_DATA[key_to_match][1]
+                GT = G2P_DATA[key_to_match][2]
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':			# a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':		# a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:				# if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+
+        elif len(ref) > len(alt):			# DEL
+            if len(alt) != 1:
+                print "ERROR with a deletion"
+                print line
+                raise SystemExit
+
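+            # G2P reports a deletion with the shared leading base stripped and '-' as ALT,
+            # e.g. (hypothetical) VCF 'chr7 117559591 CT C' --> G2P key 'chr7:117559592:T:-'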
+            G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
+            if G2P_key_to_match in G2P_DICT:
+                G2P_DICT[G2P_key_to_match] = 1
+                trans = G2P_DATA[G2P_key_to_match][0]
+                gene = G2P_DATA[G2P_key_to_match][1]
+                GT = G2P_DATA[G2P_key_to_match][2]
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':                 # a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':               # a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+
+        elif len(ref) < len(alt):                       # INS
+            if len(ref) != 1:
+                print "ERROR with an insertion"
+                print line
+                raise SystemExit
+
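+            # G2P reports an insertion with the shared leading base stripped and '-' as REF,
+            # e.g. (hypothetical) VCF 'chr7 117559591 C CTT' --> G2P key 'chr7:117559592:-:TT'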
+            G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
+            if G2P_key_to_match in G2P_DICT:
+                G2P_DICT[G2P_key_to_match] = 1
+                trans = G2P_DATA[G2P_key_to_match][0]
+                gene = G2P_DATA[G2P_key_to_match][1]
+                GT = G2P_DATA[G2P_key_to_match][2]
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':                 # a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':               # a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+        else:
+            print "Cannot establish the type of this VCF variant"
+            print line
+            raise SystemExit
+
+    in_han.close()
+    out_han.close()
+    out_igv_han.close()
+
+
+
+
+
+
+    ### check if all G2P variants were found/matched in the proband's VCF
+    found_all_G2P = True
+    for k,v in G2P_DICT.iteritems():
+        if int(v) == 0:
+            print k
+            found_all_G2P = False
+            break
+
+    if found_all_G2P:
+        print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT))
+    else:
+        print "ERROR: Could not find all G2P variants in the probands VCF file"
+        raise SystemExit
+
+
+    ### check if all G2P variants are written out
+    if out_cntr == NUM_UNIQ_G2P_VARS:
+        print "OK: All G2P vars are recorded in the output DECIPHER file"
+    else:
+        print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"
+
+    print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file)
+    print "The batch snapshot file = %s" % (out_igv_file)
+    sys.stdout.flush()
+
+
+
+
+
+
+
+
+
+
+def read_all_VCF_vars(in_vcf_file,THIS_DICT):
+
+    in_han = gzip.open(in_vcf_file,'r')
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+
+        # the VCF was split and normalized upstream, so multiallelic records should not occur
+        if alt.find(',') != -1:
+            print "ERROR: found multiallelic variant"
+            print line
+            raise SystemExit
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+        if key not in THIS_DICT:
+            THIS_DICT[key] = 1
+        else:
+            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
+            raise SystemExit
+
+    in_han.close()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def read_G2P(in_file):
+
+    global NUM_UNIQ_G2P_VARS
+
+    known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het']
+
+    # first, read the G2P variants on canonical transcripts for the singleton
+    CHILD_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+
+        # get the individual_id
+        sam_id = data[0]
+
+        # ignore variants not on canonical transcripts
+        is_canon = data[3]
+        if is_canon != 'is_canonical':
+            continue
+
+        # split the variants based on the gene's OBS model of inheritance
+        inher_model = data[4]
+        tag,OBS_state = inher_model.split('=')
+
+        if OBS_state not in known_OBS_states:
+            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
+            raise SystemExit
+
+        # get the gene name in format ENSG00000165899(C12orf64,OTOGL) or gene-MYT1L(MYT1L)
+        gene_name = data[1]
+
+        # get the transcript name in format ENST00000238647 or gene-MYT1L(MYT1L)
+        transcript = data[2]
+
+
+        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
+        var_list = [y.strip() for y in data[6].split(';')]
+        for v in var_list:
+            v_details = [z.strip() for z in v.split(':')]
+            chr = v_details[0]
+            start = int(v_details[1])
+            end = int(v_details[2])
+            ref = v_details[3]
+            alt = v_details[4]
+            GT = v_details[5]
+            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
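+            # e.g. (hypothetical) v = 'chr2:47403405:47403405:C:T:HET' gives
+            # second_key = 'chr2:47403405:47403405:C:T' with GT = 'HET'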
+
+
+            if sam_id == CHILD_ID:
+                # check for duplication
+                if OBS_state not in CHILD_DICT:
+                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in CHILD_DICT[OBS_state]:
+                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:		# already recorded this variant
+                     		# if we have refseq recorded and this is ensembl --> replace
+                    if not CHILD_DICT[OBS_state][second_key][1].startswith('ENSG'):		# recorded is refseq
+                        if gene_name.startswith('ENSG'):					# this is ensembl
+                            CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) 	# replace
+                        else:									# this is refseq again, ignore
+                            pass
+                    else:									# recorded is ensembl, ignore
+                        pass
+
+            else:
+                print "ERROR: cannot identify the person for this variant"
+                print line
+                raise SystemExit
+
+    in_han.close()
+
+
+    ### print out the number of unique G2P variants in CHILD ###
+    child_mono = 0
+    child_bi = 0
+    child_hem = 0
+    child_het = 0
+
+    if 'monoallelic_autosomal' in CHILD_DICT:
+        child_mono = len(CHILD_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in CHILD_DICT:
+        child_bi = len(CHILD_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in CHILD_DICT:
+        child_hem = len(CHILD_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in CHILD_DICT:
+        child_het = len(CHILD_DICT['monoallelic_X_het'])
+
+    print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID)
+    print "    monoallelic_autosomal: %s" % (child_mono)
+    print "    biallelic_autosomal: %s" % (child_bi)
+    print "    monoallelic_X_hem: %s" % (child_hem)
+    print "    monoallelic_X_het: %s" % (child_het)
+
+
+
+
+    ######################################################################################################
+    ####    Dominant filtering                                                                        ####
+    ####    if the gene has been considered under the dominant model (OBS == monoallelic_autosomal)   ####
+    ####    exclude child variants seen in UNAFFECTED mother/father, regardless of GT                 ####
+    ######################################################################################################
+
+
+    print ""
+    print "===   monoallelic autosomal (DOMINANT) filtering   ==="
+
+
+    for key in CHILD_DICT['monoallelic_autosomal']:	# this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2]
+
+#        if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"):
+#            MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0]
+#            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT)
+#            continue
+#
+#        if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"):
+#            DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0]
+#            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+#            continue
+
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not really happen: the family VCF was split, normalized and left-aligned before being sent to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_ref = ref
+                orig_alt = alt
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the moment, cannot deal with this non-normalized deletion"
+                print key
+                raise SystemExit
+
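+        # e.g. (hypothetical) a non-normalized G2P INS key 'chr5:1000:1000:AC:ACTG' is rewritten
+        # above to ref='-'/alt='TG', so new_key = 'chr5:1000:-:TG'; normalized keys pass through unchanged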
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic)
+
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+    print ""
+
+
+
+
+
+    ##############################################################################################################
+    ####    Recessive filtering                                                                               ####
+    ####    under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene           ####
+    ####    must all be HET in CHILD, GT in parent does not matter                                            ####
+    ####    all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) ####
+    ####    and this parent must be unaffected                                                                ####
+    ####    if all these: then exclude all child variants in this gene                                        ####
+    ##############################################################################################################
+
+
+    print ""
+    print "===   biallelic autosomal (RECESSIVE) filtering   ==="
+
+
+    GENE_KEY_GT = defaultdict(dict)		# for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans)
+
+    # process all variants in biallelic genes in child
+    for key in CHILD_DICT['biallelic_autosomal']:		# this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+        b_GT = CHILD_DICT['biallelic_autosomal'][key][0]
+        b_gene = CHILD_DICT['biallelic_autosomal'][key][1]
+        b_trans = CHILD_DICT['biallelic_autosomal'][key][2]
+        GENE_KEY_GT[b_gene][key] = (b_GT,b_trans)
+
+    # iterate over genes in GENE_KEY_GT
+    for g in GENE_KEY_GT: 			# this is the biallelic gene name
+#        all_HET = True
+#
+#        # iterate over variants in this gene
+#        for kx in GENE_KEY_GT[g]:		# this the second key: chr:start:end:ref:alt
+#            if GENE_KEY_GT[g][kx][0] == 'HOM':     # there is a HOM variant in the child - NO filtering
+#                all_HET = False
+#                break
+#
+#        if all_HET:				# for this gene
+#        # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent
+#        # if yes, filter out and write a message to the log file
+#        # if not, to be added to G2P_DICT and G2P_DATA for further processing
+#
+#            all_from_one_parent = True
+#
+#            # iterate again over the variants in this gene
+#            VAR_SOURCE_LIST = {}		# key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED)
+#
+#            for ky in GENE_KEY_GT[g]:		# this the second key: chr:start:end:ref:alt
+#
+#                this_var_status = 'NONE'
+#
+#                if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"):
+#                    this_var_status = 'MOM'
+#                if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"):
+#                    if this_var_status == 'NONE':
+#                        this_var_status = 'DAD'
+#                    elif this_var_status == 'MOM':
+#                        this_var_status = 'BOTH'
+#
+#                VAR_SOURCE_LIST[ky] = this_var_status
+#
+#            # have collected the parent source for all variants in this gene
+#            tot_num_vars = len(VAR_SOURCE_LIST)
+#            num_mom = 0
+#            num_dad = 0
+#            num_none = 0
+#            for kt,v in VAR_SOURCE_LIST.iteritems():
+#                if v == 'NONE':
+#                    num_none += 1
+#                elif v == 'MOM':
+#                    num_mom += 1
+#                elif v == 'DAD':
+#                    num_dad += 1
+#                elif v == 'BOTH':
+#                    num_mom += 1
+#                    num_dad += 1
+#                else:
+#                    print "ERROR: cannot understand the source parent = %s" % (v)
+#                    raise SystemExit
+#
+#            if num_none > 0:
+#                all_from_one_parent = False
+#            elif num_mom < tot_num_vars and num_dad < tot_num_vars:
+#                all_from_one_parent = False
+#
+#            # if all variants in the child in this gene are found in single unaffected parent - filter out
+#            if all_from_one_parent:
+#                for kz in GENE_KEY_GT[g]:
+#                    print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz])
+#                continue
+#
+#        # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written
+#        # and gone to evaluating the next biallelic gene in the child
+#
+
+#        # if here
+#        # - either not all CHILD variants in this gene are not HET, or
+#        # - not all of them can be traced to a single unaffected parent
+#        # --> add to be processed
+#
+        # we are at gene level here, so iterate over all variants in this gene
+        for kkk in GENE_KEY_GT[g]:                # this is the second key: chr:start:end:ref:alt
+
+            CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0]
+            CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1]
+            CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2]
+
+            # if a non-normalized INDEL in child G2P - must adjust (should not really happen: the family VCF was split, normalized and left-aligned before being sent to VEP+G2P)
+            chr,start,end,ref,alt = kkk.split(":")
+            if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+                if len(ref) < len(alt):                                 # an INS
+                    orig_ref = ref
+                    orig_alt = alt
+                    ref = '-'
+                    alt = orig_alt[len(orig_ref):]
+                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,start,end,orig_ref,orig_alt,chr,start,ref,alt)
+                else:                                                   # a DEL
+                    print "ERROR: At the moment, cannot deal with this non-normalized deletion"
+                    print kkk
+                    raise SystemExit
+
+            new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+            # record the data for CHILD G2P variants (for OBS=biallelic)
+            if new_key not in G2P_DICT:
+                G2P_DICT[new_key] = 0
+            else:
+                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+                # raise SystemExit
+                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+                pass
+
+            # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+            if new_key not in G2P_DATA:
+                G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+            else:
+                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+                # raise SystemExit
+                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+                pass
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+    print ""
+
+
+
+
+
+
+
+
+
+    ####################################################################################################################
+    ####    X-linked filtering                                                                                      ####
+    ####    under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het)                                    ####
+    ####    exclude child HET variants if seen as HOM in the UNAFFECTED father                                      ####
+    ####                                                                                                            ####
+    ####    Note 18/01/2022                                                                                         ####
+    ####    This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het  ####
+    ####    and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance           ####
+    ####    the code below treats x-linked over-dominance like the others (i.e. filters, while it should not)       ####
+    ####    Issue flagged to the G2P plug-in team, awaiting their fix                                               ####
+    ####    for now, manually scan the G2P output for the proband (both for boys and girls)                         ####
+    ####    to check if any variant has been called in PCDH19 or EFNB1                                              ####
+    ####    also, for all variants filtered out from monoallelic_X_het we will print the gene name in the log       ####
+    ####################################################################################################################
+
+
+    print ""
+    print "===   X-linked filtering   ==="
+
+    #######################################
+    ### process monoallelic_X_hem genes ###
+    #######################################
+
+    for key in CHILD_DICT['monoallelic_X_hem']:       # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2]
+
+#        if CHILD_GT == 'HOM':							# do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl)
+#            pass
+#        else:
+#            if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"):
+#                DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0]
+#                if DAD_GT == 'HOM':						# i.e., hemizygous variant in unaffected father
+#                    print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+#                    continue
+#
+        # if a non-normalized INDEL in child G2P - must adjust (should not really happen: the family VCF was split, normalized and left-aligned before being sent to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_ref = ref
+                orig_alt = alt
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the moment, cannot deal with this non-normalized deletion"
+                print key
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem)
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+
+    #######################################
+    ### process monoallelic_X_het genes ###
+    #######################################
+
+    for key in CHILD_DICT['monoallelic_X_het']:       # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2]
+
+#        if CHILD_GT == 'HOM':                                                   # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl)
+#            pass
+#        else:
+#            if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"):
+#                DAD_GT = DAD_DICT['monoallelic_X_het'][key][0]
+#                if DAD_GT == 'HOM':                                             # i.e., x-linked dominant variant in unnafected father
+#                    print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+#                    continue
+#
+        # if a non-normalized INDEL in child G2P - must adjust (should not really happen: the family VCF was split, normalized and left-aligned before being sent to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_ref = ref
+                orig_alt = alt
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the moment, cannot deal with this non-normalized deletion"
+                print key
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic_X_het)
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+
+##.#    ########################################################################
+##.#    ### process x-linked over-dominance  genes - no filtering to be done ###
+##.#    ########################################################################
+#
+##.#    for key in CHILD_DICT['x-linked over-dominance']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+#
+##.#        CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0]
+##.#        CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1]
+##.#        CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2]
+#
+##.#        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+##.#        chr,start,end,ref,alt = key.split(":")
+##.#        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+##.#            if len(ref) < len(alt):                                 # an INS
+##.#                orig_start = start
+##.#                orig_ref = ref
+##.#                orig_alt = alt
+##.#                start = orig_start
+##.#                ref = '-'
+##.#                alt = orig_alt[len(orig_ref):]
+##.#                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+##.#            else:                                                   # a DEL
+##.#                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+##.#                print line
+##.#                raise SystemExit
+#
+##.#        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+#
+##.#        # record the data for CHILD G2P variants (for OBS=x-linked over-dominance)
+##.#        if new_key not in G2P_DICT:
+##.#            G2P_DICT[new_key] = 0
+##.#        else:
+##.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+##.#            # raise SystemExit
+##.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+##.#            pass
+#
+##.#        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+##.#        if new_key not in G2P_DATA:
+##.#            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+##.#        else:
+##.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+##.#            # raise SystemExit
+##.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+##.#            pass
+
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+    print ""
+    print ""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def read_ped(in_file):
+
+    global CHILD_ID
+    global CHILD_SEX
+    global DEC_CHILD_SEX
+
+    CHILD_ID = 0
+    CHILD_SEX = 0
+
+    # no PED sanity checks needed here - they were already done for singletons in trio_setup.sh
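+    # standard 6-column PED line (tab-separated), hypothetical example:
+    #     FAM001    PRO001    0    0    2    2
+    # columns: family_id, individual_id, father_id, mother_id, sex (1=male, 2=female), affected status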
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        CHILD_ID = data[1]
+        CHILD_SEX = int(data[4])
+        if CHILD_SEX == 1:          # boy
+            DEC_CHILD_SEX = '46XY'
+        elif CHILD_SEX == 2:        # girl
+            DEC_CHILD_SEX = '46XX'
+        else:
+            print "ERROR: proband sex unknown"
+            print line
+            raise SystemExit
+
+    in_han.close()
+
+
+
+
+def read_map_file(in_file):
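+    # expects two tab-separated columns per line, hypothetical example:
+    #     DECIPHER_FAM_001    INT_0042
+    # i.e. family/DECIPHER ID --> internal submission ID (stored in MAP_DICT)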
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        dec_id = data[0]
+        int_id = data[1]
+        if dec_id not in MAP_DICT:
+            MAP_DICT[dec_id] = int_id
+        else:
+            print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+def read_trans_map(in_file):
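+    # expects two tab-separated columns per line, hypothetical example:
+    #     ENST00000453960    ENST00000233146
+    # i.e. a transcript ID unknown to DECIPHER --> its chosen replacement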
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        old_trans_id = data[0]
+        new_trans_id = data[1]
+        if old_trans_id not in TRANS_DICT:
+            TRANS_DICT[old_trans_id] = new_trans_id
+        else:
+            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 11:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \
+        dec_map_file,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
+        raise SystemExit
+
diff --git a/generate_DEC_IGV_trio_scripts_from_quad.py b/generate_DEC_IGV_trio_scripts_from_quad.py
new file mode 100755
index 0000000000000000000000000000000000000000..408a1240d6af518a960abef44f9e80227445f128
--- /dev/null
+++ b/generate_DEC_IGV_trio_scripts_from_quad.py
@@ -0,0 +1,1438 @@
+#	input:
+#	  this trio PED file				[${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped]
+#	  individual VCF file for the trio proband	[${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]	????
+#	  G2P text output for the trio			[${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt]
+#	  VASE output					[${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf]
+#
+#
+#	output:
+#		DECIPHER formatted file for the proband
+#		- all G2P variants
+#		- denovo variants marked as such
+#
+#	checks:
+#		all G2P variants found in the individual VCF
+#		all VASE denovo variants found in the individual VCF
+#
+#       Author: MH
+#       last modified: JAN 21, 2022
+
+
+
+import sys
+import os
+import csv
+import gzip
+from collections import defaultdict
+
+
+ASSEMBLY = 'GRCh38'
+INTERGENIC = 'No'
+ACCESS = 'No'
+
+
+G2P_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
+G2P_DATA = {}		# key: chr:pos:ref:alt; value: (transcript,gene,GT)
+VASE_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID
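+# Illustrative shape (hypothetical values): VASE de novo calls are keyed exactly
+# like the raw VCF record (no G2P-style trimming), e.g. VASE_DICT['chr7:117559591:CT:C'] = 0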
+
+
+NUM_UNIQ_G2P_VARS = 0
+NUM_UNIQ_VASE_VARS = 0
+
+
+CHILD_ID = 0
+CHILD_SEX = 0
+DEC_CHILD_SEX = 'unknown'
+
+MOM_ID = 0
+MOM_STAT = 0	# 1 = UNAFF, 2 = AFF
+
+DAD_ID = 0
+DAD_STAT = 0	# 1 = UNAFF, 2 = AFF
+
+
+ALL_CHILD_DICT = {}		# key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
+ALL_MOM_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant
+ALL_DAD_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant
+
+
+CHILD_INHER_DICT = {}		# key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown'
+
+SNAP_FLANK = 25
+
+
+MAP_DICT = {}			# key: family_id (aka decipher_id); value: internal (decipher) ID
+TRANS_DICT = {}			# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
+
+
+FS_THRESH = float(60)
+SOR_THRESH = float(3)
+
+
+
+### call the python script
+#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_trio_from_quad.py \
+#${DECIPHER_ID} \
+#${TRANS_MAP} \
+#${PED_FILE} \
+#${IN_G2P_FILE} \
+#${IN_VASE_FILE} \
+#${FAM_IGV_DIR} \
+#${VCF_DIR} \
+#${PLATE_ID} \
+#${FAMILY_ID} \
+#${DEC_DIR} \
+#${FAM_BAM_DIR} \
+#${KID_ID}
+
+
+
+
+
+
+def go(dec_id,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir,this_kid):
+
+###    # read the decipher to internal ID mapping file
+###    read_map_file(dec_map_file)
+    # set the Internal ID (called dec_id) for this Decipher ID (called fam_id)
+    MAP_DICT[fam_id] = dec_id
+
+    # read the transcript mapping file
+    read_trans_map(trans_map_file)
+
+
+    # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
+    read_ped(ped_file)
+
+    if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (DAD_STAT != 0):
+        print "======================================"
+        print "Analyzing:"
+        print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
+        print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT)
+        print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT)
+        print "======================================"
+        sys.stdout.flush()
+    else:
+        print "ERROR: problems reading the PED file = %s" % (ped_file)
+        raise SystemExit
+
+
+    # read the G2P output for this family
+    read_G2P(in_g2p_file)
+
+
+    # read the VASE output for this family
+    read_VASE(in_vase_file)
+
+
+    # now read the individual VCFs and record all the variants
+    child_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,CHILD_ID)
+    mom_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,MOM_ID)
+    dad_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,DAD_ID)
+
+
+    read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
+    print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
+    sys.stdout.flush()
+
+    read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT)
+    print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID)
+    sys.stdout.flush()
+
+    read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT)
+    print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID)
+    sys.stdout.flush()
+
+
+    # now go over all child variants and set the inheritance
+    num_child_vars_assigned = 0
+    for key,v in ALL_CHILD_DICT.iteritems():
+        if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT):
+            CHILD_INHER_DICT[key] = 'Biparental'
+            num_child_vars_assigned += 1
+        elif key in ALL_MOM_DICT:
+            CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother'
+            num_child_vars_assigned += 1
+        elif key in ALL_DAD_DICT:
+            CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father'
+            num_child_vars_assigned += 1
+        else:
+            CHILD_INHER_DICT[key] = 'Unknown'
+
+    assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0
+
+    print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
+    sys.stdout.flush()
+
+
+
+
+
+
+    # setup the DECIPHER output file
+    out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID)		################################
+    out_han = open(out_dec_file,'w')
+    out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
+
+
+    # setup the IGV snapshot file
+    out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID)	#################################
+    out_igv_han = open(out_igv_file,'w')
+    out_igv_han.write('new\n')
+    out_igv_han.write('genome hg38\n')
+    out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
+    out_igv_han.write('new\n')
+
+    child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
+    mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID)
+    dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID)
+    out_igv_han.write('load %s\n' % (child_bam))
+    out_igv_han.write('load %s\n' % (mom_bam))
+    out_igv_han.write('load %s\n' % (dad_bam))
+
+    out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
+    out_igv_han.write('\n')
+
+
+
+    # now read the child VCF and check whether each variant is in the G2P/VASE output;
+    # if it is: set its value in the dict to 1
+    # and print it out to the output file
+
+
+    in_cntr = 0
+    out_cntr = 0
+
+    child_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,CHILD_ID)
+    in_han = gzip.open(child_vcf_file,'r')
+
+
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        in_cntr += 1
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+        # extract FS and SOR
+        FS = ''
+        SOR = ''
+        infos = [y.strip() for y in data[7].strip().split(';')]
+        for info in infos:
+            if info.startswith('FS='):
+                tag,FS = info.split('=')
+                FS = float(FS)
+            elif info.startswith('SOR='):
+                tag,SOR = info.split('=')
+                SOR = float(SOR)
+
+        VCF_VAR = data[9]
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+        inher_stat = CHILD_INHER_DICT[key]
+
+
+
+        ##############################################################
+        # different processing depending on being a SNP, INS, or DEL #
+        ##############################################################
+
+        if len(ref) == len(alt):			# SNP
+            if len(ref) != 1:
+                print "ERROR: MNPs are not supported!"
+                print line
+                raise SystemExit
+
+            key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            is_denovo = False
+            if key_to_match in VASE_DICT:
+                VASE_DICT[key_to_match] = 1
+                is_denovo = True
+            if key_to_match in G2P_DICT:
+                G2P_DICT[key_to_match] = 1
+                trans = G2P_DATA[key_to_match][0]
+                gene = G2P_DATA[key_to_match][1]
+                GT = G2P_DATA[key_to_match][2]
+
+                if is_denovo:
+                    if inher_stat == 'Unknown':
+                        inher_stat = 'De novo constitutive'
+                    else:
+                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
+                        raise SystemExit
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':			# a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':		# a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:				# if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+
+        elif len(ref) > len(alt):			# DEL
+            if len(alt) != 1:
+                print "ERROR with a deletion"
+                print line
+                raise SystemExit
+
+            G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
+            VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
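+            # note the two encodings differ for the same deletion, e.g. (hypothetical)
+            # VCF 'chr7 117559591 CT C' --> G2P key 'chr7:117559592:T:-' but VASE key 'chr7:117559591:CT:C'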
+            is_denovo = False
+            if VASE_key_to_match in VASE_DICT:
+                VASE_DICT[VASE_key_to_match] = 1
+                is_denovo = True
+            if G2P_key_to_match in G2P_DICT:
+                G2P_DICT[G2P_key_to_match] = 1
+                trans = G2P_DATA[G2P_key_to_match][0]
+                gene = G2P_DATA[G2P_key_to_match][1]
+                GT = G2P_DATA[G2P_key_to_match][2]
+
+                if is_denovo:
+                    if inher_stat == 'Unknown':
+                        inher_stat = 'De novo constitutive'
+                    else:
+                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
+                        raise SystemExit
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':                 # a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':               # a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+
+        elif len(ref) < len(alt):                       # INS
+            if len(ref) != 1:
+                print "ERROR with an insertion"
+                print line
+                raise SystemExit
+
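+            # G2P keys omit the shared VCF anchor base for indels: shift pos by
+            # one, drop the first alt base and use '-' as ref, so a VCF insertion
+            # like chr1:100 A>AT is looked up as chr1:101:-:T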
+            G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
+            VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+            is_denovo = False
+            if VASE_key_to_match in VASE_DICT:
+                VASE_DICT[VASE_key_to_match] = 1
+                is_denovo = True
+            if G2P_key_to_match in G2P_DICT:
+                G2P_DICT[G2P_key_to_match] = 1
+                trans = G2P_DATA[G2P_key_to_match][0]
+                gene = G2P_DATA[G2P_key_to_match][1]
+                GT = G2P_DATA[G2P_key_to_match][2]
+
+                if is_denovo:
+                    if inher_stat == 'Unknown':
+                        inher_stat = 'De novo constitutive'
+                    else:
+                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
+                        raise SystemExit
+
+                if (chr != 'chrX') and (chr != 'chrY'):
+                    if GT == 'HET':
+                        genotype = 'Heterozygous'
+                    elif GT == 'HOM':
+                        genotype = 'Homozygous'
+                    else:
+                        print "ERROR: Cannot understand GT = %s" % (GT)
+                        raise SystemExit
+                elif (chr == 'chrX') or (chr == 'chrY'):
+                    if DEC_CHILD_SEX == '46XX':                 # a girl
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                        elif GT == 'HOM':
+                            genotype = 'Homozygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    elif DEC_CHILD_SEX == '46XY':               # a boy
+                        if GT == 'HET':
+                            genotype = 'Heterozygous'
+                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
+                        elif GT == 'HOM':
+                            genotype = 'Hemizygous'
+                        else:
+                            print "ERROR: Cannot understand GT = %s" % (GT)
+                            raise SystemExit
+                    else:
+                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
+                        raise SystemExit
+                else:
+                    print "ERROR: unknown chr"
+                    print line
+                    raise SystemExit
+
+
+                # write to the DECIPHER file
+                gene_id_idx = gene.find('(')
+                if gene_id_idx == -1:
+                    gene_id_idx = len(gene)
+                gene_id = gene[0:gene_id_idx]
+                int_ID = MAP_DICT[fam_id]
+
+                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
+                    safe_trans = TRANS_DICT[trans]
+                else:
+                    safe_trans = trans
+
+                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
+                out_cntr += 1
+                out_han.write(to_write)
+
+                # write to the IGV file
+                i_s = pos - SNAP_FLANK
+                i_e = pos + SNAP_FLANK
+
+                # check if above FS/SOR_THRESH to include in the snapshot name
+                if (FS == '') or (SOR == ''):
+                    flag = 'NA'
+                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
+                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
+                else:
+                    flag = 'OK'
+                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)
+
+                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
+                out_igv_han.write('sort strand\n')
+                out_igv_han.write('squish\n')
+                out_igv_han.write('snapshot %s\n' % (i_name))
+                out_igv_han.write('\n')
+
+
+        else:
+            print "Cannot establish the type of this VCF variant"
+            print line
+            raise SystemExit
+
+    in_han.close()
+    out_han.close()
+    out_igv_han.close()
+
+
+
+
+
+
+
+    ### check if all G2P and VASE variants were found/matched in the proband's VCF
+    found_all_G2P = True
+    found_all_VASE = True
+
+    for k,v in G2P_DICT.iteritems():
+        if int(v) == 0:
+            print k
+            found_all_G2P = False
+            break
+
+    for k,v in VASE_DICT.iteritems():
+        if int(v) == 0:
+            print k
+            found_all_VASE = False
+            break
+
+    if found_all_G2P:
+        print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT))
+    else:
+        print "ERROR: Could not find all G2P variants in the probands VCF file"
+        raise SystemExit
+
+    if found_all_VASE:
+        print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT))
+    else:
+        print "ERROR: Could not find all VASE variants in the probands VCF file"
+        raise SystemExit
+
+    ### check if all G2P variants are written out
+    if out_cntr == NUM_UNIQ_G2P_VARS:
+        print "OK: All G2P vars are recorded in the output DECIPHER file"
+    else:
+        print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"
+
+
+    print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file)
+    print "The batch snapshot file = %s" % (out_igv_file)
+    sys.stdout.flush()
+
+
+
+
+
+
+
+
+
+
+def read_all_VCF_vars(in_vcf_file,THIS_DICT):
+
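+    # record every variant in the (gzipped, already split and normalized) VCF
+    # under a chr:pos:ref:alt key; a duplicate key means the VCF is malformed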
+    in_han = gzip.open(in_vcf_file,'r')
+    for line in in_han:
+        if line.startswith('#'):
+            continue
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = int(data[1])
+        ref = data[3]
+        alt = data[4]
+
+
+        # the VCF was split and normalized upstream, so there should be no
+        # multiallelic variants left at this point
+        if alt.find(',') != -1:
+            print "ERROR: found multiallelic variant"
+            print line
+            raise SystemExit
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+        if key not in THIS_DICT:
+            THIS_DICT[key] = 1
+        else:
+            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
+            raise SystemExit
+
+    in_han.close()
+
+
+
+
+
+
+
+def read_VASE(in_file):
+
+    global NUM_UNIQ_VASE_VARS
+
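+    # VASE de novo candidates are keyed as chr:pos:ref:alt with value 0; the
+    # value is flipped to 1 when the variant is matched in the proband's VCF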
+    in_han = open(in_file,'r')
+    for line in in_han:
+        # ignore header lines
+        if line.startswith('#'):
+            continue
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        pos = data[1]
+        ref = data[3]
+        alt = data[4]
+
+        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
+
+        if key not in VASE_DICT:
+            VASE_DICT[key] = 0
+        else:
+            print "ERROR: duplicate VASE variant key = %s" % (key)
+            raise SystemExit
+
+    in_han.close()
+    NUM_UNIQ_VASE_VARS = len(VASE_DICT)
+    print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+
+
+
+
+
+def read_G2P(in_file):
+
+    global NUM_UNIQ_G2P_VARS
+
+#.#    known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']
+    known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het']
+
+
+    # first, read the G2P variants on canonical transcripts for each of the family members
+    CHILD_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+    MOM_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+    DAD_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+
+        # get the individual_id
+        sam_id = data[0]
+
+        # ignore variants not on canonical transcripts
+        is_canon = data[3]
+        if is_canon != 'is_canonical':
+            continue
+
+        # split the variants based on the gene's OBS model of inheritance
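+        # data[4] is a tag=value pair whose value is the OBS state
+        # (e.g. '...=biallelic_autosomal'); the tag itself is discarded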
+        inher_model = data[4]
+        _,OBS_state = inher_model.split('=')
+
+        if OBS_state not in known_OBS_states:
+            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
+            raise SystemExit
+
+        # get the gene name in format ENSG00000165899(C12orf64,OTOGL)
+        gene_name = data[1]
+
+        # get the transcript name in format ENST00000238647
+        transcript = data[2]
+
+
+        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
+        var_list = [y.strip() for y in data[6].split(';')]
+        for v in var_list:
+            v_details = [z.strip() for z in v.split(':')]
+            chr = v_details[0]
+            start = int(v_details[1])
+            end = int(v_details[2])
+            ref = v_details[3]
+            alt = v_details[4]
+            GT = v_details[5]
+            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
+
+
+            if sam_id == CHILD_ID:
+                # check for duplication
+                if OBS_state not in CHILD_DICT:
+                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in CHILD_DICT[OBS_state]:
+                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:           # already recorded this variant
+                                # if we have refseq recorded and this is ensembl --> replace
+                    if not CHILD_DICT[OBS_state][second_key][1].startswith('ENSG'):             # recorded is refseq
+                        if gene_name.startswith('ENSG'):                                        # this is ensembl
+                            CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)       # replace
+                        else:                                                                   # this is refseq again, ignore
+                            pass
+                    else:                                                                       # recorded is ensembl, ignore
+                        pass
+
+            elif sam_id == MOM_ID:
+                # check for duplication
+                if OBS_state not in MOM_DICT:
+                    MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in MOM_DICT[OBS_state]:
+                    MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:           # already recorded this variant
+                                # if we have refseq recorded and this is ensembl --> replace
+                    if not MOM_DICT[OBS_state][second_key][1].startswith('ENSG'):               # recorded is refseq
+                        if gene_name.startswith('ENSG'):                                        # this is ensembl
+                            MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript)         # replace
+                        else:                                                                   # this is refseq again, ignore
+                            pass
+                    else:                                                                       # recorded is ensembl, ignore
+                        pass
+
+            elif sam_id == DAD_ID:
+                # check for duplication
+                if OBS_state not in DAD_DICT:
+                    DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                elif second_key not in DAD_DICT[OBS_state]:
+                    DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
+                else:           # already recorded this variant
+                                # if we have refseq recorded and this is ensembl --> replace
+                    if not DAD_DICT[OBS_state][second_key][1].startswith('ENSG'):               # recorded is refseq
+                        if gene_name.startswith('ENSG'):                                        # this is ensembl
+                            DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)         # replace
+                        else:                                                                   # this is refseq again, ignore
+                            pass
+                    else:                                                                       # recorded is ensembl, ignore
+                        pass
+
+            else:
+                print "ERROR: cannot identify the person for this variant"
+                print line
+                raise SystemExit
+
+
+
+    in_han.close()
+
+
+    ### print out the number of unique G2P variants in CHILD ###
+    child_mono = 0
+    child_bi = 0
+    child_hem = 0
+    child_het = 0
+
+    if 'monoallelic_autosomal' in CHILD_DICT:
+        child_mono = len(CHILD_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in CHILD_DICT:
+        child_bi = len(CHILD_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in CHILD_DICT:
+        child_hem = len(CHILD_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in CHILD_DICT:
+        child_het = len(CHILD_DICT['monoallelic_X_het'])
+
+    print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID)
+    print "    monoallelic_autosomal: %s" % (child_mono)
+    print "    biallelic_autosomal: %s" % (child_bi)
+    print "    monoallelic_X_hem: %s" % (child_hem)
+    print "    monoallelic_X_het: %s" % (child_het)
+
+
+    ### print out the number of unique G2P variants in MOM ###
+    mom_mono = 0
+    mom_bi = 0
+    mom_hem = 0
+    mom_het = 0
+
+    if 'monoallelic_autosomal' in MOM_DICT:
+        mom_mono = len(MOM_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in MOM_DICT:
+        mom_bi = len(MOM_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in MOM_DICT:
+        mom_hem = len(MOM_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in MOM_DICT:
+        mom_het = len(MOM_DICT['monoallelic_X_het'])
+
+    print "MOM (%s): number of unique G2P variants on canon transcript in the following OBS states" % (MOM_ID)
+    print "    monoallelic_autosomal: %s" % (mom_mono)
+    print "    biallelic_autosomal: %s" % (mom_bi)
+    print "    monoallelic_X_hem: %s" % (mom_hem)
+    print "    monoallelic_X_het: %s" % (mom_het)
+
+
+
+    ### print out the number of unique G2P variants in DAD ###
+    dad_mono = 0
+    dad_bi = 0
+    dad_hem = 0
+    dad_het = 0
+
+    if 'monoallelic_autosomal' in DAD_DICT:
+        dad_mono = len(DAD_DICT['monoallelic_autosomal'])
+    if 'biallelic_autosomal' in DAD_DICT:
+        dad_bi = len(DAD_DICT['biallelic_autosomal'])
+    if 'monoallelic_X_hem' in DAD_DICT:
+        dad_hem = len(DAD_DICT['monoallelic_X_hem'])
+    if 'monoallelic_X_het' in DAD_DICT:
+        dad_het = len(DAD_DICT['monoallelic_X_het'])
+
+    print "DAD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (DAD_ID)
+    print "    monoallelic_autosomal: %s" % (dad_mono)
+    print "    biallelic_autosomal: %s" % (dad_bi)
+    print "    monoallelic_X_hem: %s" % (dad_hem)
+    print "    monoallelic_X_het: %s" % (dad_het)
+    sys.stdout.flush()
+
+
+
+
+
+
+    ######################################################################################################
+    ####    Dominant filtering                                                                        ####
+    ####    if the gene has been considered under the dominant model (OBS == monoallelic_autosomal)   ####
+    ####    exclude child variants seen in UNAFFECTED mother/father, regardless of GT                 ####
+    ######################################################################################################
+
+
+    print ""
+    print "===   monoallelic autosomal (DOMINANT) filtering   ==="
+
+
+    for key in CHILD_DICT['monoallelic_autosomal']:    # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2]
+
+        if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"):
+            MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0]
+            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT)
+            continue
+
+        if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"):
+            DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0]
+            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+            continue
+
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_start = start
+                orig_ref = ref
+                orig_alt = alt
+                start = orig_start
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                print line
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic)
+
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+    print ""
+
+
+
+
+    ##############################################################################################################
+    ####    Recessive filtering                                                                               ####
+    ####    under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene           ####
+    ####    must all be HET in CHILD, GT in parent does not matter                                            ####
+    ####    all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) ####
+    ####    and this parent must be unaffected                                                                ####
+    ####    if all these: then exclude all child variants in this gene                                        ####
+    ##############################################################################################################
+
+    print ""
+    print "===   biallelic autosomal (RECESSIVE) filtering   ==="
+
+
+    GENE_KEY_GT = defaultdict(dict)             # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans)
+
+    # process all variants in biallelic genes in child
+    for key in CHILD_DICT['biallelic_autosomal']:               # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+        b_GT = CHILD_DICT['biallelic_autosomal'][key][0]
+        b_gene = CHILD_DICT['biallelic_autosomal'][key][1]
+        b_trans = CHILD_DICT['biallelic_autosomal'][key][2]
+        GENE_KEY_GT[b_gene][key] = (b_GT,b_trans)
+
+    # iterate over genes in GENE_KEY_GT
+    for g in GENE_KEY_GT:                       # this is the biallelic gene name
+        all_HET = True
+
+        # iterate over variants in this gene
+        for kx in GENE_KEY_GT[g]:               # this is the second key: chr:start:end:ref:alt
+            if GENE_KEY_GT[g][kx][0] == 'HOM':     # there is a HOM variant in the child - NO filtering
+                all_HET = False
+                break
+
+        if all_HET:                             # for this gene
+        # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent
+        # if yes, filter out and write a message to the log file
+        # if not, to be added to G2P_DICT and G2P_DATA for further processing
+
+            all_from_one_parent = True
+
+            # iterate again over the variants in this gene
+            VAR_SOURCE_LIST = {}                # key: chr:start:end:ref:alt in child; value: 'NONE', or 'MOM'/'DAD'/'BOTH' if carried by that unaffected parent
+
+            for ky in GENE_KEY_GT[g]:           # this is the second key: chr:start:end:ref:alt
+
+                this_var_status = 'NONE'
+
+                if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"):
+                    this_var_status = 'MOM'
+                if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"):
+                    if this_var_status == 'NONE':
+                        this_var_status = 'DAD'
+                    elif this_var_status == 'MOM':
+                        this_var_status = 'BOTH'
+
+                VAR_SOURCE_LIST[ky] = this_var_status
+
+            # have collected the parent source for all variants in this gene
+            tot_num_vars = len(VAR_SOURCE_LIST)
+            num_mom = 0
+            num_dad = 0
+            num_none = 0
+            for kt,v in VAR_SOURCE_LIST.iteritems():
+                if v == 'NONE':
+                    num_none += 1
+                elif v == 'MOM':
+                    num_mom += 1
+                elif v == 'DAD':
+                    num_dad += 1
+                elif v == 'BOTH':
+                    num_mom += 1
+                    num_dad += 1
+                else:
+                    print "ERROR: cannot understand the source parent = %s" % (v)
+                    raise SystemExit
+
+            if num_none > 0:
+                all_from_one_parent = False
+            elif num_mom < tot_num_vars and num_dad < tot_num_vars:
+                all_from_one_parent = False
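+            # a variant not seen in either unaffected parent (possible de novo),
+            # or variants split between the two parents (possible compound het),
+            # mean the gene cannot be attributed to a single unaffected parent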
+
+
+            # if all variants in the child in this gene are found in single unaffected parent - filter out
+            if all_from_one_parent:
+                for kz in GENE_KEY_GT[g]:
+                    print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz])
+                continue
+
+        # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written
+        # and gone to evaluating the next biallelic gene in the child
+
+        # if here
+        # - either not all CHILD variants in this gene are HET, or
+        # - not all of them can be traced to a single unaffected parent
+        # --> add to be processed
+
+        # here we are at gene level, must iterate over all variants in this gene
+        # iterate over variants in this gene
+        for kkk in GENE_KEY_GT[g]:                # this is the second key: chr:start:end:ref:alt
+
+            CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0]
+            CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1]
+            CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2]
+
+            # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+            chr,start,end,ref,alt = kkk.split(":")
+            if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+                if len(ref) < len(alt):                                 # an INS
+                    orig_start = start
+                    orig_ref = ref
+                    orig_alt = alt
+                    start = orig_start
+                    ref = '-'
+                    alt = orig_alt[len(orig_ref):]
+                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+                else:                                                   # a DEL
+                    print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                    print line
+                    raise SystemExit
+
+            new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+            # record the data for CHILD G2P variants (for OBS=biallelic)
+            if new_key not in G2P_DICT:
+                G2P_DICT[new_key] = 0
+            else:
+                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+                # raise SystemExit
+                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+                pass
+
+            # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+            if new_key not in G2P_DATA:
+                G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+            else:
+                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+                # raise SystemExit
+                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+                pass
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+    print ""
+
+
+
+
+
+    ####################################################################################################################
+    ####    X-linked filtering                                                                                      ####
+#.#    ####    under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance)      ####
+    ####    under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het)                                    ####
+    ####    exclude child HET variants if seen as HOM in UNAFFECTED father                                          ####
+    ####                                                                                                            ####
+    ####    Note 18/01/2022                                                                                         ####
+    ####    This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het  ####
+    ####    and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance           ####
+    ####    the code below treats x-linked over-dominance as the others (i.e. filters, while it should not)         ####
+    ####    Issue flagged to G2P plug-in team, awaiting their fix                                                   ####
+    ####    for now manually scan the output of G2P for the proband (both for boys and girls)                       ####
+    ####        to check if any variant has been called in PCDH19 and EFNB1                                         ####
+    ####    also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name    ####
+    ####################################################################################################################
+
+
+    print ""
+    print "===   X-linked filtering   ==="
+
+    #######################################
+    ### process monoallelic_X_hem genes ###
+    #######################################
+
+    for key in CHILD_DICT['monoallelic_X_hem']:       # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2]
+
+        if CHILD_GT == 'HOM':					# do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl)
+            pass
+        else:
+            if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"):
+                DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0]
+                if DAD_GT == 'HOM':                                             # i.e., hemizygous variant in unaffected father
+                    print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+                    continue
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_start = start
+                orig_ref = ref
+                orig_alt = alt
+                start = orig_start
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                print line
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem)
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+
+    #######################################
+    ### process monoallelic_X_het genes ###
+    #######################################
+
+    for key in CHILD_DICT['monoallelic_X_het']:       # this is the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+        CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0]
+        CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1]
+        CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2]
+
+        if CHILD_GT == 'HOM':                                                   # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl)
+            pass
+        else:
+            if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"):
+                DAD_GT = DAD_DICT['monoallelic_X_het'][key][0]
+                if DAD_GT == 'HOM':                                             # i.e., x-linked dominant variant in unaffected father
+                    print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
+                    continue
+
+        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+        chr,start,end,ref,alt = key.split(":")
+        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+            if len(ref) < len(alt):                                 # an INS
+                orig_start = start
+                orig_ref = ref
+                orig_alt = alt
+                start = orig_start
+                ref = '-'
+                alt = orig_alt[len(orig_ref):]
+                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+            else:                                                   # a DEL
+                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+                print line
+                raise SystemExit
+
+        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+        # record the data for CHILD G2P variants (for OBS=monoallelic_X_het)
+        if new_key not in G2P_DICT:
+            G2P_DICT[new_key] = 0
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+        if new_key not in G2P_DATA:
+            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+        else:
+            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+            # raise SystemExit
+            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+            pass
+
+
+
+
+
+#.#    ########################################################################
+#.#    ### process x-linked over-dominance  genes - no filtering to be done ###
+#.#    ########################################################################
+
+#.#    for key in CHILD_DICT['x-linked over-dominance']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
+
+#.#        CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0]
+#.#        CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1]
+#.#        CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2]
+
+#.#        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
+#.#        chr,start,end,ref,alt = key.split(":")
+#.#        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
+#.#            if len(ref) < len(alt):                                 # an INS
+#.#                orig_start = start
+#.#                orig_ref = ref
+#.#                orig_alt = alt
+#.#                start = orig_start
+#.#                ref = '-'
+#.#                alt = orig_alt[len(orig_ref):]
+#.#                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
+#.#            else:                                                   # a DEL
+#.#                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
+#.#                print line
+#.#                raise SystemExit
+
+#.#        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
+
+#.#        # record the data for CHILD G2P variants (for OBS=x-linked over-dominance)
+#.#        if new_key not in G2P_DICT:
+#.#            G2P_DICT[new_key] = 0
+#.#        else:
+#.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+#.#            # raise SystemExit
+#.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+#.#            pass
+
+#.#        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
+#.#        if new_key not in G2P_DATA:
+#.#            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
+#.#        else:
+#.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
+#.#            # raise SystemExit
+#.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
+#.#            pass
+
+
+    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
+    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
+    sys.stdout.flush()
+
+    print ""
+    print ""
+
+
+
+
+
+def read_ped(in_file):
+
+    global CHILD_ID
+    global CHILD_SEX
+    global DEC_CHILD_SEX
+    global MOM_ID
+    global MOM_STAT
+    global DAD_ID
+    global DAD_STAT
+
+    CHILD_ID = 0
+    CHILD_SEX = 0
+    MOM_ID = 0
+    MOM_STAT = 0
+    DAD_ID = 0
+    DAD_STAT = 0
+
+    in_han = open(in_file,'r')
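+    # standard 6-column PED: family_id, individual_id, dad_id, mom_id,
+    # sex (1=male, 2=female), phenotype (1=unaffected, 2=affected)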
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        if data[2] != '0' and data[3] != '0':			# this is the child in the trio
+            if CHILD_ID == 0:
+                CHILD_ID = data[1]
+            else:						# seen another child
+                print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment"
+                raise SystemExit
+
+            if DAD_ID == 0:
+                DAD_ID = data[2]
+            else:
+                if data[2] != DAD_ID:
+                    print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID)
+                    raise SystemExit
+            if MOM_ID == 0:
+                MOM_ID = data[3]
+            else:
+                if data[3] != MOM_ID:
+                    print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID)
+                    raise SystemExit
+
+            CHILD_SEX = int(data[4])
+            if CHILD_SEX == 1:		# boy
+                DEC_CHILD_SEX = '46XY'
+            elif CHILD_SEX == 2:	# girl
+                DEC_CHILD_SEX = '46XX'
+            else:
+                print "ERROR: proband sex unknown"
+                print line
+                raise SystemExit
+
+            if int(data[5]) != 2:
+                print "ERROR: child not affected"
+                print line
+                raise SystemExit
+
+
+        elif int(data[2]) == 0 and int(data[3]) == 0:		# this is a parent record
+            if int(data[4]) == 1:				# this is the dad
+                if int(data[5]) == 1:
+                    DAD_STAT = "UNAFFECTED"
+                elif int(data[5]) == 2:
+                    DAD_STAT = "AFFECTED"
+                else:
+                    print "ERROR: cannot establish the dad's status"
+                    print line
+                    raise SystemExit
+
+                if DAD_ID == 0:
+                    DAD_ID = data[1]
+                else:
+                    if data[1] != DAD_ID:
+                        print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID)
+                        raise SystemExit
+
+            if int(data[4]) == 2:                               # this is the mom
+                if int(data[5]) == 1:
+                    MOM_STAT = "UNAFFECTED"
+                elif int(data[5]) == 2:
+                    MOM_STAT = "AFFECTED"
+                else:
+                    print "ERROR: cannot establish mom's status"
+                    print line
+                    raise SystemExit
+
+                if MOM_ID == 0:
+                    MOM_ID = data[1]
+                else:
+                    if data[1] != MOM_ID:
+                        print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID)
+                        raise SystemExit
+        else:
+            print "ERROR: problematic PED line"
+            print line
+            raise SystemExit
+
+
+
+
+
+
+#def read_map_file(in_file):
+#    in_han = open(in_file,'r')
+#    for line in in_han:
+#        data = [x.strip() for x in line.strip().split('\t')]
+#        dec_id = data[0]
+#        int_id = data[1]
+#        if dec_id not in MAP_DICT:
+#            MAP_DICT[dec_id] = int_id
+#        else:
+#            print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
+#            raise SystemExit
+#    in_han.close()
+
+
+
+
+def read_trans_map(in_file):
+    in_han = open(in_file,'r')
+    for line in in_han:
+        data = [x.strip() for x in line.strip().split('\t')]
+        old_trans_id = data[0]
+        new_trans_id = data[1]
+        if old_trans_id not in TRANS_DICT:
+            TRANS_DICT[old_trans_id] = new_trans_id
+        else:
+            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
+            raise SystemExit
+    in_han.close()
+
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 13:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11],sys.argv[12])
+    else:
+        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \
+        dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir,indi_id_for_this_kid"
+        raise SystemExit
+
diff --git a/generate_aff_sib_PED_from_quad.py b/generate_aff_sib_PED_from_quad.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a49a7f6f2fed48f5f07c20f08cea1302b4c9eb0
--- /dev/null
+++ b/generate_aff_sib_PED_from_quad.py
@@ -0,0 +1,48 @@
+#	given a quad family PED file and the ids of the two kids
+#	generate a PED file for the two siblings, setting their parents to 0 (unknown)
+#
+#       Author: MH
+#       last modified: SEPT 16, 2020
+
+
+
+import sys
+import os
+import csv
+import gzip
+
+
+def go(inout_dir,quad_ped_file,kid_1_id,kid_2_id):
+
+    shared_ped_file = '%s/%s_shared.ped' % (inout_dir,quad_ped_file[:-4])
+    out_han = open(shared_ped_file,'w')
+    out_cntr = 0
+
+    in_han = open("%s/%s" % (inout_dir,quad_ped_file),'r')
+    in_cntr = 0
+
+    for line in in_han:
+        in_cntr += 1
+        data = [x.strip() for x in line.strip().split('\t')]
+        indi_fam_id = data[1]
+        if ((indi_fam_id.startswith(kid_1_id)) or (indi_fam_id.startswith(kid_2_id))):
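+            # zero out the parent ID columns so each sibling is written as a
+            # founder, keeping the family, sex and phenotype fields unchanged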
+            new_line = '%s\t%s\t%s\t%s\t%s\t%s\n' % (data[0],data[1],0,0,data[4],data[5])
+            out_han.write(new_line)
+            out_cntr += 1
+
+    in_han.close()
+    out_han.close()
+    print "Found %s individuals in the quad ped file = %s/%s" % (in_cntr,inout_dir,quad_ped_file)
+    print "Recorded %s individuals in shared ped file = %s" % (out_cntr,shared_ped_file)
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 5:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4])
+    else:
+        print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/NHS_WES_generate_aff_sib_ped.py ${PED_DIR} ${quad_ped_file} ${KID_1_ID} ${KID_2_ID}"
+        raise SystemExit
+
diff --git a/generate_trio_PED_from_quad.py b/generate_trio_PED_from_quad.py
new file mode 100644
index 0000000000000000000000000000000000000000..d12da911abb2f94515ea56c605ad796fcd3fedc7
--- /dev/null
+++ b/generate_trio_PED_from_quad.py
@@ -0,0 +1,47 @@
+#	given a quad family PED file, 1 kid and 2 parents ids
+#	generate a trio PED
+#
+#       Author: MH
+#       last modified: SEPT 15, 2020
+
+
+
+import sys
+import os
+import csv
+import gzip
+
+
+def go(inout_dir,quad_ped_file,kid_id,par_1_id,par_2_id):
+
+    trio_ped_file = '%s/%s_%s.ped' % (inout_dir,quad_ped_file[:-4],kid_id)
+    out_han = open(trio_ped_file,'w')
+    out_cntr = 0
+
+    in_han = open("%s/%s" % (inout_dir,quad_ped_file),'r')
+    in_cntr = 0
+
+    for line in in_han:
+        in_cntr += 1
+        data = [x.strip() for x in line.strip().split('\t')]
+        indi_fam_id = data[1]
+        if ((indi_fam_id.startswith(kid_id)) or (indi_fam_id.startswith(par_1_id)) or (indi_fam_id.startswith(par_2_id))):
+            out_han.write(line)
+            out_cntr += 1
+
+    in_han.close()
+    out_han.close()
+    print "Found %s individuals in the quad ped file = %s/%s" % (in_cntr,inout_dir,quad_ped_file)
+    print "Recorded %s individuals in %s trio ped file = %s" % (out_cntr,kid_id,trio_ped_file)
+
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 6:
+        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
+    else:
+        print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/NHS_WES_generate_trio_ped.py ${PED_DIR} ${quad_ped_file} ${KID_ID} ${PAR_1_ID} ${PAR_2_ID}"
+        raise SystemExit
+
diff --git a/get_cov_output.py b/get_cov_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc81c649c0248d3e5c1116804ebbdd80e8929a7c
--- /dev/null
+++ b/get_cov_output.py
@@ -0,0 +1,122 @@
+#	given 
+#		<gene_set>.<ID>.sample_interval_summary					- e.g. DDG2P.s14-NFE-Twist-NA12878.sample_interval_summary
+#		<gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt		- e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
+#
+#	generate a coverage output file
+#		all the columns from the coverage file for these regions + another column with the number of P/LP (ACP) ClinVar variants in this region
+#		all the columns from the ClinVar file + another column with the percentage of bases covered at least 20x from the coverage file
+#
+#       Author: MH
+#       last modified: SEPT 26, 2019
+
+
+
+import sys
+import os
+import csv
+import gzip
+
+
+
+COV_DICT = {}	#	key: 'chr:start-1:end'; value: coverage column (data[8])
+
+
+
+
+def go(in_gatk_file,in_clin_file,out_cov_file):
+
+    # read in the coverage file
+    read_coverage(in_gatk_file)
+
+    out_han = open(out_cov_file,'w')
+    out_cntr = 0
+    out_han.write('chr\tstart\tend\tgene\tnum ClinVar P/LP (ACP)\tpercent bases covered > 20x\n')
+
+    in_han = open(in_clin_file,'r')
+    in_cntr = 0
+
+    for line in in_han:
+        in_cntr += 1
+        data = [x.strip() for x in line.strip().split('\t')]
+        chr = data[0]
+        start = int(data[1])
+        end = int(data[2])
+        key = '%s:%s:%s' % (chr,start,end)
+        if key in COV_DICT:
+            cov = COV_DICT[key]
+        else:
+            print "ERROR: cannot find the coverage for key = %s" % (key)
+            raise SystemExit
+
+        to_write = ''
+        for d in data:
+            to_write = to_write + '%s\t' % (d)
+        to_write = to_write + '%.1f\n' % (cov)
+        out_han.write(to_write)
+        out_cntr += 1
+
+    in_han.close()
+    out_han.close()
+    print "Read %s intervals from %s" % (in_cntr,in_clin_file)
+    print "Recorder %s intervals in output" % (out_cntr)
+    print ""
+    print "Output coverage file = %s" % (out_cov_file)
+    print ""
+    sys.stdout.flush()
+
+
+
+def read_coverage(in_file):
+
+    in_han = open(in_file,'r')
+    for line in in_han:
+
+        if line.startswith('Target'):
+            if line.endswith('%_above_20\n'):
+                # header line, the last column is percentage of bases covered > 20x
+                continue
+            else:
+                print "ERROR: found header line, but the last column is not bases above 20x"
+                print line
+                raise SystemExit
+
+        data = [x.strip() for x in line.strip().split('\t')]
+        if len(data) != 9:
+            print "ERROR: wrong number of columns in coverage file = %s" % (in_file)
+            print line
+            raise SystemExit
+
+        locus = data[0]
+        if data[8] == 'NaN':
+            cov = float(0.0)
+        else:
+            cov = float(data[8])
+
+        chr,pos = locus.split(':')
+        s,e = pos.split('-')
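+        # GATK loci are 1-based inclusive 'chr:start-end'; shift start down by
+        # one so the key matches the intervals in the ClinVar file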
+        start = int(s)-1
+        end = int(e)
+        key = '%s:%s:%s' % (chr,start,end)
+        if key not in COV_DICT:
+            COV_DICT[key] = cov
+        else:
+            print "ERROR: key from GATK coverage file not unique = %s" % (key)
+            raise SystemExit
+
+    in_han.close()
+    print "Recorded the coverage for %s unique keys" % (len(COV_DICT))
+    sys.stdout.flush()
+
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 4:
+        go(sys.argv[1],sys.argv[2],sys.argv[3])
+    else:
+        print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/generate_coverage_result_file.py \
+                              DDG2P.s14-NFE-Twist-NA12878.sample_interval_summary \
+                              /home/u035/u035/shared/resources/G2P/DDG2P.20180830.ClinVar.20190520.plus15bp.txt \
+                              DDG2P.s14-NFE-Twist-NA12878.COV.txt"
+        raise SystemExit
+
diff --git a/prepare_bcbio_config_old_edge.sh b/prepare_bcbio_config_old_edge.sh
deleted file mode 100755
index ea2971ec29c8bf6c19ba646e7a5e149e3a0f5385..0000000000000000000000000000000000000000
--- a/prepare_bcbio_config_old_edge.sh
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/bin/bash
-#
-# prepare_bcbio_config.sh <config.sh> <project_id> <version> <sample_suffix>
-# 
-# Given a <project_id>.ped file for a set of trios (families) in the 
-# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
-# and <project>.sample_ids.txt in the same folder.
-#
-# Assumes that reads for the samples are in the path
-# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
-# and that no samples other than those with reads are listed in the 
-# PED file. $READS_DIR is specified in the <config.sh> file.
-#
-# Assumes that the sample names in the PED file match those 
-# specifying the read directories with the addition of a specified
-# suffix.
-#
-# All samples must be annotated with sex (1=male, 2=female) in the
-# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
-# column of the PED file.
-#
-# Runs bcbio sample preparation and configuration file generation,
-# assuming the template configuration file is at $BCBIO_TEMPLATE,
-# specified in the <config.sh> file.
-#
-# Assumes bcbio is on the PATH (set in <config.sh>).
-#
-
-CONFIG_SH=$1
-PROJECT_ID=$2
-VERSION=$3
-SAMPLE_SUFFIX=$4
-
-source $CONFIG_SH
-
-#
-# Create the files:
-#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
-#  $PROJECT_ID.$FAMILY_ID.ped - select only the individuals in a given family, 
-#                               prefix <family_id> with <pcr_plate_id> and
-#                               add suffix <family_id> to <individual_id> 
-#
-cd $PARAMS_DIR
-
-# remove DOS newline characters if necessary
-perl -pi -e 's/\r//' $PROJECT_ID.ped
-
-# create reads directory for project
-mkdir -p $READS_DIR/$PROJECT_ID
-
-cat $DOWNLOAD_DIR/$PROJECT_ID/*/*/file_list.tsv | \
-  perl $SCRIPTS/trio_whole_exome_create_parameter_files.pl \
-    --prefix ./$PROJECT_ID \
-    --ped $PROJECT_ID.ped \
-    --suffix $SAMPLE_SUFFIX
-
-for FAMILY_ID in `cat ${PROJECT_ID}.family_ids.txt`
-do
-  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-  COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'`
-
-  for ((i=1; i<=$COUNT; i=i+1))
-  do
-    SAMPLE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 2`
-    SEX=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 5`
-    PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6`
-
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*/*${SAMPLE}*/*_[1,2].fastq.gz`
-    do
-      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-    done
-
-  done
-
-  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-
-  BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2`
-
-  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
-
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/
-
-  COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"`
-
-  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
-
-  rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID}
-
-done
diff --git a/prepare_bcbio_config_santosh.sh b/prepare_bcbio_config_santosh.sh
deleted file mode 100755
index 9f1fd0c16ef7dcede9afca874ab567bf54ea566b..0000000000000000000000000000000000000000
--- a/prepare_bcbio_config_santosh.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-#
-# prepare_bcbio_config_crf.sh <config.sh> <project_id> <version>
-# 
-# Adaptation of prepare_bcbio_config.sh for data from the Santosh set
-#
-# Given a <project_id>.ped file for a set of trios (families) in the 
-# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
-# and <project>.sample_ids.txt in the same folder.
-#
-# Assumes that reads for the samples are in the path
-# $READS_DIR/<project_id>/*.gz,
-# and that no samples other than those with reads are listed in the 
-# PED file. $READS_DIR is specified in the <config.sh> file.
-#
-# Assumes that the sample names in the PED file match those 
-# specifying the read files.
-#
-# All samples must be annotated with sex (1=male, 2=female) in the
-# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
-# column of the PED file.
-#
-# Runs bcbio sample preparation and configuration file generation,
-# assuming the template configuration file is at $BCBIO_TEMPLATE,
-# specified in the <config.sh> file.
-#
-# Assumes bcbio is on the PATH (set in <config.sh>).
-#
-
-CONFIG_SH=$1
-PROJECT_ID=$2
-VERSION=$3
-
-source $CONFIG_SH
-
-#
-# Create the files:
-#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
-#  $PROJECT_ID.$FAMILY_ID.ped - select only the individuals in a given family, 
-#                               prefix <family_id> with <pcr_plate_id> and
-#                               add suffix <family_id> to <individual_id> 
-#
-cd $PARAMS_DIR
-
-# remove DOS newline characters if necessary
-perl -pi -e 's/\r//' $PROJECT_ID.ped
-
-# create reads directory for project
-mkdir -p $READS_DIR/$PROJECT_ID
-
-# generate the family_ids list - makes strong assumption about relative paths!
-cut -f 1 $PROJECT_ID.ped | sort -u > $PROJECT_ID.family_ids.txt
-
-for FAMILY_ID in `cat $PROJECT_ID.family_ids.txt`
-do
-  grep ^$FAMILY_ID $PROJECT_ID.ped > ${PROJECT_ID}_${FAMILY_ID}.ped
-
-  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-  COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'`
-
-  for ((i=1; i<=$COUNT; i=i+1))
-  do
-    SAMPLE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 2`
-    SEX=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 5`
-    PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6`
-
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*${SAMPLE}*.gz`
-    do
-      echo "$FILE,${SAMPLE},$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-    done
-
-  done
-
-  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
-
-  BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2`
-
-  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
-
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/
-
-  COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"`
-
-  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
-
-  rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID}
-
-done
diff --git a/process_NHS_WES_aff_probands.sh b/process_NHS_WES_aff_probands.sh
index 7cb0c832035ed2c93f20bd628b3a1494ebfbe39a..d8a1eaf5867a46883dccd99b5e3bd00fe2fac404 100755
--- a/process_NHS_WES_aff_probands.sh
+++ b/process_NHS_WES_aff_probands.sh
@@ -164,7 +164,6 @@ G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
 mkdir ${G2P_LOG_DIR}
 TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
 HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
-#VCF_KEYS='gnomADe|gnomADg'     # old VEP version 97
 VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
 
 
diff --git a/process_quad.sh b/process_quad.sh
new file mode 100755
index 0000000000000000000000000000000000000000..29cd8931f2b7a6130863eb2649e042055580b9f4
--- /dev/null
+++ b/process_quad.sh
@@ -0,0 +1,1005 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=process_quad
+#SBATCH --output=process_quad.%A_%a.out
+#SBATCH --error=process_quad.%A_%a.err
+
+
+
+# setup PATH
+export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin
+export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh, as previously done by the standard trio-based pipeline ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+# other files to be used
+TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed			# OK
+CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt	# OK
+BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt			# OK
+TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt			# OK
+REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed                        # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1
+
+
+
+### TOOLS ###
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools
+BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip
+TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix
+VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt
+VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase
+GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk							# points to ../share/gatk4-4.2.1.0-0/gatk
+GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
+PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3							# points to python3.6
+PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
+VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-100.4-0/vep
+REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM files (VCF and PED already copied)		i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID                                            e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+echo "FAMILY_ID = ${FAMILY_ID}"		# the family ID of this family with affected probands
+echo "KID_1_ID = ${KID_1_ID}"		# the ID of the first affected proband, in the format INDI_ID
+echo "KID_2_ID = ${KID_2_ID}"           # the ID of the second affected proband
+echo "PAR_1_ID = ${PAR_1_ID}"           # the ID of the first unaffected parent, in the format INDI_ID
+echo "PAR_2_ID = ${PAR_2_ID}"           # the ID of the second unaffected parent
+echo "DECIPHER_ID = ${DECIPHER_ID}"	# the DECIPHER_ID for this family
+
+
+
+
+# change to the LOG folder
+cd ${LOG_DIR}
+
+
+
+#########################################################################################################
+###   check the PED file to make sure it has exactly 2 affected probands and exactly 2 unaffected parents   ###
+#########################################################################################################
+
+echo ""
+echo ""
+echo "checking the ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped file..."
+
+time ${PYTHON2} ${SCRIPTS_DIR}/check_quad_PED.py ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+
+# check if the PED file checks were successful (python exit code = 0), if not exit the bash script
+ret=$?
+if [ $ret -ne 0 ]; then
+     echo "...it appears that the PED file does not corresponds to a quad family !"
+     echo "ERROR: Aborting the analysis"
+     exit
+fi
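+
+# for illustration, a quad PED that passes the check above looks like this
+# (tab-separated, hypothetical IDs; column 5 = sex: 1=male/2=female, column 6 = phenotype: 1=unaffected/2=affected):
+#   FAM1   kid1_FAM1   dad_FAM1   mum_FAM1   1   2
+#   FAM1   kid2_FAM1   dad_FAM1   mum_FAM1   2   2
+#   FAM1   dad_FAM1    0          0          1   1
+#   FAM1   mum_FAM1    0          0          2   1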
+echo ""
+echo ""
+
+
+
+
+##################################################################################################
+##################################################################################################
+###    split the quad into two trios, one for each child and process as a standard TRIO family ###
+##################################################################################################
+##################################################################################################
+
+echo ""
+echo ""
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++ Analysing QUAD family ${FAMILY_ID} as two trios +++"
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo ""
+echo ""
+
+
+# Create an array of kid IDs to loop over
+KID_IDS=()
+KID_IDS+=(${KID_1_ID})
+KID_IDS+=(${KID_2_ID})
+
+
+for KID_ID in ${KID_IDS[@]}; do
+    echo ""
+    echo "++++++++++++++++++++++++++++++++++++"
+    echo "processing trio for child =  $KID_ID"
+    echo "++++++++++++++++++++++++++++++++++++"
+    echo ""
+
+    #############################################################
+    # generate this trio's PED file                             #
+    # named: ${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped #
+    #############################################################
+    quad_ped_file=${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+    time ${PYTHON2} ${SCRIPTS_DIR}/generate_trio_PED_from_quad.py ${PED_DIR} ${quad_ped_file} ${KID_ID} ${PAR_1_ID} ${PAR_2_ID}
+
+
+    #############################################################################
+    # generate this trio's VCF file                                             #
+    # named: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz #
+    #############################################################################
+    quad_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
+    trio_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz
+
+    # tabix index the quad file just in case
+    time ${TABIX} -p vcf ${quad_vcf_file}
+
+    # extract a trio VCF for this kid: -sn selects the three samples, -env drops sites left non-variant after subsetting
+    time java -Xmx24g -jar ${GATK3} -T SelectVariants \
+        -R ${REFERENCE_GENOME} \
+        -V ${quad_vcf_file} \
+        -sn ${KID_ID}_${FAMILY_ID} \
+        -sn ${PAR_1_ID}_${FAMILY_ID} \
+        -sn ${PAR_2_ID}_${FAMILY_ID} \
+        -jdk_deflater \
+        -jdk_inflater \
+        -o ${trio_vcf_file} \
+        -env
+
+
+    ##################################################################################
+    ###        DNU and clean the family VCF                                        ###
+    ### format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz ###
+    ##################################################################################
+
+    echo ""
+    echo ""
+    echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}_${KID_ID}'s VCF file..."
+
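+    # DNU = Decompose, Normalize, Uniq (cf. the .decomp / .norm / .DNU intermediates below):
+    # split multi-allelic sites, left-align/normalize against the reference, then drop duplicate records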
+    time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz
+    time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz
+    time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz
+
+    # remove sites with AC=0
+    time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf
+
+    # reset GT to no-call if num_ALT < num_ALT_THRESH or VAF < VAF_THRESH and GT != 0/0
+    # exclude variants from the blacklist (matching on chr,pos,ref,alt)
+    time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf
+
+    # bgzip and tabix it
+    time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
+    time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
+
+
+    # delete intermediate files (the clean VCF is kept for now - G2P below reads it as IN_FILE)
+    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz
+    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz*
+    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz*
+    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz*
+    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf
+
+
+    # skip deleting the clean VCF here to avoid bgzip's annoying (but harmless) broken-pipe message - the file is used by G2P as IN_FILE and will be deleted last
+    # rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}_${KID_ID}'s VCF file: done"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+
+
+    ###################################################################
+    ###     run G2P for this trio VCF (DD genes)                    ###
+    ###     format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf    ###
+    ###################################################################
+
+    echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}..."
+    echo "Using ${TARGETS}"
+
+    IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf
+    G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR
+    mkdir ${G2P_LOG_DIR}
+    TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt
+    HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.html
+    VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
+
+    time ${VEP} \
+        -i ${IN_FILE} \
+        --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_inter_out.txt \
+        --force_overwrite \
+        --assembly GRCh38 \
+        --fasta ${REFERENCE_GENOME} \
+        --offline \
+        --merged \
+        --use_given_ref \
+        --cache --cache_version 100 \
+        --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
+        --individual all \
+        --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
+        --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
+        --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}: done"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+
+
+
+    ###################################################################
+    ###     run VASE for this trio VCF (de novo)                    ###
+    ###   format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz   ###
+    ###################################################################
+
+    echo "Performing de novo analysis with VASE for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID} ..."
+
+    IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
+    OUT_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.denovo.vcf
+    PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped
+
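+    # a summary of the VASE flags below (interpretation, not authoritative documentation): candidate de novo
+    # calls in the proband need GQ >= 30, DP >= 10 and het allele balance >= 0.3; parental (control) genotypes
+    # are accepted at the laxer GQ >= 15 / DP >= 5, a parent showing >= 1% ALT reads is treated as a possible
+    # carrier (--control_het_ab 0.01), a "reference" parent may carry at most 5% ALT reads
+    # (--control_max_ref_ab 0.05), and alleles above 0.0001 population frequency are excluded (--freq)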
+    time ${VASE} \
+        -i ${IN_FILE} \
+        -o ${OUT_FILE} \
+        --log_progress \
+        --prog_interval 100000 \
+        --freq 0.0001 \
+        --gq 30 --dp 10 \
+        --het_ab 0.3 \
+        --max_alt_alleles 1 \
+        --csq all \
+        --biotypes all \
+        --control_gq 15 --control_dp 5 \
+        --control_het_ab 0.01 \
+        --control_max_ref_ab 0.05 \
+        --de_novo \
+        --ped ${PED_FILE}
+
+
+
+    # do some filtering on the denovo VCFs - exclude variants not on the 24 chromosomes
+    ### NOTE: variants in LCR and telomere/centromere regions are deliberately NOT filtered out --> more variants with "Unknown" status may be classified as "denovo" if there is enough support
+
+    cd ${VASE_DIR}
+
+    # index the denovo VCF
+    time ${GATK4} IndexFeatureFile -I ${OUT_FILE}
+
+    # select only variants on the 24 chromosomes
+    time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf -L /home/u035/u035/shared/resources/24_chr.list --exclude-non-variants
+
+    # sort the VCF by chromosome and position (defensive - the input should already be sorted, and the sort is quick)
+    rm -f ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf
+    grep '^#' ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf > ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf \
+    && grep -v '^#' ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf
+
+    # index the sorted VCF
+    time ${GATK4} IndexFeatureFile -I ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf
+
+    # split multi-allelic sites [by -m -any]
+    # left-alignment and normalization [by adding the -f]
+    file=${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf
+    echo "$file"
+    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Ov -o ${file/.strict.24chr.sort.denovo.vcf/.ready.denovo.vcf} $file
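+    # NB: ${file/old/new} above is bash pattern substitution - for this file it yields, e.g.,
+    #   <prefix>.strict.24chr.sort.denovo.vcf --> <prefix>.ready.denovo.vcf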
+
+    # clean intermediate denovo files
+    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.denovo.vcf*
+    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf*
+    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf*
+
+    # change back to the LOG folder
+    cd ${LOG_DIR}
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "De novo analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}: done"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+
+
+
+    #################################################################################################################################################
+    ###          run coverage for this proband (DD genes)                                                                                         ###
+    ###   format: ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam   ###
+    #################################################################################################################################################
+
+    echo "Performing coverage analysis for PROBAND_ID = ${KID_ID} ...."
+
+    # make sure we are reading the data from the exact batch & plate ID & version N
+    BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam
+    OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15
+
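+    # a note on the flags below: -ct 20 adds a summary of the fraction of target bases covered at >= 20x,
+    # and the base/mapping quality cutoffs (20/20) count only confidently aligned, well-called bases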
+    time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \
+        --omitDepthOutputAtEachBase \
+        --minBaseQuality 20 \
+        --minMappingQuality 20 \
+        -ct 20 \
+        -jdk_deflater \
+        -jdk_inflater \
+        --allow_potentially_misencoded_quality_scores
+
+    echo ""
+    echo ""
+    echo "----------------------------------------------------------------------------------------------------"
+    echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${KID_ID} ..."
+    cat ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.sample_summary | awk '{print $7}'
+    echo "----------------------------------------------------------------------------------------------------"
+
+    # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
+    time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.COV.txt
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "Coverage analysis of PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done    "
+    echo "    Coverage file = ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.COV.txt"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+
+
+    ################################################################################################
+    # check the coverage for each of the recurrent de novo SNPs (padded with 15bp both directions) #
+    ################################################################################################
+    echo "Performing recurrent coverage analysis for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..."
+
+    # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it
+    # set the name of the file containing info about the coverage of the recurrent SNPs
+    REC_OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.REC_SNP_COV.txt
+
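+    # ${REC_SNP} is read as tab-delimited lines of <gene> <chr> <pos> (one recurrent site per line);
+    # note that the stdout of the whole loop is redirected into ${REC_OUT_FILE} at the closing 'done'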
+    while IFS=$'\t' read -ra var; do
+      gene="${var[0]}"
+      chr="${var[1]}"
+      pos="${var[2]}"
+      lo=$(expr $pos - 15)
+      hi=$(expr $pos + 15)
+      reg="$lo-$hi"
+      echo "============================================="
+      echo "$gene : recurrent variant at $chr:$pos"
+      echo "exploring coverage at $chr:$reg"
+
+      echo "---------------------------------------------"
+      echo "precisely at the position"
+      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos"
+
+      echo "---------------------------------------------"
+      echo "average in the +/- 15bp region"
+      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}'
+
+      echo "---------------------------------------------"
+      echo "detailed in the +/- 15bp region"
+      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE}
+    done < ${REC_SNP} > ${REC_OUT_FILE}
+
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done    "
+    echo "    Coverage file = ${REC_OUT_FILE}"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+
+
+
+
+    #############################################################################################
+    ###      generate the DECIPHER file for this proband                                      ###
+    ###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz - the cleaned family VCF  ###
+    ###  ${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf - the VASE file      ###
+    ###  ${TRANS_MAP} - the current transcript mapping file                                   ###
+    #############################################################################################
+
+    echo "Generating the DECIPHER file for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..."
+
+    # first, split the family VCF into individual VCFs
+    # -c1:  minimum allele count (INFO/AC) of sites to be printed
+    # split multi-allelic sites (by -m -any)
+    # left-alignment and normalization (by adding the -f)
+
+    file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
+    echo "splitting $file"
+    for indi in `${BCFTOOLS} query -l $file`; do
+        ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
+        ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
+        rm ${file/.vcf*/.$indi.rough.vcf.gz}
+    done
+
+    # VASE file - already split, left-aligned and normalized
+
+    # create the names of the needed files
+    PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped
+    IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt
+    IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf
+    FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}
+    FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}
+
+
+
+    ## call the python script
+    time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_trio_scripts_from_quad.py \
+    ${DECIPHER_ID} \
+    ${TRANS_MAP} \
+    ${PED_FILE} \
+    ${IN_G2P_FILE} \
+    ${IN_VASE_FILE} \
+    ${FAM_IGV_DIR} \
+    ${VCF_DIR} \
+    ${PLATE_ID} \
+    ${FAMILY_ID} \
+    ${DEC_DIR} \
+    ${FAM_BAM_DIR} \
+    ${KID_ID}
+
+    ## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10
+    echo "...Generating v10 Decipher bulk upload file for proband = ${KID_ID}, family_id = ${FAMILY_ID} ..."
+    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${KID_ID}_${FAMILY_ID}
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "DECIPHER analysis of PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+    #rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+
+
+
+    ##############################################################################################
+    ###  for each variant in the DECIPHER upload file                                          ###
+    ###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
+    ###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
+    ###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
+    ##############################################################################################
+
+    # we have so far: FAMILY_ID and KID_ID
+    echo "...Generating BAMOUT files for the ${FAMILY_ID} family, proband = ${KID_ID} ..."
+    echo "...KID_ID = ${KID_ID}, PAR_1_ID = ${PAR_1_ID}, PAR_2_ID = ${PAR_2_ID} "
+
+    # gather the trio BAM files
+    kid_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam
+    par_1_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PAR_1_ID}_${FAMILY_ID}/${PAR_1_ID}_${FAMILY_ID}-ready.bam
+    par_2_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PAR_2_ID}_${FAMILY_ID}/${PAR_2_ID}_${FAMILY_ID}-ready.bam
+
+    echo "...kid_bam = ${kid_bam}..."
+    echo "...par_1_bam = ${par_1_bam}..."
+    echo "...par_2_bam = ${par_2_bam}..."
+
+    # gather the variants in the DECIPHER file for which to generate bamouts
+    # chr is the second column - need to add the 'chr' prefix
+    # pos is the third column
+    # the first line is a header line, starting with 'Internal reference number or ID'
+    # file called: ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv
+    # and for each run GATK to generate the bamout files
+    # to be stored in ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}
+
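+    # NB: fields are split on every comma (IFS=,), which assumes DECIPHER CSV fields never contain embedded commas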
+    mkdir ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}
+
+    var_file=${DEC_DIR}/${KID_ID}_${FAMILY_ID}_DEC_FLT.csv
+    echo "... reading ${var_file} to generate the bamouts..."
+
+    grep -v '^Internal' ${var_file} |
+    while IFS= read -r line
+    do
+      echo "$line"
+      IFS=, read -ra ary <<<"$line"
+      chr=${ary[1]}
+      pos=${ary[2]}
+      ref=${ary[4]}
+      alt=${ary[5]}
+      echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"
+
+      # generate the bamout file
+      echo "...doing the bamout"
+      echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500"
+      echo "   --active-probability-threshold 0.000 -ploidy 2"
+      echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
+
+      time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \
+      --active-probability-threshold 0.000 -ploidy 2 \
+      --output ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
+    done
+
+
+    #################################################################
+    # write the IGV batch file for this family based on the bamouts #
+    # to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${KID_ID}_${FAMILY_ID}.snapshot.txt #
+    #################################################################
+
+    snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${KID_ID}_${FAMILY_ID}.snapshot.txt
+
+    # check if a previous version exists, if so - delete it
+    if [ -f "${snap_file}" ]; then
+        echo "previous version of ${snap_file} exists --> deleting it"
+        rm ${snap_file}
+    fi
+
+    # write the header for the IGV batch file
+    echo "new" >> ${snap_file}
+    echo "genome hg38" >> ${snap_file}
+    echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_${KID_ID}\"" >> ${snap_file}
+    echo "" >> ${snap_file}
+
+    # now, go again over the variants in the DECIPHER file and generate one snapshot file for all the variants
+    var_file=${DEC_DIR}/${KID_ID}_${FAMILY_ID}_DEC_FLT.csv
+    echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."
+
+    grep -v '^Internal' ${var_file} |
+    while IFS= read -r line
+    do
+      IFS=, read -ra ary <<<"$line"
+      chr=${ary[1]}
+      pos=${ary[2]}
+      ref=${ary[4]}
+      alt=${ary[5]}
+      left=$((${pos}-25))
+      right=$((${pos}+25))
+
+      echo "new" >> ${snap_file}
+      echo "load ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
+      echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}
+
+      echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
+      echo "group SAMPLE" >> ${snap_file}
+      echo "sort base" >> ${snap_file}
+      echo "squish" >> ${snap_file}
+      echo "snapshot bamout_${KID_ID}_${FAMILY_ID}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
+      echo "" >> ${snap_file}
+      echo "" >> ${snap_file}
+
+    done
+
+    echo "Generating of the IGV batch files based on bamouts - done!"
+    echo "snap_file = ${snap_file}"
+
+    echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+    echo "+++   Variant prioritization of trio ${FAMILY_ID} with ${KID_ID} completed   +++"
+    echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+
+
+done
+
+echo ""
+echo ""
+echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++ Analysing QUAD family ${FAMILY_ID} as two trios: DONE! +++"
+echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo ""
+echo ""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#######################################################################################
+#######################################################################################
+###    analyse only the two affected siblings and identify variants shared between them  ###
+#######################################################################################
+#######################################################################################
+
+
+echo ""
+echo ""
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++ Analysing QUAD family ${FAMILY_ID} for shared variants in the two affected siblings +++"
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo ""
+echo ""
+
+
+##########################################################
+# generate the affected siblings PED file                #
+# named: ${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped #
+##########################################################
+quad_ped_file=${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+time ${PYTHON2} ${SCRIPTS_DIR}/generate_aff_sib_PED_from_quad.py ${PED_DIR} ${quad_ped_file} ${KID_1_ID} ${KID_2_ID}
+
+
+
+
+##########################################################################
+# generate the affected siblings VCF file                                #
+# named: ${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz #
+##########################################################################
+quad_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
+shared_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz
+
+# tabix index the quad file just in case
+time ${TABIX} -p vcf ${quad_vcf_file}
+
+# extract a VCF for these two kids
+time java -Xmx24g -jar ${GATK3} -T SelectVariants \
+    -R ${REFERENCE_GENOME} \
+    -V ${quad_vcf_file} \
+    -sn ${KID_1_ID}_${FAMILY_ID} \
+    -sn ${KID_2_ID}_${FAMILY_ID} \
+    -jdk_deflater \
+    -jdk_inflater \
+    -o ${shared_vcf_file} \
+    -env
+
+
+
+
+
+###############################################################################
+###        DNU and clean the siblings VCF                                   ###
+### format: ${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz ###
+###############################################################################
+
+echo ""
+echo ""
+echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}_shared's VCF file..."
+
+
+time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz
+time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz
+time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz
+
+
+# remove sites with AC=0
+time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf
+
+# reset GT to no-call if num_ALT < num_ALT_THRESH or VAF < VAF_THRESH and GT != 0/0
+# exclude variants from the blacklist (matching on chr,pos,ref,alt)
+time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf
+
+# bgzip and tabix it
+time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
+time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
+
+# delete intermediate files
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf
+
+
+# skip deleting the clean VCF here to avoid bgzip's annoying (but harmless) broken-pipe message - the file is used by G2P as IN_FILE and will be deleted last
+# rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}_shared's VCF file: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+################################################################
+###     run G2P for the two affected siblings (DD genes)     ###
+###     format: ${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf    ###
+################################################################
+echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_shared..."
+echo "Using ${TARGETS}"
+
+IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf
+G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR
+mkdir ${G2P_LOG_DIR}
+TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
+HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.report.html
+VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
+
+time ${VEP} \
+    -i ${IN_FILE} \
+    --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_inter_out.txt \
+    --force_overwrite \
+    --assembly GRCh38 \
+    --fasta ${REFERENCE_GENOME} \
+    --offline \
+    --merged \
+    --use_given_ref \
+    --cache --cache_version 100 \
+    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
+    --individual all \
+    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
+    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
+    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_shared: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+
+
+
+
+########################################################
+###    run coverage for each proband (DD genes)      ###
+###    run recurrent SNP coverage for each proband   ###
+###    already did it as part of the trio analysis   ###
+########################################################
+
+
+
+
+
+
+
+###################################################################################
+###      for each proband generate the DECIPHER file                            ###
+###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz - the sibling VCF  ###
+###  ${TRANS_MAP} - the current transcript mapping file                         ###
+###################################################################################
+
+
+echo "Generating the DECIPHER file for all probands in ${FAMILY_ID} ..."
+
+
+# first, split the family VCF into individual VCFs
+# -c1:  minimum allele count (INFO/AC) of sites to be printed
+# split multi-allelic sites (by -m -any)
+# left-alignment and normalization (by adding the -f)
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
+echo "splitting $file"
+for indi in `${BCFTOOLS} query -l $file`; do
+    ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
+    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
+    rm ${file/.vcf*/.$indi.rough.vcf.gz}
+done
+
+
+
+
+# create the names of the needed files
+PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped
+IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
+FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_shared
+FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}
+
+
+
+## call the python script
+time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_aff_sib_scripts_from_quad.py \
+${DECIPHER_ID} \
+${TRANS_MAP} \
+${PED_FILE} \
+${IN_G2P_FILE} \
+${FAM_IGV_DIR} \
+${VCF_DIR} \
+${PLATE_ID} \
+${FAMILY_ID} \
+${DEC_DIR} \
+${FAM_BAM_DIR}
+
+
+
+
+
+#############################################################################################################################
+## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ##
+#############################################################################################################################
+# from the VCF, get all IDs (in the format proband_family_id) - they are all affected probands (already checked at the start)
+# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
+for INDI_ID in `${BCFTOOLS} query -l $file`; do
+    #################################
+    #####    for each proband    ####
+    #################################
+    echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...."
+    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID}_shared
+done
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DECIPHER analysis of all probands in ${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+#rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf
+
+
+
+
+
+##############################################################################################
+###  for each of the affected probands                                                     ###
+###  for each variant in the DECIPHER upload file (only the shared variants)               ###
+###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
+###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
+###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
+##############################################################################################
+
+echo ""
+echo ""
+echo "Generating the BAMOUT files for the ${FAMILY_ID} family ...."
+
+
+# get the IDs of the affected probands to generate the part of the command line which refers to the BAM files to be analysed
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
+# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
+aff_pro_arr=()
+for INDI_ID in `${BCFTOOLS} query -l $file`; do
+    aff_pro_arr+=(${INDI_ID})
+done
+echo "  Found ${#aff_pro_arr[@]} affected probands in ${FAMILY_ID} for which BAMOUTs to be generated"
+
+for key in "${!aff_pro_arr[@]}"; do
+  echo "    ${aff_pro_arr[$key]}";
+#~#  INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam"
+  INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam"
+
+done
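+# INPUT_BAM_LINE now holds one "--input <BAM>" per affected proband, e.g. (hypothetical IDs):
+#   --input .../families/<date>_..._FAM1/kid1_FAM1/kid1_FAM1-ready.bam --input .../families/<date>_..._FAM1/kid2_FAM1/kid2_FAM1-ready.bam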
+
+
+# now go over the shared G2P variants for which to generate bamouts
+# the variants should be identical in the DECIPHER files of all affected individuals, so pick the first individual's file
+# chr is the second column - need to add the 'chr' prefix
+# pos is the third column
+# the first line is a header line, starting with 'Internal reference number or ID'
+# file called: ${DEC_DIR}/<proband_id>_<fam_id>_shared_DEC_FLT.csv
+# and for each run GATK to generate the bamout files
+# to be stored in ${BAMOUT_DIR}/${FAMILY_ID}_shared
+
+
+mkdir ${BAMOUT_DIR}/${FAMILY_ID}_shared
+
+
+var_file=${DEC_DIR}/${aff_pro_arr[0]}_shared_DEC_FLT.csv
+echo "... reading the shared variants in ${var_file} to generate the bamouts for each ..."
+
+
+grep -v '^Internal' ${var_file} |
+while IFS= read -r line
+do
+  echo ""
+  echo ""
+  echo ""
+  echo "$line"
+  IFS=, read -ra ary <<<"$line"
+  chr=${ary[1]}
+  pos=${ary[2]}
+  ref=${ary[4]}
+  alt=${ary[5]}
+  echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"
+
+  # generate the bamout file
+  echo "...doing the bamout"
+  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE}"
+  echo "   -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2"
+  echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
+
+  time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \
+  --active-probability-threshold 0.000 -ploidy 2 \
+  --output ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
+
+done
+
+
+
+##############################################################################################
+## write the IGV batch file for each affected individual in this family based on the bamouts #
+## to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.shared.snapshot.txt #
+## ${PROBAND_ID}_${FAMILY_ID} == ${aff_pro_arr[$key]} #
+##################################################################
+
+for key in "${!aff_pro_arr[@]}"; do
+  echo ""
+  echo ""
+  echo "Generating the IGV batch file for ${aff_pro_arr[$key]}";
+
+#~#  snap_file=/scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.shared.snapshot.txt
+  snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.shared.snapshot.txt
+
+
+  # check if a previous version exists, if so - delete it
+  if [ -f "${snap_file}" ]; then
+    echo "previous version of ${snap_file} exists --> deleting it"
+    rm ${snap_file}
+  fi
+
+
+  # write the header for the IGV batch file
+  echo "new" >> ${snap_file}
+  echo "genome hg38" >> ${snap_file}
+  echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_shared\"" >> ${snap_file}
+  echo "" >> ${snap_file}
+
+
+  # now, go again over the variants in this individual's DECIPHER file and generate one snapshot file for all the variants
+  var_file=${DEC_DIR}/${aff_pro_arr[$key]}_shared_DEC_FLT.csv
+
+  echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."
+
+
+
+  grep -v '^Internal' ${var_file} |
+  while IFS= read -r line
+  do
+    IFS=, read -ra ary <<<"$line"
+    chr=${ary[1]}
+    pos=${ary[2]}
+    ref=${ary[4]}
+    alt=${ary[5]}
+    left=$((${pos}-25))
+    right=$((${pos}+25))
+
+    echo "new" >> ${snap_file}
+    echo "load ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
+    echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}
+
+    echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
+    echo "group SAMPLE" >> ${snap_file}
+    echo "sort base" >> ${snap_file}
+    echo "squish" >> ${snap_file}
+    echo "snapshot bamout_${aff_pro_arr[$key]}_shared_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
+    echo "" >> ${snap_file}
+    echo "" >> ${snap_file}
+
+  done
+
+  echo "Generating of the IGV batch file based on bamouts for ${aff_pro_arr[$key]}- done!"
+  echo "snap_file = ${snap_file}"
+
+done
+
+echo ""
+echo ""
+echo ""
+echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++   Variant prioritization of family ${FAMILY_ID} as affected sib-pair completed   +++"
+echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+
+
+
+
+
+
+
+
+echo ""
+echo ""
+echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++ Analysing QUAD family ${FAMILY_ID} for shared variants in the two affected siblings: DONE! +++"
+echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo ""
+echo ""
+
diff --git a/process_shared.sh b/process_shared.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c3e10666114d645fc06a8eadb2e04d14d89c5598
--- /dev/null
+++ b/process_shared.sh
@@ -0,0 +1,526 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=process_shared
+#SBATCH --output=process_shared.%A_%a.out
+#SBATCH --error=process_shared.%A_%a.err
+
+
+# setup PATH
+export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin
+export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh, as previously done by the standard trio-based pipeline ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+# other files to be used
+TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed			# OK
+CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt	# OK
+BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt			# OK
+TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt			# OK
+REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed                        # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1
+
+
+
+
+
+### TOOLS ###
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools
+BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip
+TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix
+VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt
+VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase
+GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk                                                           # points to ../share/gatk4-4.2.1.0-0/gatk
+GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
+PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3                                                      # points to python3.6
+PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
+VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"    # points to ../share/ensembl-vep-100.4-0/vep
+REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
+
+
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM files (VCF and PED already copied)		i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID						e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+echo "FAMILY_ID = ${FAMILY_ID}"		# the family ID of this family with affected probands
+echo "DECIPHER_ID = ${DECIPHER_ID}"	# the DECIPHER_ID for this family
+
+
+
+
+# change to the LOG folder
+cd ${LOG_DIR}
+
+
+
+
+##########################################################################################
+###   check the PED file to make sure it lists only affected individuals with missing parents   ###
+###   write a file with all affected individuals with no parents                       ###
+###   write a file with all unaffected individuals - should be none                    ###
+##########################################################################################
+
+echo ""
+echo ""
+echo "checking the ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped file..."
+
+time ${PYTHON2} ${SCRIPTS_DIR}/check_shared_PED.py ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+
+# check if the PED file checks were successful (python exit code = 0), if not exit the bash script
+ret=$?
+if [ $ret -ne 0 ]; then
+     echo "...it appears that the PED file does not corresponds to a family consisting of affected probands only!"
+     echo "ERROR: Aborting the analysis"
+     exit
+fi
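+
+# for illustration, a PED that passes the check above lists only affected siblings with
+# both parents coded as missing (tab-separated, hypothetical IDs):
+#   FAM1   sib1_FAM1   0   0   1   2
+#   FAM1   sib2_FAM1   0   0   2   2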
+echo ""
+echo ""
+
+
+
+
+
+########################################################################
+###        DNU and clean the family VCF                              ###
+### format: ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ###
+########################################################################
+
+echo ""
+echo ""
+echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}'s VCF file..."
+
+
+time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz
+time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz
+time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz
+
+
+# remove sites with AC=0
+time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf
+
+# reset GT to no-call if num_ALT < num_ALT_THRESH or VAF < VAF_THRESH and GT != 0/0
+# exclude variants from the blacklist (matching on chr,pos,ref,alt)
+time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+# bgzip and tabix it
+time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+
+# delete intermediate files
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf
+
+
+# to avoid an annoying (but harmless) broken-pipe message from bgzip, skip the next step - the file will be used by G2P as IN_FILE and will be deleted last
+# rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}'s VCF file: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+###########################################################
+###     run G2P for each family VCF (DD genes)          ###
+###     format: ${PLATE_ID}_${FAMILY_ID}.clean.vcf      ###
+###########################################################
+
+
+echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}..."
+echo "Using ${TARGETS}"
+
+
+IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
+mkdir ${G2P_LOG_DIR}
+TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
+HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
+VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
+
+
+time ${VEP} \
+    -i ${IN_FILE} \
+    --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_inter_out.txt \
+    --force_overwrite \
+    --assembly GRCh38 \
+    --fasta ${REFERENCE_GENOME} \
+    --offline \
+    --merged \
+    --use_given_ref \
+    --cache --cache_version 100 \
+    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
+    --individual all \
+    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
+    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
+    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+#################################################################################################################################################
+###          run coverage for each proband (DD genes)                                                                                         ###
+###   format: ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam   ###
+#################################################################################################################################################
+
+
+# from the VCF, get all IDs (in the format proband_family_id), they are all affected probands (already checked at the start)
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+
+# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
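+# e.g. `${BCFTOOLS} query -l $file` prints one sample ID per line, so with two
+# hypothetical affected probands the loop below would iterate over:
+#   123456_FAM0001
+#   123457_FAM0001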
+
+for INDI_ID in `${BCFTOOLS} query -l $file`; do
+
+    #################################
+    #####    for each proband    ####
+    #################################
+
+    echo "Performing coverage analysis for PROBAND_ID = ${INDI_ID} ...."
+
+    # make sure we are reading the data from the exact batch & plate ID
+    BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${INDI_ID}/${INDI_ID}-ready.bam
+    OUT_FILE=${COV_DIR}/${INDI_ID}.DD15
+
+    time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \
+        --omitDepthOutputAtEachBase \
+        --minBaseQuality 20 \
+        --minMappingQuality 20 \
+        -ct 20 \
+        -jdk_deflater \
+        -jdk_inflater \
+        --allow_potentially_misencoded_quality_scores
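+    # GATK3 DepthOfCoverage writes several files sharing the -o prefix; the two used below are
+    #   ${OUT_FILE}.sample_summary           - per-sample coverage totals
+    #   ${OUT_FILE}.sample_interval_summary  - per-interval coverage for each target in ${TARGETS}
+    # with this command line, column 7 of the sample_summary should be the %_bases_above_20
+    # figure printed below (an assumption worth re-checking if the GATK version changes,
+    # since the column layout is version-dependent)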
+
+    echo ""
+    echo ""
+    echo "----------------------------------------------------------------------------------------------------"
+    echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${INDI_ID} ..."
+    cat ${COV_DIR}/${INDI_ID}.DD15.sample_summary | awk '{print $7}'
+    echo "----------------------------------------------------------------------------------------------------"
+
+    # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
+    time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${INDI_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${INDI_ID}.DD15.COV.txt
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "Coverage analysis of PROBAND_ID = ${INDI_ID}: done    "
+    echo "    Coverage file = ${COV_DIR}/${INDI_ID}.DD15.COV.txt"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+
+
+
+    # check the coverage at each of the recurrent de novo SNPs (padded with 15bp in both directions) #
+    echo "Performing recurrent coverage analysis for PROBAND_ID = ${INDI_ID} ..."
+
+    # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it
+    # set the name of the file containing info about the coverage of the recurrent SNPs
+    REC_OUT_FILE=${COV_DIR}/${INDI_ID}.REC_SNP_COV.txt
+
+    while IFS=$'\t' read -ra var; do
+      gene="${var[0]}"
+      chr="${var[1]}"
+      pos="${var[2]}"
+      lo=$(expr $pos - 15)
+      hi=$(expr $pos + 15)
+      reg="$lo-$hi"
+      echo "============================================="
+      echo "$gene : recurrent variant at $chr:$pos"
+      echo "exploring coverage at $chr:$reg"
+
+      echo "---------------------------------------------"
+      echo "precisely at the position"
+      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos"
+
+      echo "---------------------------------------------"
+      echo "average in the +/- 15bp region"
+      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}'
+
+      echo "---------------------------------------------"
+      echo "detailed in the +/- 15bp region"
+      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE}
+    done < ${REC_SNP} > ${REC_OUT_FILE}
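+    # `samtools depth` prints three TAB-separated columns (chrom, pos, depth); -aa reports
+    # absolutely all positions, including those with zero coverage, and -Q 20 ignores reads
+    # with mapping quality < 20. Note that `grep "$pos"` above matches the position string
+    # anywhere on the line, so it may occasionally also print a neighbouring position whose
+    # depth value happens to contain the same digits.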
+
+    echo ""
+    echo ""
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${INDI_ID}: done    "
+    echo "    Coverage file = ${REC_OUT_FILE}"
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+    echo ""
+    echo ""
+
+done
+
+
+
+
+
+###################################################################################
+###      for each proband generate the DECIPHER file                            ###
+###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz - the cleaned family VCF  ###
+###  ${TRANS_MAP} - the current transcript mapping file                         ###
+###################################################################################
+
+
+echo "Generating the DECIPHER file for all probands in ${FAMILY_ID} ..."
+
+
+# first, split the family VCF to individual VCFs
+# -c1:  minimum allele count (INFO/AC) of sites to be printed
+# split multi-allelic sites (by -m -any)
+# left-alignment and normalization (by adding the -f)
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+echo "splitting $file"
+for indi in `${BCFTOOLS} query -l $file`; do
+    ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
+    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
+    rm ${file/.vcf*/.$indi.rough.vcf.gz}
+done
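+# note on the ${file/.vcf*/...} substitutions above: bash replaces the first match of the
+# glob pattern `.vcf*` in $file, e.g. for a hypothetical sample 123456_FAM0001:
+#   ${VCF_DIR}/19285_FAM0001.ready.vcf.gz --> ${VCF_DIR}/19285_FAM0001.ready.123456_FAM0001.rough.vcf.gz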
+
+
+
+# create the names of the needed files
+PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt
+FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}
+FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}
+
+
+
+
+## call the python script
+time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_shared_scripts.py \
+${DECIPHER_ID} \
+${TRANS_MAP} \
+${PED_FILE} \
+${IN_G2P_FILE} \
+${FAM_IGV_DIR} \
+${VCF_DIR} \
+${PLATE_ID} \
+${FAMILY_ID} \
+${DEC_DIR} \
+${FAM_BAM_DIR}
+
+
+
+
+
+#############################################################################################################################
+## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ##
+#############################################################################################################################
+# from the VCF, get all IDs (in the format proband_family_id), they are all affected probands (already checked at the start)
+# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+for INDI_ID in `${BCFTOOLS} query -l $file`; do
+    #################################
+    #####    for each proband    ####
+    #################################
+    echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...."
+    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID}
+done
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DECIPHER analysis of all probands in ${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+
+
+##############################################################################################
+###  for each of the affected probands                                                     ###
+###  for each variant in the DECIPHER upload file (only the shared variants)               ###
+###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
+###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
+###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
+##############################################################################################
+
+echo ""
+echo ""
+echo "Generating the BAMOUT files for the ${FAMILY_ID} family ...."
+
+
+# get the IDs of the affected probands to build the part of the command line which lists the BAM files to be analysed
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
+aff_pro_arr=()
+for INDI_ID in `${BCFTOOLS} query -l $file`; do
+    aff_pro_arr+=(${INDI_ID})
+done
+echo "  Found ${#aff_pro_arr[@]} affected probands in ${FAMILY_ID} for which BAMOUTs to be generated"
+
+for key in "${!aff_pro_arr[@]}"; do
+  echo "    ${aff_pro_arr[$key]}";
+  INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam"
+
+done
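+# with two hypothetical probands, INPUT_BAM_LINE now holds one --input clause per proband, e.g.:
+#   --input .../123456_FAM0001/123456_FAM0001-ready.bam --input .../123457_FAM0001/123457_FAM0001-ready.bam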
+
+
+
+
+# now go over the shared G2P variants for which to generate bamouts
+# the variants should be identical in the DECIPHER files of all affected individuals, so pick the first individual
+# chr is the second column - need to add the 'chr' prefix
+# pos is the third column
+# the first line is a header line, starting with 'Internal reference number or ID'
+# file called: ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv
+# and for each run GATK to generate the bamout files
+# to be stored in ${BAMOUT_DIR}/${FAMILY_ID}
+
+mkdir ${BAMOUT_DIR}/${FAMILY_ID}
+
+var_file=${DEC_DIR}/${aff_pro_arr[0]}_DEC_FLT.csv
+echo "... reading the shared variants in ${var_file} to generate the bamouts for each ..."
+
+
+grep -v '^Internal' ${var_file} |
+while IFS= read -r line
+do
+  echo ""
+  echo ""
+  echo ""
+  echo "$line"
+  IFS=, read -ra ary <<<"$line"
+#  for key in "${!ary[@]}"; do echo "$key ${ary[$key]}"; done
+  chr=${ary[1]}
+  pos=${ary[2]}
+  ref=${ary[4]}
+  alt=${ary[5]}
+  echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"
+
+  # generate the bamout file
+  echo "...doing the bamout"
+  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME}" ${INPUT_BAM_LINE}" -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
+
+  time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \
+  --active-probability-threshold 0.000 -ploidy 2 \
+  --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
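+  # -bamout makes HaplotypeCaller write out the locally reassembled/realigned reads it
+  # actually used when calling around chr${chr}:${pos} (+/- 500bp padding), so the IGV
+  # snapshots show the evidence as the caller saw it rather than the raw alignments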
+
+done
+
+
+
+
+##############################################################################################
+## write the IGV batch file for each affected individual in this family based on the bamouts #
+## to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.snapshot.txt #
+## ${PROBAND_ID}_${FAMILY_ID} == ${aff_pro_arr[$key]} #
+##############################################################################################
+
+for key in "${!aff_pro_arr[@]}"; do
+  echo ""
+  echo ""
+  echo "Generating the IGV batch file for ${aff_pro_arr[$key]}";
+
+  snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.snapshot.txt
+
+  # check if previous version exist, if so - delete it
+  if [ -f "${snap_file}" ]; then
+    echo "previous version of ${snap_file} exist --> deleted"
+    rm ${snap_file}
+  fi
+
+  # write the header for the IGV batch file
+  echo "new" >> ${snap_file}
+  echo "genome hg38" >> ${snap_file}
+  echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}\"" >> ${snap_file}
+  echo "" >> ${snap_file}
+
+
+  # now, go again over the variants in this individual's DECIPHER file and generate one snapshot file for all the variants
+  var_file=${DEC_DIR}/${aff_pro_arr[$key]}_DEC_FLT.csv
+  echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."
+
+  grep -v '^Internal' ${var_file} |
+  while IFS= read -r line
+  do
+    IFS=, read -ra ary <<<"$line"
+    chr=${ary[1]}
+    pos=${ary[2]}
+    ref=${ary[4]}
+    alt=${ary[5]}
+    left=$((${pos}-25))
+    right=$((${pos}+25))
+
+    echo "new" >> ${snap_file}
+    echo "load ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
+    echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}
+
+    echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
+    echo "group SAMPLE" >> ${snap_file}
+    echo "sort base" >> ${snap_file}
+    echo "squish" >> ${snap_file}
+    echo "snapshot bamout_${aff_pro_arr[$key]}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
+    echo "" >> ${snap_file}
+    echo "" >> ${snap_file}
+
+  done
+
+  echo "Generating of the IGV batch file based on bamouts for ${aff_pro_arr[$key]}- done!"
+  echo "snap_file = ${snap_file}"
+
+done
+
+echo ""
+echo ""
+echo ""
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++   Variant prioritization of family ${FAMILY_ID} completed   +++"
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+
+
+
+
diff --git a/process_solo.sh b/process_solo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..29edeca5796c2d270f358a903862c7dd0cbbdf66
--- /dev/null
+++ b/process_solo.sh
@@ -0,0 +1,479 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=process_solo
+#SBATCH --output=process_solo.%A_%a.out
+#SBATCH --error=process_solo.%A_%a.err
+
+
+
+# setup PATH
+export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin
+export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+
+# other files to be used
+FAMILY_IDS=${WORK_DIR}/solo_FAM_IDs.txt								# created by trio_setup.sh
+CHILD_IDS=${WORK_DIR}/solo_PRO_IDs.txt								# created by trio_setup.sh
+TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed			# OK
+CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt	# OK
+BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt			# OK
+TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt			# OK
+REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed			# OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1
+
+
+
+### TOOLS ###
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools
+BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip
+TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix
+VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt
+VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase
+GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk								# points to ../share/gatk4-4.2.1.0-0/gatk
+GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
+PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3							# points to python3.6
+PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
+VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-100.4-0/vep
+REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM files (VCF and PED already copied)		i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID						e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+
+
+
+
+# change to the LOG folder
+cd ${LOG_DIR}
+
+
+
+
+################################
+#####    for each family    ####
+################################
+
+FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1`
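+# `head -n N | tail -n 1` picks line N of the file, i.e. each SLURM array task processes
+# the family listed on line ${SLURM_ARRAY_TASK_ID} of ${FAMILY_IDS}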
+
+
+########################################################################
+###        DNU and clean each family VCF                             ###
+### format: ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ###
+########################################################################
+
+echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}'s VCF file..."
+
+
+time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz
+time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz
+time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz
+
+
+# remove sites with AC=0
+time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf
+
+# reset GT to no-call if num_ALT < num_ALT_THRESH or VAF < VAF_THRESH and GT != 0/0
+# exclude variants from the blacklist (matching on chr,pos,ref,alt)
+time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+# bgzip and tabix it
+time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+
+# delete intermediate files
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf
+
+
+# to avoid an annoying (but harmless) broken-pipe message from bgzip, skip the next step - the file will be used by G2P as IN_FILE and will be deleted last
+# rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}'s VCF file: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+###########################################################
+###     run G2P for each family VCF (DD genes)          ###
+###     format: ${PLATE_ID}_${FAMILY_ID}.clean.vcf      ###
+###########################################################
+
+
+echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}..."
+echo "Using ${TARGETS}"
+
+
+IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
+mkdir ${G2P_LOG_DIR}
+TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
+HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
+VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
+
+time ${VEP} \
+    -i ${IN_FILE} \
+    --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_inter_out.txt \
+    --force_overwrite \
+    --assembly GRCh38 \
+    --fasta ${REFERENCE_GENOME} \
+    --offline \
+    --merged \
+    --use_given_ref \
+    --cache --cache_version 100 \
+    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
+    --individual all \
+    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
+    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
+    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+############################################################################################################################################################################################
+###          run coverage for each proband (DD genes)                                                                                                                                    ###
+###   format: ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}-ready.bam   ###
+############################################################################################################################################################################################
+
+
+#################################
+#####    for each proband    ####
+#################################
+
+PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1`                                # contains only the proband IDs (e.g. 107060)
+
+echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+
+
+# make sure we are reading the data from the exact batch & plate ID
+BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam
+OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15
+
+
+time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \
+  --omitDepthOutputAtEachBase \
+  --minBaseQuality 20 \
+  --minMappingQuality 20 \
+  -ct 20 \
+  -jdk_deflater \
+  -jdk_inflater \
+  --allow_potentially_misencoded_quality_scores
+
+echo ""
+echo ""
+echo "----------------------------------------------------------------------------------------------------"
+echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+cat ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.sample_summary | awk '{print $7}'
+echo "----------------------------------------------------------------------------------------------------"
+
+
+# now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
+time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "Coverage analysis of PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done    "
+echo "    Coverage file = ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+################################################################################################
+# check the coverage at each of the recurrent de novo SNPs (padded with 15bp in both directions) #
+################################################################################################
+echo "Performing recurrent coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+
+# we have identified the name of the proband's BAM file above (BAM_FILE), reuse it
+# set the name of the file containing info about the coverage of the recurrent SNPs
+REC_OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt
+
+while IFS=$'\t' read -ra var; do
+  gene="${var[0]}"
+  chr="${var[1]}"
+  pos="${var[2]}"
+  lo=$(expr $pos - 15)
+  hi=$(expr $pos + 15)
+  reg="$lo-$hi"
+  echo "============================================="
+  echo "$gene : recurrent variant at $chr:$pos"
+  echo "exploring coverage at $chr:$reg"
+
+  echo "---------------------------------------------"
+  echo "precisely at the position"
+  ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos"
+
+  echo "---------------------------------------------"
+  echo "average in the +/- 15bp region"
+  ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}'
+
+  echo "---------------------------------------------"
+  echo "detailed in the +/- 15bp region"
+  ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE}
+done < ${REC_SNP} > ${REC_OUT_FILE}
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done    "
+echo "    Coverage file = ${REC_OUT_FILE}"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+###################################################################################
+###      for each proband generate the DECIPHER file                            ###
+###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz - the cleaned family VCF  ###
+###  ${TRANS_MAP} - the current transcript mapping file                         ###
+###################################################################################
+
+echo "Generating the DECIPHER file for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+
+# first, split the family VCF to individual VCFs
+# -c1:  minimum allele count (INFO/AC) of sites to be printed
+# split multi-allelic sites (by -m -any)
+# left-alignment and normalization (by adding the -f)
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+echo "splitting $file"
+for indi in `${BCFTOOLS} query -l $file`; do
+    ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
+    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
+    rm ${file/.vcf*/.$indi.rough.vcf.gz}
+done
+
+
+
+# create the names of the needed files
+PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+DEC_MAP=${WORK_DIR}/solo_DECIPHER_INTERNAL_IDs.txt
+IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt
+FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}
+FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}
+
+
+## call the python script
+time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_solo_scripts.py \
+${DEC_MAP} \
+${TRANS_MAP} \
+${PED_FILE} \
+${IN_G2P_FILE} \
+${FAM_IGV_DIR} \
+${VCF_DIR} \
+${PLATE_ID} \
+${FAMILY_ID} \
+${DEC_DIR} \
+${FAM_BAM_DIR}
+
+
+## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10
+echo "...Generating v10 Decipher bulk upload file for proband = ${PROBAND_ID}, family_id = ${FAMILY_ID} ..."
+time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${PROBAND_ID}_${FAMILY_ID}
+
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DECIPHER analysis of PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+
+##############################################################################################
+###  for each variant in the DECIPHER upload file                                          ###
+###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
+###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
+###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
+##############################################################################################
+
+
+
+# we have so far
+# FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1`
+# PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1`
+
+echo "...Generating BAMOUT files for the ${FAMILY_ID} family, proband = ${PROBAND_ID} ..."
+
+# identify proband ID from the VCF file
+kid_id=''
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+for indi in `${BCFTOOLS} query -l $file`; do
+  echo "indi = $indi"
+  if [ "${indi}" = "${PROBAND_ID}_${FAMILY_ID}" ]
+  then
+    kid_id=${indi}
+  fi
+done
+echo "...kid_id = ${kid_id} "
+
+
+# gather the proband BAM file
+kid_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam
+echo "...kid_bam = ${kid_bam}..."
+
+
+# gather the variants in the DECIPHER file for which to generate bamouts
+# chr is the second column - need to add the 'chr' prefix
+# pos is the third column
+# the first line is a header line, starting with 'Internal reference number or ID'
+# file called: ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv
+# and for each run GATK to generate the bamout files
+# to be stored in ${BAMOUT_DIR}/${FAMILY_ID}
+
+mkdir ${BAMOUT_DIR}/${FAMILY_ID}
+
+var_file=${DEC_DIR}/${kid_id}_DEC_FLT.csv
+
+echo "... reading ${var_file} to generate the bamouts..."
+
+grep -v '^Internal' ${var_file} |
+while IFS= read -r line
+do
+  echo "$line"
+  IFS=, read -ra ary <<<"$line"
+#  for key in "${!ary[@]}"; do echo "$key ${ary[$key]}"; done
+  chr=${ary[1]}
+  pos=${ary[2]}
+  ref=${ary[4]}
+  alt=${ary[5]}
+  echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"
+
+  # generate the bamout file
+  echo "...doing the bamout"
+  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} -L chr${chr}:${pos} --interval-padding 500 \"
+  echo "   --active-probability-threshold 0.000 -ploidy 2 \"
+  echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
+
+  time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} -L chr${chr}:${pos} \
+  --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 \
+  --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf \
+  -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
+
+done
+
+
+
+#################################################################
+# write the IGV batch file for this family based on the bamouts #
+# to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.solo.snapshot.txt #
+#################################################################
+
+snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.solo.snapshot.txt
+
+# check if previous version exist, if so - delete it
+if [ -f "${snap_file}" ]; then
+    echo "previous version of ${snap_file} exist --> deleted"
+    rm ${snap_file}
+fi
+
+
+# write the header for the IGV batch file
+echo "new" >> ${snap_file}
+echo "genome hg38" >> ${snap_file}
+echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}\"" >> ${snap_file}
+echo "" >> ${snap_file}
+
+
+# now, go again over the variants in the DECIPHER file and generate one snapshot file for all the variants
+var_file=${DEC_DIR}/${kid_id}_DEC_FLT.csv
+echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."
+
+grep -v '^Internal' ${var_file} |
+while IFS= read -r line
+do
+  IFS=, read -ra ary <<<"$line"
+  chr=${ary[1]}
+  pos=${ary[2]}
+  ref=${ary[4]}
+  alt=${ary[5]}
+  left=$((${pos}-25))
+  right=$((${pos}+25))
+
+  echo "new" >> ${snap_file}
+  echo "load ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
+  echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}
+
+  echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
+  echo "group SAMPLE" >> ${snap_file}
+  echo "sort base" >> ${snap_file}
+  echo "squish" >> ${snap_file}
+  echo "snapshot bamout_${PROBAND_ID}_${FAMILY_ID}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
+  echo "" >> ${snap_file}
+  echo "" >> ${snap_file}
+
+done
+
+
+echo "Generating of the IGV batch files based on bamouts - done!"
+echo "snap_file = ${snap_file}"
+
+
+
+
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++   Variant prioritization of family ${FAMILY_ID} completed   +++"
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+
+
diff --git a/process_trio.sh b/process_trio.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b0ae363af34b001a7da780fdbfda5ef9ba586093
--- /dev/null
+++ b/process_trio.sh
@@ -0,0 +1,607 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=process_trio
+#SBATCH --output=process_trio.%A_%a.out
+#SBATCH --error=process_trio.%A_%a.err
+
+
+
+# setup PATH
+export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin
+export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2
+
+
+### folder structure for the downstream analysis - created by trio_setup.sh ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+
+# other files to be used
+FAMILY_IDS=${WORK_DIR}/FAM_IDs.txt								# created by trio_setup.sh
+CHILD_IDS=${WORK_DIR}/PRO_IDs.txt								# created by trio_setup.sh
+TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed			# OK
+CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt	# OK
+BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt			# OK
+TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt			# OK
+REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed			# OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1
+
+
+
+### TOOLS ###
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools
+BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip
+TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix
+VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt
+VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase
+GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk								# points to ../share/gatk4-4.2.1.0-0/gatk
+GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
+PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3							# points to python3.6
+PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
+VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-100.4-0/vep
+REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM files (VCF and PED already copied)		i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID						e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+
+
+
+
+# change to the LOG folder
+cd ${LOG_DIR}
+
+
+
+
+################################
+#####    for each family    ####
+################################
+
+#~#FAMILY_ID=`head -n ${PBS_ARRAY_INDEX} ${FAMILY_IDS} | tail -n 1`
+FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1`
+
+
+
+
+
+########################################################################
+###        DNU and clean each family VCF                             ###
+### format: ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ###
+########################################################################
+
+echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}'s VCF file..."
+
+
+time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz
+time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz
+time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz
+
+
+# remove sites with AC=0
+time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf
+
+# reset GT to no-call if num_ALT < num_ALT_THRESH or VAF < VAF_THRESH and GT != 0/0
+# exclude variants from the blacklist (matching on chr,pos,ref,alt)
+time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+# bgzip and tabix it
+time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+
+# delete intermediate files
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz*
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf
+
+
+# to avoid an annoying (but harmless) broken-pipe message from bgzip, skip the next step - the file will be used by G2P as IN_FILE and will be deleted last
+# rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}'s VCF file: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+
+###########################################################
+###     run G2P for each family VCF (DD genes)          ###
+###     format: ${PLATE_ID}_${FAMILY_ID}.clean.vcf      ###
+###########################################################
+
+
+echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}..."
+echo "Using ${TARGETS}"
+
+
+IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
+mkdir ${G2P_LOG_DIR}
+TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
+HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
+VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
+
+time ${VEP} \
+    -i ${IN_FILE} \
+    --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_inter_out.txt \
+    --force_overwrite \
+    --assembly GRCh38 \
+    --fasta ${REFERENCE_GENOME} \
+    --offline \
+    --merged \
+    --use_given_ref \
+    --cache --cache_version 100 \
+    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
+    --individual all \
+    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
+    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
+    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+
+###########################################################
+###     run VASE for each family VCF (de novo)          ###
+###   format: ${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz     ###
+###########################################################
+
+echo "Performing de novo analysis with VASE for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID} ..."
+
+IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+OUT_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.strict.denovo.vcf
+PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+
+time ${VASE} \
+    -i ${IN_FILE} \
+    -o ${OUT_FILE} \
+    --log_progress \
+    --prog_interval 100000 \
+    --freq 0.0001 \
+    --gq 30 --dp 10 \
+    --het_ab 0.3 \
+    --max_alt_alleles 1 \
+    --csq all \
+    --biotypes all \
+    --control_gq 15 --control_dp 5 \
+    --control_het_ab 0.01 \
+    --control_max_ref_ab 0.05 \
+    --de_novo \
+    --ped ${PED_FILE}
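+# the thresholds above fall into two groups: --gq/--dp/--het_ab constrain the child's
+# candidate de novo genotype, while the (more permissive) --control_* options set how much
+# evidence in the parents is enough to rule a variant out as inherited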
+
+
+# for the cases where one of the parents is also affected, VASE de novo will crash (it cannot find parents for the affected parent),
+# producing no output file, which trips the pipeline downstream
+# to handle this: check if VASE de novo produced an output and, if not (as will be the case here),
+# create an empty, header-only VCF output (== no de novos found)
+
+echo ""
+echo ""
+if [ -f "${OUT_FILE}" ]; then
+    echo "VASE denovo completed successfully"
+else
+    echo "WARNING: VASE denovo has not produced an output (e.g. affected parent), generate an empty VCF one (with headers)"
+    ${TABIX} -H ${IN_FILE} > ${OUT_FILE}
+fi
+echo ""
+echo ""
+
+
+
+
+# do some filtering on the denovo VCFs - exclude variants not on the 24 chr, as well as variants in LCR and telomere/centromere regions
+### actually, ignore the filtering of variants in LCR and telomere/centromere regions --> more variants with "Unknown" status may be classified as "denovo" if there is enough support
+cd ${VASE_DIR}
+
+# index the denovo VCF
+time ${GATK4} IndexFeatureFile -I ${OUT_FILE}
+
+# select only variants on the 24 chromosomes
+time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf -L /home/u035/u035/shared/resources/24_chr.list --exclude-non-variants
+
+# sort the VCF (maybe not needed, but just in case; it is quick)
+if [ -f "${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf" ]; then
+    rm ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
+fi
+grep '^#' ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf > ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf \
+&& grep -v '^#' ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
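+# the pipeline above keeps the header lines (grep '^#') and sorts the body with LC_ALL=C
+# for reproducible byte-wise ordering: -k1,1V version-sorts the chromosome names
+# (chr1 < chr2 < ... < chr10 < chr11 ...) and -k2,2n sorts positions numerically within each chromosome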
+
+# index the sorted VCF
+time ${GATK4} IndexFeatureFile -I ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
+
+# split multi-allelic sites [by -m -any]
+# left-alignment and normalization [by adding the -f]
+file=${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
+echo "$file"
+${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Ov -o ${file/.strict.24chr.sort.denovo.vcf/.ready.denovo.vcf} $file
+
+
+# clean intermediate denovo files
+rm ${PLATE_ID}_${FAMILY_ID}.strict.denovo.vcf*
+rm ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf*
+rm ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf*
+
+
+# change back to the LOG folder
+cd ${LOG_DIR}
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "De novo analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+############################################################################################################################################################################################
+###          run coverage for each proband (DD genes)                                                                                                                                    ###
+###   format: ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}-ready.bam   ###
+############################################################################################################################################################################################
+
+
+#################################
+#####    for each proband    ####
+#################################
+
+#~#PROBAND_ID=`head -n ${PBS_ARRAY_INDEX} ${CHILD_IDS} | tail -n 1`                                # contains only the proband IDs (e.g. 107060)
+PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1`                                # contains only the proband IDs (e.g. 107060)
+
+echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+
+
+# make sure we are reading the data from the exact batch & plate ID
+#~#BAM_FILE=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam
+BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam
+OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15
+
+
+time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \
+  --omitDepthOutputAtEachBase \
+  --minBaseQuality 20 \
+  --minMappingQuality 20 \
+  -ct 20 \
+  -jdk_deflater \
+  -jdk_inflater \
+  --allow_potentially_misencoded_quality_scores
+
+echo ""
+echo ""
+echo "----------------------------------------------------------------------------------------------------"
+echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+cat ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.sample_summary | awk '{print $7}'
+echo "----------------------------------------------------------------------------------------------------"
+
+
+# now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
+time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "Coverage analysis of PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done    "
+echo "    Coverage file = ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+################################################################################################
+# check the coverage at each of the recurrent de novo SNPs (padded with 15bp in both directions) #
+################################################################################################
+echo "Performing recurrent coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+
+# we have identified the name of the proband's BAM file above (BAM_FILE), reuse it
+# set the name of the file containing info about the coverage of the recurrent SNPs
+REC_OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt
+
+while IFS=$'\t' read -ra var; do
+  gene="${var[0]}"
+  chr="${var[1]}"
+  pos="${var[2]}"
+  lo=$(expr $pos - 15)
+  hi=$(expr $pos + 15)
+  reg="$lo-$hi"
+  echo "============================================="
+  echo "$gene : recurrent variant at $chr:$pos"
+  echo "exploring coverage at $chr:$reg"
+
+  echo "---------------------------------------------"
+  echo "precisely at the position"
+  ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos"
+
+  echo "---------------------------------------------"
+  echo "average in the +/- 15bp region"
+  ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}'
+
+  echo "---------------------------------------------"
+  echo "detailed in the +/- 15bp region"
+  ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE}
+done < ${REC_SNP} > ${REC_OUT_FILE}
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done    "
+echo "    Coverage file = ${REC_OUT_FILE}"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+
+
+
+
+###################################################################################
+###      for each proband generate the DECIPHER file                            ###
+###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz - the cleaned family VCF  ###
+###  ${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf - the VASE file      ###
+###  ${TRANS_MAP} - the current transcript mapping file                         ###
+###################################################################################
+
+echo "Generating the DECIPHER file for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..."
+
+# first, split the family VCF to individual VCFs
+# -c1:  minimum allele count (INFO/AC) of sites to be printed
+# split multi-allelic sites (by -m -any)
+# left-alignment and normalization (by adding the -f)
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+echo "splitting $file"
+for indi in `${BCFTOOLS} query -l $file`; do
+    ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
+    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
+    rm ${file/.vcf*/.$indi.rough.vcf.gz}
+done
+
+
+# VASE file - already split, left-aligned and normalized
+
+
+# create the names of the needed files
+PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
+DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt
+IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt
+IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf
+FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}
+#~#FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}
+FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}
+
+
+## call the python script
+##time ${PYTHON2} /home/u035/u035/shared/temp/generate_DEC_IGV_scripts.py \
+time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_scripts.py \
+${DEC_MAP} \
+${TRANS_MAP} \
+${PED_FILE} \
+${IN_G2P_FILE} \
+${IN_VASE_FILE} \
+${FAM_IGV_DIR} \
+${VCF_DIR} \
+${PLATE_ID} \
+${FAMILY_ID} \
+${DEC_DIR} \
+${FAM_BAM_DIR}
+
+
+## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10
+echo "...Generating v10 Decipher bulk upload file for proband = ${PROBAND_ID}, family_id = ${FAMILY_ID} ..."
+time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${PROBAND_ID}_${FAMILY_ID}
+
+
+
+echo ""
+echo ""
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo "DECIPHER analysis of PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done"
+echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+echo ""
+echo ""
+
+
+rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+
+
+
+##############################################################################################
+###  for each variant in the DECIPHER upload file                                          ###
+###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
+###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
+###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
+##############################################################################################
+
+
+
+# we have so far
+# FAMILY_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${FAMILY_IDS} | tail -n 1`
+# PROBAND_ID=`head -n ${SLURM_ARRAY_TASK_ID} ${CHILD_IDS} | tail -n 1`
+
+echo "...Generating BAMOUT files for the ${FAMILY_ID} family, proband = ${PROBAND_ID} ..."
+
+# identify parent IDs from the trio VCF file
+kid_id=''
+par_1_id=''
+par_2_id=''
+
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+for indi in `${BCFTOOLS} query -l $file`; do
+  echo "indi = $indi"
+  if [ "${indi}" = "${PROBAND_ID}_${FAMILY_ID}" ]
+  then
+    kid_id=${indi}
+  elif [ "${par_1_id}" = "" ]
+  then
+    par_1_id=${indi}
+  else
+    par_2_id=${indi}
+  fi
+done
+
+echo "...kid_id = ${kid_id}, par_1_id = ${par_1_id}, par_2_id = ${par_2_id} "
+
+
+
+# gather the trio BAM files
+#~#kid_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam
+#~#par_1_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam
+#~#par_2_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam
+kid_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam
+par_1_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam
+par_2_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam
+
+echo "...kid_bam = ${kid_bam}..."
+echo "...par_1_bam = ${par_1_bam}..."
+echo "...par_2_bam = ${par_2_bam}..."
+
+
+# gather the variants in the DECIPHER file for which to generate bamouts
+# chr is the second column - need to add the 'chr' prefix
+# pos is the third column
+# the first line is a header line, starting with 'Internal reference number or ID'
+# the file is called ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv
+# for each variant, run GATK to generate the bamout files,
+# to be stored in ${BAMOUT_DIR}/${FAMILY_ID}
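+#
+# illustrative parse (hypothetical CSV line; bash arrays are 0-based, so ary[1]
+# is the second column):
+#   <internal id>,1,12345678,<4th column>,A,G,...
+#   --> chr=1, pos=12345678, ref=A, alt=G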
+
+mkdir -p ${BAMOUT_DIR}/${FAMILY_ID}
+
+var_file=${DEC_DIR}/${kid_id}_DEC_FLT.csv
+
+echo "... reading ${var_file} to generate the bamouts..."
+
+grep -v '^Internal' ${var_file} |
+while IFS= read -r line
+do
+  echo "$line"
+  IFS=, read -ra ary <<<"$line"
+#  for key in "${!ary[@]}"; do echo "$key ${ary[$key]}"; done
+  chr=${ary[1]}
+  pos=${ary[2]}
+  ref=${ary[4]}
+  alt=${ary[5]}
+  echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"
+
+  # generate the bamout file
+  echo "...doing the bamout"
+  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \"
+  echo "   --active-probability-threshold 0.000 -ploidy 2 \"
+  echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
+
+  time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} \
+  --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 \
+  --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
+
+done
+
+
+
+
+#################################################################
+# write the IGV batch file for this family based on the bamouts #
+# to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.snapshot.txt #
+#################################################################
+
+
+
+snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.snapshot.txt
+
+# check if a previous version exists - if so, delete it
+if [ -f "${snap_file}" ]; then
+    echo "previous version of ${snap_file} exists --> deleting it"
+    rm ${snap_file}
+fi
+
+
+# write the header for the IGV batch file
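+# (IGV batch syntax: 'new' clears the current session, 'genome hg38' selects the
+#  reference, and 'snapshotDirectory' sets where snapshot PNGs are written)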
+echo "new" >> ${snap_file}
+echo "genome hg38" >> ${snap_file}
+echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}\"" >> ${snap_file}
+echo "" >> ${snap_file}
+
+
+
+# now, iterate over the variants in the DECIPHER file again and write a single batch file covering all of them
+var_file=${DEC_DIR}/${kid_id}_DEC_FLT.csv
+echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."
+
+grep -v '^Internal' ${var_file} |
+while IFS= read -r line
+do
+  IFS=, read -ra ary <<<"$line"
+  chr=${ary[1]}
+  pos=${ary[2]}
+  ref=${ary[4]}
+  alt=${ary[5]}
+  left=$((${pos}-25))
+  right=$((${pos}+25))
+
+  echo "new" >> ${snap_file}
+  echo "load ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
+  echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}
+
+  echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
+  echo "group SAMPLE" >> ${snap_file}
+  echo "sort base" >> ${snap_file}
+  echo "squish" >> ${snap_file}
+  echo "snapshot bamout_${PROBAND_ID}_${FAMILY_ID}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
+  echo "" >> ${snap_file}
+  echo "" >> ${snap_file}
+
+done
+
+echo "Generating of the IGV batch files based on bamouts - done!"
+echo "snap_file = ${snap_file}"
+
+
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "+++   Variant prioritization of family ${FAMILY_ID} completed   +++"
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+
diff --git a/reanalysis_preparation.sh b/reanalysis_preparation.sh
new file mode 100644
index 0000000000000000000000000000000000000000..217cfa671688c01adc053a8b12f434a6ff78349e
--- /dev/null
+++ b/reanalysis_preparation.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+#
+# Create the reanalysis folder, named e.g. results/20220418_reanalysis, and within it
+# create one tab-delimited text file per family type with the columns:
+# project_version  plate_id  family_id (excluding the plate id)
+#
+# Run this script in the reanalysis folder. Ensure that the environment variable project_id is set to
+# the same name as the reanalysis folder.
+#
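+# Illustrative params line (hypothetical ids), tab-delimited:
+#   12345_v1  19285  123456
+#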
+
+# Create symlinks for the families that don't require any re-processing
+for file in quad.txt shared_affected.txt singleton.txt trio_affected_parent.txt trio.txt
+do
+    count=`wc -l params/$file | awk '{ print $1 }'`
+
+    for ((i = 1; i <= $count; i = i + 1))
+    do
+	project=`head -n $i params/$file | tail -n 1 | cut -f 1`
+	family=`head -n $i params/$file | tail -n 1 | cut -f 3`
+
+	cd families
+	family_dir=`ls ../../${project}/families | grep $family`
+	ln -s ../../${project}/families/$family_dir $family_dir
+
+	cd ../params
+	ped=`ls ../../${project}/params/*.ped | grep $family`
+	ln -s $ped `basename $ped`
+
+	cd ..
+    done
+done
+
+# For the singletons from duos with unaffected parents that need to be re-generated,
+# prepare appropriate PED files in the analysis/params folder to begin analysis.
+cp singleton_from_duo.txt ../../analysis/params
+cd ../../analysis/params
+
+count=`wc -l singleton_from_duo.txt | awk '{ print $1 }'`
+file=singleton_from_duo.txt
+for ((i = 1; i <= $count; i = i + 1))
+do
+    project=`head -n $i $file | tail -n 1 | cut -f 1`
+    family=`head -n $i $file | tail -n 1 | cut -f 3`
+
+    ped=`ls ../../results/${project}/params/*.ped | grep $family`
+    # keep only the affected individual(s) (phenotype 2 in column 6) and zero out the parent ids
+    grep '2$' $ped | awk '{ print $1 "\t" $2 "\t0\t0\t" $5 "\t" $6 }' > `basename $ped`
+done
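+
+# Illustrative effect of the rewrite above (hypothetical ids): an affected child line
+#   19285_123456  ABC123_123456  DAD01_123456  MUM01_123456  1  2
+# becomes a singleton line with the parent ids zeroed:
+#   19285_123456  ABC123_123456  0  0  1  2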
+
+# Create a unique family ids list
+cat *.ped | cut -f 1 | sort -u > $project_id.family_ids.txt
diff --git a/submit_bcbio_trio_wes.sh b/submit_bcbio_trio_wes.sh
deleted file mode 100755
index d07d9eb2e101ec8727d043f892f93ba46582ea8b..0000000000000000000000000000000000000000
--- a/submit_bcbio_trio_wes.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-#PBS -l walltime=48:00:00
-#PBS -l ncpus=16,mem=8gb
-#PBS -q sgp
-#PBS -N trio_whole_exome_bcbio
-#PBS -j oe
-
-# enable running singletons
-if [ -z $PBS_ARRAY_INDEX ]
-then
-  if [ -z $INDEX ]
-  then
-    export PBS_ARRAY_INDEX=1
-  else
-    export PBS_ARRAY_INDEX=$INDEX
-  fi
-fi
-
-# Expects environment variables to be set
-# PROJECT_ID - e.g. 12345_LastnameFirstname
-# CONFIG_SH - absolute path to configuration script setting environment variables
-# VERSION - e.g. v1, v2
-
-source $CONFIG_SH
-
-FAMILY_ID=`head -n $PBS_ARRAY_INDEX $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
-
-CONFIG_FILE=$CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
-mkdir -p $WORK_DIR/$FAMILY_ID
-cd $WORK_DIR/$FAMILY_ID
-
-bcbio_nextgen.py $CONFIG_FILE -n 16 -t local
-
-DATE=$(basename `tail log/bcbio-nextgen.log | grep 'Storing in local filesystem' | tail -n 1 | awk '{ print $6 }' | perl -pe "s/_${PROJECT_ID}.+//"`)
-
-if [ -e $OUTPUT_DIR/${DATE}_${PROJECT_ID}_${FAMILY_ID} ]
-then
-  for INDV in `cut -f 2 $PARAMS_DIR/${PROJECT_ID}_${FAMILY_ID}.ped`
-  do
-    mv $OUTPUT_DIR/$INDV $OUTPUT_DIR/${DATE}_${PROJECT_ID}_${FAMILY_ID}/
-  done
-
-  # fix VCF output file names
-  cd $OUTPUT_DIR/${DATE}_${PROJECT_ID}_${FAMILY_ID}
-  if [ ! -e ${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ]
-  then
-    PREFIX=`echo $FAMILY_ID | cut -d '_' -f 1`
-    SUFFIX=`echo $FAMILY_ID | cut -d '_' -f 2`
-    mv ${PREFIX}${SUFFIX}-gatk-haplotype-annotated.vcf.gz ${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
-    mv ${PREFIX}${SUFFIX}-gatk-haplotype-annotated.vcf.gz.tbi ${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz.tbi
-  fi
-
-else
-  echo $OUTPUT_DIR/${DATE}_${PROJECT_ID}_${FAMILY_ID} does not exist.
-fi
-
diff --git a/submit_depth_of_coverage_MQ20_BQ20.sh b/submit_depth_of_coverage_MQ20_BQ20.sh
old mode 100644
new mode 100755
diff --git a/submit_trio_wes_archive_project.sh b/submit_trio_wes_archive_project.sh
deleted file mode 100755
index b00f33b86cd3f4fa34add0bd575a5894d4267be2..0000000000000000000000000000000000000000
--- a/submit_trio_wes_archive_project.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-#PBS -l walltime=24:00:00
-#PBS -l ncpus=1,mem=2gb
-#PBS -q uv2000
-#PBS -N trio_whole_exome_archive_project
-#PBS -j oe
-
-# Expects environment variables to be set
-# PROJECT_ID - e.g. 12345_LastnameFirstname
-# VERSION - e.g. v1, v2
-# PRIORITY_DIRS - e.g. 05122019,07122019 (colon delimited if more than one)
-# CONFIG_SH - absolute path to configuration script setting environment variables
-
-# Source the configuration file
-source $CONFIG_SH
-
-# Move to the raw reads folder
-cd $DOWNLOAD_DIR
-
-# Copy the raw read files
-rsync -av $PROJECT_ID $ARCHIVE_DIR/data/
-
-# Move to the output directory
-cd $OUTPUT_DIR
-
-# Copy bcbio output files
-BASE_PROJECT_ID=`echo ${PROJECT_ID} | cut -d '_' -f 1`
-rsync -av --exclude '*.bam' ${VERSION}_${BASE_PROJECT_ID} $ARCHIVE_DIR/
-
-# Copy qc files
-cd qc
-mkdir -p $ARCHIVE_DIR/qc
-rsync -av ${VERSION}_${PROJECT_ID}* $ARCHIVE_DIR/qc/
-
-# Copy prioritization files
-cd ../prioritization
-mkdir -p $ARCHIVE_DIR/prioritization
-
-DIRS=$(echo $PRIORITY_DIRS | tr ":" "\n")
-
-for dir in $DIRS
-do
-  rsync -av $dir $ARCHIVE_DIR/prioritization/
-  rsync -av $dir.md5sum.txt $ARCHIVE_DIR/prioritization/
-done
-
-# move to the archive area and check the md5s
-cd $ARCHIVE_DIR
-
-cd ${VERSION}_${BASE_PROJECT_ID}
-for family_dir in *${VERSION}_${PROJECT_ID}*
-do
-  cd $family_dir
-  md5sum --check md5sum.txt
-  cd ..
-done
-cd ..
-
-cd qc
-md5sum --check ${VERSION}_${PROJECT_ID}_qc_report.md5sum.txt
-
-cd ../prioritization
-
-for dir in $DIRS
-do
-  cd $dir
-  md5sum --check ../$dir.md5sum.txt
-  cd ..
-done
-
diff --git a/submit_trio_wes_bcbio.sh b/submit_trio_wes_bcbio.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c919c194867de60f648c53fe33da59b39f9d750c
--- /dev/null
+++ b/submit_trio_wes_bcbio.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=8GB
+#SBATCH --time=48:00:00
+#SBATCH --job-name=trio_whole_exome_bcbio
+#SBATCH --output=trio_whole_exome_bcbio.%A_%a.out
+#SBATCH --error=trio_whole_exome_bcbio.%A_%a.err
+
+# Expects environment variables to be set
+# PROJECT_ID - e.g. 12345_LastnameFirstname
+# CONFIG_SH - absolute path to configuration script setting environment variables
+# VERSION - e.g. v1, v2
+
+source $CONFIG_SH
+
+FAMILY_ID=`head -n $SLURM_ARRAY_TASK_ID $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
+
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+CONFIG_FILE=$CONFIG_DIR/*_${FAMILY_ID}.yaml
+
+mkdir -p $WORK_DIR/$FAMILY_ID
+cd $WORK_DIR/$FAMILY_ID
+
+bcbio_nextgen.py $CONFIG_FILE -n $SLURM_CPUS_PER_TASK -t local
+
+DATE=$(basename `tail log/bcbio-nextgen.log | grep 'Storing in local filesystem' | tail -n 1 | awk '{ print $6 }' | perl -pe "s/_${SHORT_PROJECT_ID}.+//"`)
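+# A sketch of what the line above extracts (assuming the bcbio log line reads
+# "... Storing in local filesystem: <path>/<date>_${SHORT_PROJECT_ID}_..." with the
+# path as its 6th whitespace-separated field): the perl substitution strips
+# everything from "_${SHORT_PROJECT_ID}" onwards and basename leaves just the
+# date stamp, e.g. DATE=2022-04-01 (hypothetical).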
+
+FAMILY_DIR=${DATE}_${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
+
+if [ -e $OUTPUT_DIR/$FAMILY_DIR ]
+then
+  for INDV in `cut -f 2 $OUTPUT_DIR/${SHORT_PROJECT_ID}_${VERSION}/params/${PROJECT_ID}_${FAMILY_ID}.ped`
+  do
+    mv $OUTPUT_DIR/$INDV $OUTPUT_DIR/$FAMILY_DIR/
+  done
+
+  # fix VCF output file names
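+  # (bcbio compresses the family id by stripping its underscore, so a hypothetical
+  #  FAMILY_ID=19285_123456 is written as 19285123456-gatk-haplotype-annotated.vcf.gz;
+  #  the mv below restores the expected name)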
+  cd $OUTPUT_DIR/$FAMILY_DIR
+  if [ ! -e ${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ]
+  then
+    PREFIX=`echo $FAMILY_ID | cut -d '_' -f 1`
+    SUFFIX=`echo $FAMILY_ID | cut -d '_' -f 2`
+    mv ${PREFIX}${SUFFIX}-gatk-haplotype-annotated.vcf.gz ${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
+    mv ${PREFIX}${SUFFIX}-gatk-haplotype-annotated.vcf.gz.tbi ${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz.tbi
+  fi
+
+  cd $OUTPUT_DIR
+  mkdir -p ${SHORT_PROJECT_ID}_${VERSION}/families
+  mv $FAMILY_DIR ${SHORT_PROJECT_ID}_${VERSION}/families/
+
+else
+  echo $OUTPUT_DIR/$FAMILY_DIR does not exist.
+fi
diff --git a/submit_trio_wes_checksums.sh b/submit_trio_wes_checksums.sh
deleted file mode 100755
index b67cba4ac6fd1f02bd68c23062ad02d100e372c3..0000000000000000000000000000000000000000
--- a/submit_trio_wes_checksums.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#PBS -l walltime=48:00:00
-#PBS -l ncpus=1,mem=2gb
-#PBS -q uv2000
-#PBS -N trio_whole_exome_checksums
-#PBS -j oe
-
-# enable running singletons
-if [ -z $PBS_ARRAY_INDEX ]
-then
-  if [ -z $INDEX ]
-  then
-    export PBS_ARRAY_INDX=1
-  else
-    export PBS_ARRAY_INDEX=$INDEX
-  fi
-fi
-
-# Expects environment variables to be set
-# PROJECT_ID - e.g. 12345_LastnameFirstname
-# VERSION - e.g. v1, v2
-# CONFIG_SH - absolute path to configuration script setting environment variables
-
-source $CONFIG_SH
-
-FAMILY_ID=`head -n $PBS_ARRAY_INDEX $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
-
-BASE_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
-
-# This assumes that ${VERSION}_${PROJECT_ID}_${FAMILY_ID} is unique, and it should be - if there was
-# a re-run of a family, it should have a new project id.
-cd $OUTPUT_DIR/${VERSION}_${BASE_PROJECT_ID}/*${VERSION}_${PROJECT_ID}_${FAMILY_ID}*
-
-rm md5sum.txt 2> /dev/null
-
-for file in `find . -type f | grep -v '\.bam'`
-do
-  md5sum $file >> md5sum.txt
-done
-
diff --git a/submit_trio_wes_cram_compression.sh b/submit_trio_wes_cram_compression.sh
index b0e8498bec1216a7b5673ec6a2278fbddbf154b2..05e95bd39546f70700997a27a7329e6686e2332c 100755
--- a/submit_trio_wes_cram_compression.sh
+++ b/submit_trio_wes_cram_compression.sh
@@ -1,20 +1,10 @@
 #!/bin/bash
-#PBS -l walltime=48:00:00
-#PBS -l ncpus=16,mem=8gb
-#PBS -q uv2000
-#PBS -N trio_whole_exome_cram_compression
-#PBS -j oe
-
-# enable running singletons
-if [ -z $PBS_ARRAY_INDEX ]
-then
-  if [ -z $INDEX ]
-  then
-    export PBS_ARRAY_INDX=1
-  else
-    export PBS_ARRAY_INDEX=$INDEX
-  fi
-fi
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=8GB
+#SBATCH --time=48:00:00
+#SBATCH --job-name=trio_whole_exome_cram_compression
+#SBATCH --output=trio_whole_exome_cram_compression.%A_%a.out
+#SBATCH --error=trio_whole_exome_cram_compression.%A_%a.err
 
 # Expects environment variables to be set
 # PROJECT_ID - e.g. 12345_LastnameFirstname
@@ -23,20 +13,20 @@ fi
 
 source $CONFIG_SH
 
-FAMILY_ID=`head -n $PBS_ARRAY_INDEX $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
+FAMILY_ID=`head -n $SLURM_ARRAY_TASK_ID $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
 
-BASE_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
 
-# This assumes that ${VERSION}_${PROJECT_ID}_${FAMILY_ID} is unique, and it should be - if there was
-# a re-run of a family, it should have a new project id.
-cd $OUTPUT_DIR/${VERSION}_${BASE_PROJECT_ID}/*${VERSION}_${PROJECT_ID}_${FAMILY_ID}*
+# This assumes that ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID} is unique, and it should be -
+# if there was a re-run of a family, it should have a new project id and version.
+cd $OUTPUT_DIR/${SHORT_PROJECT_ID}_${VERSION}/families/*_${FAMILY_ID}
 
 for BAM in */*.bam
 do
 
   # 1. Compress to CRAM format without quality score binning
   CRAM=${BAM%.bam}.cram
-  samtools view -@ 16 -T $REFERENCE_GENOME -C -o $CRAM $BAM
+  samtools view -@ $SLURM_CPUS_PER_TASK -T $REFERENCE_GENOME -C -o $CRAM $BAM
 
   # 2. Index the CRAM file - good sanity check
   samtools index $CRAM
diff --git a/submit_trio_wes_family_checksums.sh b/submit_trio_wes_family_checksums.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d113eb54dd38bb009593c9fbf61684043424da94
--- /dev/null
+++ b/submit_trio_wes_family_checksums.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=6:00:00
+#SBATCH --job-name=trio_whole_exome_family_checksums
+#SBATCH --output=trio_whole_exome_family_checksums.%A_%a.out
+#SBATCH --error=trio_whole_exome_family_checksums.%A_%a.err
+
+# Expects environment variables to be set
+# PROJECT_ID - e.g. 12345_LastnameFirstname
+# VERSION - e.g. v1, v2
+# CONFIG_SH - absolute path to configuration script setting environment variables
+
+source $CONFIG_SH
+
+FAMILY_ID=`head -n $SLURM_ARRAY_TASK_ID $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
+
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+# This assumes that ${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID} is unique, and it should be -
+# if there was a re-run of a family, it should have a new project id and version.
+cd $OUTPUT_DIR/${SHORT_PROJECT_ID}_${VERSION}/families/*_${FAMILY_ID}
+
+rm md5sum.txt 2> /dev/null
+
+for file in `find . -type f | grep -v '\.bam'`
+do
+  md5sum $file >> md5sum.txt
+done
diff --git a/submit_trio_wes_priority_and_qc_checksums.sh b/submit_trio_wes_priority_and_qc_checksums.sh
deleted file mode 100755
index 143b7574325b4612149f2929deb5259759d6b9a4..0000000000000000000000000000000000000000
--- a/submit_trio_wes_priority_and_qc_checksums.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-#PBS -l walltime=48:00:00
-#PBS -l ncpus=1,mem=2gb
-#PBS -q uv2000
-#PBS -N trio_whole_exome_priority_and_qc_checksums
-#PBS -j oe
-
-# Expects environment variables to be set
-# PROJECT_ID - e.g. 12345_LastnameFirstname
-# VERSION - e.g. v1, v2
-# PRIORITY_DIRS - e.g. 05122019,07122019 (colon delimited if more than one)
-# CONFIG_SH - absolute path to configuration script setting environment variables
-
-source $CONFIG_SH
-
-# calculate checksums on the qc files for this project
-
-cd $OUTPUT_DIR/qc
-
-for file in ${VERSION}_${PROJECT_ID}_qc_report*.html
-do
-  md5sum $file >> ${VERSION}_${PROJECT_ID}_qc_report.md5sum.txt
-done
-
-for file in ${VERSION}_${PROJECT_ID}.ped_check*.txt
-do
-  md5sum $file >> ${VERSION}_${PROJECT_ID}_qc_report.md5sum.txt
-done
-
-for file in `find ${VERSION}_${PROJECT_ID}_qc_report*_data -type f`
-do
-  md5sum $file >> ${VERSION}_${PROJECT_ID}_qc_report.md5sum.txt
-done
-
-# calculate checksusms on the prioritization files for this project
-
-cd $OUTPUT_DIR/prioritization
-
-DIRS=$(echo $PRIORITY_DIRS | tr ":" "\n")
-
-for dir in $DIRS
-do
-  cd $dir
-
-  rm ../$dir.md5sum.txt 2> /dev/null
-
-  for file in `find . -type f`
-  do
-    md5sum $file >> ../$dir.md5sum.txt
-  done
-
-  cd ..
-done
-
-
diff --git a/submit_trio_wes_project_checksums.sh b/submit_trio_wes_project_checksums.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e108b2fde570b188bf3477a8e5643c0ad4babad2
--- /dev/null
+++ b/submit_trio_wes_project_checksums.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=6:00:00
+#SBATCH --job-name=trio_whole_exome_project_checksums
+#SBATCH --output=trio_whole_exome_project_checksums.%A.out
+#SBATCH --error=trio_whole_exome_project_checksums.%A.err
+
+# Expects environment variables to be set
+# PROJECT_ID - e.g. 12345_LastnameFirstname
+# VERSION - e.g. v1, v2
+# CONFIG_SH - absolute path to configuration script setting environment variables
+
+source $CONFIG_SH
+
+# calculate checksums on all files for this project except the families directory
+
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+cd $OUTPUT_DIR/${SHORT_PROJECT_ID}_${VERSION}
+
+rm md5sum.txt 2> /dev/null
+
+for file in `find . -type f | grep -v families`
+do
+  md5sum $file >> md5sum.txt
+done
diff --git a/submit_trio_wes_wget_download.sh b/submit_trio_wes_wget_download.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6917c974a72e035c483f5137b1f207d7fbf4ad4f
--- /dev/null
+++ b/submit_trio_wes_wget_download.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=4GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=get_data
+#SBATCH --output=get_data.%A_%a.out
+#SBATCH --error=get_data.%A_%a.err
+
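+# Expects environment variables to be set (an assumption based on their use below):
+# TOKEN   - transfer token issued for the ftps data share
+# PROJECT - project folder name under /home/u035/u035/shared/data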
+
+# setup the connection
+PROJ_CONN="ftps://transfer.genomics.ed.ac.uk/${TOKEN}/."
+echo ${PROJ_CONN}
+
+## set up an EPCC folder for this project
+#mkdir /home/u035/u035/shared/data/$PROJECT
+#cd /home/u035/u035/shared/data/$PROJECT
+cd /home/u035/u035/shared/data
+
+# download the data
+#wget -crnH --cut-dirs=1 -i - <<<'ftps://transfer.genomics.ed.ac.uk/${TOKEN}/.'
+wget -crnH --cut-dirs=1 -i - <<<${PROJ_CONN}
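+# wget flags above: -c resume partial downloads, -r recurse, -nH skip creating a
+# host-named directory, --cut-dirs=1 drop the leading (token) path component, and
+# `-i -` read the URL list from stdin (here supplied via the here-string).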
+
+
+
+# perform the md5_check
+cd /home/u035/u035/shared/data/$PROJECT/raw_data
+
+rm md5_check.txt 2> /dev/null
+for DATE in 20*[0-9]
+do
+  cd $DATE
+  md5sum --check md5sums.txt >> ../md5_check.txt
+  cd ..
+done
diff --git a/trio_cram_setup.sh b/trio_cram_setup.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0cad9bd38dc8e55e77950c31a7dbccc7a36a89da
--- /dev/null
+++ b/trio_cram_setup.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=2GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=trio_cram_setup
+#SBATCH --output=trio_cram_setup.%A_%a.out
+#SBATCH --error=trio_cram_setup.%A_%a.err
+
+
+### Setup the folder structure for the downstream analysis ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+
+### Tools
+PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+PICARD=/home/u035/u035/shared/software/bcbio/anaconda/bin/picard
+REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
+
+
+
+#~## check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting
+#~#if [ -d "${WORK_DIR}" ]; then
+#~#  echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!"
+#~#  exit
+#~#fi
+
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source VCF, BAM and PED files                 i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID                                    e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+#~#S_PED_DIR=${SOURCE_DIR}/../../params		# requires that the family PED files are in this folder
+
+
+
+
+# create the working dir and the required subfolders
+mkdir ${WORK_DIR}
+mkdir ${VCF_DIR}
+mkdir ${PED_DIR}
+mkdir ${LOG_DIR}
+mkdir ${G2P_DIR}
+mkdir ${VASE_DIR}
+mkdir ${COV_DIR}
+mkdir ${DEC_DIR}
+mkdir ${IGV_DIR}
+mkdir ${CNV_DIR}
+mkdir ${BAMOUT_DIR}
+echo "Created ${WORK_DIR} for this batch and all the required subfolders"
+
+
+
+######################################################
+###   Copy the VCF and PED file per each family    ###
+######################################################
+
+SOURCE_VCF_DIRS=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_*
+SOURCE_PED_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/params
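+
+# ${SOURCE_VCF_DIRS} expands to the per-family result folders, named like
+# <date>_<batch>_<version>_<plate>_<family>, e.g. (hypothetical)
+# 2022-04-01_19650_v1_19285_123456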
+
+for S_VCF_DIR in ${SOURCE_VCF_DIRS}
+do
+#  echo "  ${S_VCF_DIR}"
+  VCF_DIR_NAME="${S_VCF_DIR##*/}"
+#  echo "    ${VCF_DIR_NAME}"
+  IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
+  FAM_ID=${my_arr[-1]}
+#  echo "      BATCH = ${BATCH_ID}, PLATE = ${PLATE_ID}, FAM_ID = ${FAM_ID}"
+  echo "  FAM_ID = ${FAM_ID}"
+
+  # construct the VCF and PED file names for this family
+  S_VCF_FILE=${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz
+  S_PED_FILE=${SOURCE_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped
+
+
+  # copy the trio VCF and PED files
+  cp ${S_VCF_FILE} ${VCF_DIR}
+  cp ${S_PED_FILE} ${PED_DIR}
+  echo "    copied ${S_VCF_FILE} --> ${VCF_DIR}"
+  echo "    copied ${S_PED_FILE} --> ${PED_DIR}"
+
+
+  # identify all folders (one for each individual) for this family containing cram/bam files (format: <INDI_ID>_<FAM_ID>)
+  cd ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAM_ID}
+  for ITEM in `ls`
+  do
+      if test -d "$ITEM" && [[ "$ITEM" == *"_"* ]]
+      then
+        echo "    $ITEM is a CRAM/BAM folder..."
+        BAM=${ITEM}/${ITEM}-ready.bam
+        CRAM=${ITEM}/${ITEM}-ready.cram
+        ${SAMTOOLS} view -@ 16 -T ${REFERENCE_GENOME} -hb -o ${BAM} ${CRAM}
+        ${PICARD} BuildBamIndex -I ${BAM} -O ${BAM}.bai -USE_JDK_DEFLATER true -USE_JDK_INFLATER true -VERBOSITY ERROR -QUIET true
+        echo "      Generated ${BAM} and ${BAM}.bai from ${CRAM}"
+      fi
+  done
+
+done
+
+
+
+######################################################################################
+### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
+######################################################################################
+
+time ${PYTHON2} ${SCRIPTS_DIR}/extract_trio_FAM_PRO_ID.py ${WORK_DIR}
+
+echo ""
+echo ""
+echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful"
+
+
+
+
+
+
+
+
+
diff --git a/trio_setup.sh b/trio_setup.sh
new file mode 100755
index 0000000000000000000000000000000000000000..57bbba9fa6b98e4848c85d9be3d7cc7d129d4dcc
--- /dev/null
+++ b/trio_setup.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=2GB
+#SBATCH --time=01:00:00
+#SBATCH --job-name=trio_setup
+#SBATCH --output=trio_setup.%A_%a.out
+#SBATCH --error=trio_setup.%A_%a.err
+
+
+
+
+### Setup the folder structure for the downstream analysis ###
+BASE=/home/u035/u035/shared/analysis/work
+WORK_DIR=$BASE/${PROJECT_ID}
+VCF_DIR=${WORK_DIR}/VCF
+PED_DIR=${WORK_DIR}/PED
+LOG_DIR=${WORK_DIR}/LOG
+G2P_DIR=${WORK_DIR}/G2P
+VASE_DIR=${WORK_DIR}/VASE
+COV_DIR=${WORK_DIR}/COV
+DEC_DIR=${WORK_DIR}/DECIPHER
+IGV_DIR=${DEC_DIR}/IGV
+CNV_DIR=${WORK_DIR}/CNV
+BAMOUT_DIR=${WORK_DIR}/BAMOUT
+SCRIPTS_DIR=/home/u035/u035/shared/scripts
+
+
+### Tools
+PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
+
+
+
+#~### check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting
+#~#if [ -d "${WORK_DIR}" ]; then
+#~#  echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!"
+#~#  exit
+#~#fi
+
+
+
+
+echo "SOURCE_DIR = ${SOURCE_DIR}"	# the general path to the source VCF, BAM and PED files			i.e. /home/u035/u035/shared/results
+echo "BATCH_ID = ${BATCH_ID}"		# the ID of the batch being processed 					e.g. 19650_Ansari_Morad
+echo "BATCH_NUM = ${BATCH_NUM}"		# the numerical part of the BATCH_ID					e.g. 19650
+echo "PLATE_ID = ${PLATE_ID}" 		# the PCR plate ID of the batch being currently processed, 		e.g. 19285
+echo "PROJECT_ID = ${PROJECT_ID}"	# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
+
+
+#~#S_PED_DIR=${SOURCE_DIR}/../../params	# requires that the family PED files are in this folder
+
+
+
+
+# create the working dir and the required subfolders
+mkdir ${WORK_DIR}
+mkdir ${VCF_DIR}
+mkdir ${PED_DIR}
+mkdir ${LOG_DIR}
+mkdir ${G2P_DIR}
+mkdir ${VASE_DIR}
+mkdir ${COV_DIR}
+mkdir ${DEC_DIR}
+mkdir ${IGV_DIR}
+mkdir ${CNV_DIR}
+mkdir ${BAMOUT_DIR}
+echo "Created ${WORK_DIR} for this batch and all the required subfolders"
+
+
+
+
+######################################################
+###   Copy the VCF and PED file per each family    ###
+######################################################
+
+SOURCE_VCF_DIRS=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_*
+SOURCE_PED_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/params
+
+
+
+for S_VCF_DIR in ${SOURCE_VCF_DIRS}
+do
+#  echo "  ${S_VCF_DIR}"
+  VCF_DIR_NAME="${S_VCF_DIR##*/}"
+#  echo "    ${VCF_DIR_NAME}"
+  IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
+  FAM_ID=${my_arr[-1]}
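+  # (splitting a hypothetical folder name 2022-04-01_19650_v1_19285_123456 on '_'
+  #  leaves 123456 as the last array element, hence FAM_ID=123456)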
+#  echo "      BATCH = ${BATCH_ID}, PLATE = ${PLATE_ID}, FAM_ID = ${FAM_ID}"
+  echo "  FAM_ID = ${FAM_ID}"
+
+  # construct the VCF and PED file names for this family
+  S_VCF_FILE=${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz
+  S_PED_FILE=${SOURCE_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped
+
+
+  # copy the trio VCF and PED files
+  cp ${S_VCF_FILE} ${VCF_DIR}
+  cp ${S_PED_FILE} ${PED_DIR}
+  echo "    copied ${S_VCF_FILE} --> ${VCF_DIR}"
+  echo "    copied ${S_PED_FILE} --> ${PED_DIR}"
+
+done
+
+
+
+
+
+######################################################################################
+### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
+######################################################################################
+
+time ${PYTHON2} ${SCRIPTS_DIR}/extract_trio_FAM_PRO_ID.py ${WORK_DIR}
+
+
+######################################################################################
+### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *for singleton* families ###
+######################################################################################
+
+time ${PYTHON2} ${SCRIPTS_DIR}/extract_solo_FAM_PRO_ID.py ${WORK_DIR}
+
+
+
+echo ""
+echo ""
+echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful"
+
+
+
+
+
+
+
+
+
+
diff --git a/prepare_bcbio_config.sh b/trio_wes_prepare_bcbio_config.sh
similarity index 63%
rename from prepare_bcbio_config.sh
rename to trio_wes_prepare_bcbio_config.sh
index 31793d873b0f429ab072104beeb2e229130461e8..572df644a0355d64c34a629e8b743d17a6f70d8b 100755
--- a/prepare_bcbio_config.sh
+++ b/trio_wes_prepare_bcbio_config.sh
@@ -48,15 +48,21 @@ perl -pi -e 's/\r//' $PROJECT_ID.ped
 # create reads directory for project and symlink directory underneath
 mkdir -p $READS_DIR/$PROJECT_ID/symlinks
 
-cat $DOWNLOAD_DIR/$PROJECT_ID/*/file_list.tsv | \
+cat $DOWNLOAD_DIR/$PROJECT_ID/raw_data/*/file_list.tsv | \
   perl $SCRIPTS/trio_whole_exome_create_parameter_files.pl \
     --prefix ./$PROJECT_ID \
     --ped $PROJECT_ID.ped \
     --suffix $SAMPLE_SUFFIX
 
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+mkdir -p ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params
+
 for FAMILY_ID in `cat ${PROJECT_ID}.family_ids.txt`
 do
-  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  PREFIX=${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
+  
+  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${PREFIX}.csv
   COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'`
 
   for ((i=1; i<=$COUNT; i=i+1))
@@ -67,49 +73,52 @@ do
 
     # create symlinks for problematic filenames
     mkdir $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*$SAMPLE*/*_1_*_1.fastq.gz`
+    for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_1.fastq.gz"`
     do
-      newname=`basename $FILE | sed -e 's/_1_/_one_/'`
-      ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
+	newname=`basename $FILE | sed -e 's/_1_/_one_/'`
+	ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
     done
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*$SAMPLE*/*_1_*_2.fastq.gz`
+    for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_2.fastq.gz"`
     do
-      newname=`basename $FILE | sed -e 's/_1_/_one_/'`
-      ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
+	newname=`basename $FILE | sed -e 's/_1_/_one_/'`
+	ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
     done
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*$SAMPLE*/*_2_*_1.fastq.gz`
+    for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_1.fastq.gz"`
     do
-      newname=`basename $FILE | sed -e 's/_2_/_two_/'`
-      ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
+	newname=`basename $FILE | sed -e 's/_2_/_two_/'`
+	ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
     done
-    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*/*$SAMPLE*/*_2_*_2.fastq.gz`
+    for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_2.fastq.gz"`
     do
-      newname=`basename $FILE | sed -e 's/_2_/_two_/'`
-      ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
+	newname=`basename $FILE | sed -e 's/_2_/_two_/'`
+	ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
     done
-
+    
     for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz`
     do
-      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv
     done
 
   done
 
-  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${PREFIX}.csv
 
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  mv ${PREFIX}-merged.csv ${PREFIX}.csv
 
   BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2`
 
-  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
+  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${PREFIX}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
 
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/
+  mv ${PREFIX}/config/${PREFIX}.yaml $CONFIG_DIR/
 
   COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"`
 
-  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
+  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${PREFIX}.yaml
+
+  rm -r ${PREFIX}
 
-  rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID}
+  cp ${PREFIX}.csv ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/
+  cp ${PROJECT_ID}_${FAMILY_ID}.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/
 
 done
 
diff --git a/prepare_bcbio_config_crf.sh b/trio_wes_prepare_bcbio_config_crf.sh
similarity index 73%
rename from prepare_bcbio_config_crf.sh
rename to trio_wes_prepare_bcbio_config_crf.sh
index f9e3c007b4b0d98efe88a4a3ca87078de625fdb1..c1c1d77a446b07312aee962fd66788707680a614 100755
--- a/prepare_bcbio_config_crf.sh
+++ b/trio_wes_prepare_bcbio_config_crf.sh
@@ -51,8 +51,8 @@ perl -pi -e 's/\r//' $PROJECT_ID.ped
 # create reads directory for project
 mkdir -p $READS_DIR/$PROJECT_ID
 
-# generate the family_ids list - makes strong assumption about relative paths!
-ls ../../data/$PROJECT_ID/*.gz | grep -v Undetermined | cut -d '/' -f 5 | cut -f 1,3 -d '_' | sort -u > $PROJECT_ID.family_ids.txt
+# generate the family_ids list
+ls $DOWNLOAD_DIR/$PROJECT_ID/*.gz | grep -v Undetermined | cut -d '/' -f 8 | cut -f 1,3 -d '_' | sort -u > $PROJECT_ID.family_ids.txt
 
 PLATE_ID=`cut -f 1 -d '_' $PROJECT_ID.family_ids.txt | sort -u`
 for FAMILY_ID in `cut -f 2 -d '_' $PROJECT_ID.family_ids.txt`
@@ -63,11 +63,16 @@ do
     > ${PROJECT_ID}_${PLATE_ID}_${FAMILY_ID}.ped
 done
 
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
 for FAMILY_ID in `cat ${PROJECT_ID}.family_ids.txt`
 do
-  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  PREFIX=${SHORT_PROJECT_ID}_${VERSION}_${FAMILY_ID}
+  echo "samplename,description,batch,sex,phenotype,variant_regions" > $PREFIX.csv
   COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'`
 
+  echo $COUNT
+
   for ((i=1; i<=$COUNT; i=i+1))
   do
     SAMPLE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 2`
@@ -76,25 +81,31 @@ do
 
     for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*${SAMPLE}*.gz`
     do
-      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+      echo "$FILE,$SAMPLE,$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv
     done
 
   done
 
-  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv $PREFIX.csv
 
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  mv $PREFIX-merged.csv $PREFIX.csv
 
   BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2`
 
-  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
+  bcbio_nextgen.py -w template $BCBIO_TEMPLATE $PREFIX.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
 
-  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/
+  mv $PREFIX/config/$PREFIX.yaml $CONFIG_DIR/
 
   COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"`
 
-  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
+  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/$PREFIX.yaml
 
-  rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID}
+  rm -r $PREFIX
+
+  mkdir -p ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/
+  mv ${PREFIX}.csv ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/
+  mv ${PROJECT_ID}_${FAMILY_ID}.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/
 
 done
+
+mv *.txt *.log *.ped ${OUTPUT_DIR}/${SHORT_PROJECT_ID}_${VERSION}/params/
diff --git a/trio_wes_prepare_bcbio_config_singleton_from_duo.sh b/trio_wes_prepare_bcbio_config_singleton_from_duo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..03c7e4a5a30295cc374e778a5dee1824e52be056
--- /dev/null
+++ b/trio_wes_prepare_bcbio_config_singleton_from_duo.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# trio_wes_prepare_bcbio_config_singleton_from_duo.sh <config.sh> <project_id> <params>
+# 
+# Assumes that reads for the samples are in the path
+# $READS_DIR/<project_id>/<date>/<sample><sample_suffix>/*.gz,
+# and that no samples other than those with reads are listed in the 
+# PED file. $READS_DIR is specified in the <config.sh> file.
+#
+# Assumes that the sample names in the PED file match those 
+# specifying the read directories with the addition of a specified
+# suffix.
+#
+# All samples must be annotated with sex (1=male, 2=female) in the
+# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
+# column of the PED file.
+#
+# Runs bcbio sample preparation and configuration file generation,
+# assuming the template configuration file is at $BCBIO_TEMPLATE,
+# specified in the <config.sh> file.
+#
+# Assumes bcbio is on the PATH (set in <config.sh>).
+#
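+#
+# Example invocation (hypothetical arguments):
+#   ./trio_wes_prepare_bcbio_config_singleton_from_duo.sh \
+#       trio_whole_exome_config.sh 12345_LastnameFirstname singleton_from_duo.txt
+#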
+
+CONFIG_SH=$1
+PROJECT_ID=$2
+PARAMS=$3
+
+source $CONFIG_SH
+
+#
+# Create the file $PROJECT_ID.family_ids.txt
+#
+cd $PARAMS_DIR
+
+cat *.ped | cut -f 1 > $PROJECT_ID.family_ids.txt
+
+SHORT_PROJECT_ID=`echo $PROJECT_ID | cut -f 1 -d '_'`
+
+COUNT=`wc -l ${PROJECT_ID}.family_ids.txt | awk '{ print $1 }'`
+
+for ((i = 1; i <= $COUNT; i = i + 1))
+do
+
+    ORIG_PROJECT_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 1 -d '_'`
+    ORIG_VERSION=`head -n $i $PARAMS | tail -n 1 | cut -f 1 | cut -f 2 -d '_'`
+    BATCH_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 2`
+    FAMILY_ID=`head -n $i $PARAMS | tail -n 1 | cut -f 3`
+
+    SAMPLE=`cut -f 2 *_${FAMILY_ID}.ped`
+    SEX=`cut -f 5 *_${FAMILY_ID}.ped`
+    PHENOTYPE=`cut -f 6 *_${FAMILY_ID}.ped`
+
+    PREFIX=${ORIG_PROJECT_ID}_${ORIG_VERSION}_${BATCH_ID}_${FAMILY_ID}
+    echo "samplename,description,batch,sex,phenotype,variant_regions" > ${PREFIX}.csv
+    len=`expr length $ORIG_PROJECT_ID`
+
+    if [ $len -eq 5 ]
+    then
+	mkdir -p $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE
+
+	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_1.fastq.gz"`
+	do
+	    newname=`basename $FILE | sed -e 's/_1_/_one_/'`
+	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
+	done
+	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_1_*_2.fastq.gz"`
+	do
+	    newname=`basename $FILE | sed -e 's/_1_/_one_/'`
+	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
+	done
+	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_1.fastq.gz"`
+	do
+	    newname=`basename $FILE | sed -e 's/_2_/_two_/'`
+	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%1.fastq.gz}R1.fastq.gz
+	done
+	for FILE in `find $DOWNLOAD_DIR/$ORIG_PROJECT_ID* -wholename "*$SAMPLE*/*_2_*_2.fastq.gz"`
+	do
+	    newname=`basename $FILE | sed -e 's/_2_/_two_/'`
+	    ln -s $FILE $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/${newname%2.fastq.gz}R2.fastq.gz
+	done
+
+	for FILE in `ls $READS_DIR/$PROJECT_ID/symlinks/$SAMPLE/*_R[1,2].fastq.gz`
+	do
+	    echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> ${PREFIX}.csv
+	done
+
+    else
+	for FILE in `ls $DOWNLOAD_DIR/$ORIG_PROJECT_ID*/*${SAMPLE}*.gz`
+	do
+	    echo "$FILE,$SAMPLE,${BATCH_ID}_${FAMILY_ID},$SEX,$PHENOTYPE,$TARGET" >> $PREFIX.csv
+	done
+    fi
+
+    bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${PREFIX}.csv
+
+    mv ${PREFIX}-merged.csv ${PREFIX}.csv
+
+    bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${PREFIX}.csv $READS_DIR/$PROJECT_ID/*_${FAMILY_ID}_R[12].fastq.gz
+
+    mv ${PREFIX}/config/${PREFIX}.yaml $CONFIG_DIR/
+
+    perl -i -pe "s/${BATCH_ID}${FAMILY_ID}/${BATCH_ID}_${FAMILY_ID}/" $CONFIG_DIR/${PREFIX}.yaml
+
+    rm -r ${PREFIX}
+done
diff --git a/trio_whole_exome_bcbio_crf_template.yaml b/trio_whole_exome_bcbio_crf_template.yaml
deleted file mode 100644
index e379642d408852e6aff9309fbc3be6ae0502a6ff..0000000000000000000000000000000000000000
--- a/trio_whole_exome_bcbio_crf_template.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-details:
-- algorithm:
-    platform: illumina
-    quality_format: standard
-    aligner: bwa
-    align_split_size: false
-    trim_reads: fastp
-    adapters: [nextera2, polyg]
-    mark_duplicates: true
-    realign: false
-    recalibrate: true
-    effects: vep
-    effects_transcripts: all
-    variantcaller: gatk-haplotype
-    indelcaller: false
-    remove_lcr: true
-    tools_on:
-    - vep_splicesite_annotations
-  analysis: variant2
-  genome_build: hg38
-upload:
-  dir: /scratch/u035/u035/shared/trio_whole_exome/analysis/output
diff --git a/trio_whole_exome_bcbio_template.yaml b/trio_whole_exome_bcbio_template.yaml
index f6ebbb44f3e55484bf6eab03decdbf5ecb9263f6..8c8a98940d79514a66e551b3bf966a4179ae036c 100644
--- a/trio_whole_exome_bcbio_template.yaml
+++ b/trio_whole_exome_bcbio_template.yaml
@@ -3,6 +3,9 @@ details:
     platform: illumina
     quality_format: standard
     aligner: bwa
+    align_split_size: false
+    trim_reads: fastp
+    adapters: [nextera2, polyg]
     mark_duplicates: true
     realign: false
     recalibrate: true
@@ -16,4 +19,4 @@ details:
   analysis: variant2
   genome_build: hg38
 upload:
-  dir: /scratch/u035/u035/shared/trio_whole_exome/analysis/output
+  dir: /home/u035/u035/shared/results
diff --git a/trio_whole_exome_config.sh b/trio_whole_exome_config.sh
index 5c291d3d156d343b7595ed8415980aa939d63033..d3202b6bc1216b6b124058b8a8dca1802038b6e5 100644
--- a/trio_whole_exome_config.sh
+++ b/trio_whole_exome_config.sh
@@ -3,19 +3,22 @@
 # Basic configuration options for trio WES pipeline
 #
 
-SCRIPTS=/home/u035/u035/shared/scripts
-BCBIO_TEMPLATE=$SCRIPTS/trio_whole_exome_bcbio_template.yaml
-TARGET=/home/u035/u035/shared/resources/exome_targets/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
-DOWNLOAD_DIR=/scratch/u035/u035/shared/data
-REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
+# primary locations
+BASE=/home/u035/u035/shared
+SCRIPTS=$BASE/scripts
+DOWNLOAD_DIR=$BASE/data
+OUTPUT_DIR=$BASE/results
 
-BASE=/scratch/u035/u035/shared/analysis
-PARAMS_DIR=$BASE/params
-READS_DIR=$BASE/reads
-CONFIG_DIR=$BASE/config
-WORK_DIR=$BASE/work
-OUTPUT_DIR=$BASE/output
+# resource locations
+BCBIO_TEMPLATE=$SCRIPTS/trio_whole_exome_bcbio_template.yaml
+TARGET=$BASE/resources/exome_targets/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
+REFERENCE_GENOME=$BASE/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
 
-ARCHIVE_DIR=/archive/u035/trio_whole_exome
+# temporary working files
+PARAMS_DIR=$BASE/analysis/params
+READS_DIR=$BASE/analysis/reads
+CONFIG_DIR=$BASE/analysis/config
+WORK_DIR=$BASE/analysis/work
+LOGS_DIR=$BASE/analysis/logs
 
-export PATH=/home/u035/u035/shared/software/bcbio/tools/bin:$PATH
+export PATH=$BASE/software/bcbio/tools/bin:$PATH
diff --git a/trio_whole_exome_create_parameter_files.pl b/trio_whole_exome_create_parameter_files.pl
index 2092253ab78556c8ebb30ebdd14982ec51f595cc..f84cb9f2610a462f123bcf82f6e38b9fe47f284a 100644
--- a/trio_whole_exome_create_parameter_files.pl
+++ b/trio_whole_exome_create_parameter_files.pl
@@ -53,6 +53,7 @@ my %family;
 while (my $line = <>)
 {
 	next if ($line =~ /^File/);
+	next if ($line =~ /unassigned/);
 	chomp $line;
 
 	my @tokens = split(/\t/, $line);
@@ -88,7 +89,7 @@ my $family_id_out_fh = new IO::File;
 $family_id_out_fh->open(sprintf("%s.family_ids.txt", $output_prefix), "w") or die "Could not open $output_prefix.family_ids.txt\n$!";
 
 my $ped_out_fh = new IO::File;
-foreach my $family_id (keys %family)
+foreach my $family_id (keys %ped)
 {
 	my $pcr_plate_id = $family{$family_id}{'pcr_plate_id'};
 	my $new_family_id = sprintf("%s_%s", $pcr_plate_id, $family_id);
@@ -97,7 +98,7 @@ foreach my $family_id (keys %family)
 
 	$ped_out_fh->open(sprintf("%s_%s.ped", $output_prefix, $new_family_id), "w") or die "Could not open $output_prefix.$new_family_id.ped\n$!";
 
-	foreach my $individual_id (keys %{ $family{$family_id}{'individual_id'} })
+	foreach my $individual_id (keys %{ $ped{$family_id} })
 	{
 		my $father_id = $ped{$family_id}{$individual_id}{'father_id'};
 		my $mother_id = $ped{$family_id}{$individual_id}{'mother_id'};
diff --git a/trio_whole_exome_crf_config.sh b/trio_whole_exome_crf_config.sh
deleted file mode 100644
index 3b7a89fdf2e885ce50a550b089645f45c9bcd194..0000000000000000000000000000000000000000
--- a/trio_whole_exome_crf_config.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/bash
-#
-# Basic configuration options for trio WES pipeline
-#
-
-SCRIPTS=/home/u035/u035/shared/scripts
-BCBIO_TEMPLATE=$SCRIPTS/trio_whole_exome_bcbio_crf_template.yaml
-TARGET=/home/u035/u035/shared/resources/exome_targets/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
-DOWNLOAD_DIR=/scratch/u035/u035/shared/data
-REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
-
-BASE=/scratch/u035/u035/shared/analysis
-PARAMS_DIR=$BASE/params
-READS_DIR=$BASE/reads
-CONFIG_DIR=$BASE/config
-WORK_DIR=$BASE/work
-OUTPUT_DIR=$BASE/output
-
-ARCHIVE_DIR=/archive/u035/trio_whole_exome
-
-export PATH=/home/u035/u035/shared/software/bcbio/tools/bin:$PATH
diff --git a/trio_whole_exome_parse_peddy_ped_csv.pl b/trio_whole_exome_parse_peddy_ped_csv.pl
index 7f964789ecfabfb906ebcf53e2b14ce38595c461..05e4d02085e235d092d7926321c3715b1adc8de1 100644
--- a/trio_whole_exome_parse_peddy_ped_csv.pl
+++ b/trio_whole_exome_parse_peddy_ped_csv.pl
@@ -21,30 +21,33 @@ use IO::File;
 
 my $usage = qq{USAGE:
 $0 [--help]
-  --output  Output directory
-  --ped     Pedigree file for project
-  --project Project id
-  --batch   Batch id
-  --version Analysis run version (v1, v2, etc)
+  --output   Output file
+  --families Family directory
+  --ped      Pedigree file for project
+  --project  Project id
+  --batch    Batch id
+  --version  Analysis run version (v1, v2, etc)
 };
 
 my $help = 0;
 my $ped_file;
+my $fam_dir;
 my $project_id;
 my $version;
-my $out_dir;
+my $out_file;
 my $batch_id;
 
 GetOptions(
-    'help'      => \$help,
-    'project=s' => \$project_id,
-    'ped=s'     => \$ped_file,
-    'output=s'  => \$out_dir,
-    'version=s' => \$version,
-    'batch=s'   => \$batch_id
+    'help'       => \$help,
+    'project=s'  => \$project_id,
+    'ped=s'      => \$ped_file,
+    'output=s'   => \$out_file,
+    'families=s' => \$fam_dir,
+    'version=s'  => \$version,
+    'batch=s'    => \$batch_id
 ) or die $usage;
 
-if ($help || !$project_id || !$ped_file || !$out_dir || !$batch_id || !$version)
+if ($help || !$project_id || !$ped_file || !$out_file || !$batch_id || !$version || !$fam_dir)
 {
     print $usage;
     exit(0);
@@ -70,7 +73,6 @@ while (my $line = <$in_fh>)
 
 $in_fh->close();
 
-my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id);
 my $out_fh = new IO::File;
 $out_fh->open($out_file, "w") or die "Could not open $out_file\n$!";
 
@@ -78,8 +80,8 @@ printf $out_fh "project_id\tbatch_id\tsample_a\tsample_b\tpedigree_parents\tpred
 
 foreach my $family_id (sort keys %ped)
 {
-	my @peddy_glob = glob(sprintf("$out_dir/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv", 
-		$version, $project_id, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id));
+	my @peddy_glob = glob(sprintf("$fam_dir/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv", 
+	        $project_id, $version, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id));
 	next if (scalar(@peddy_glob) == 0);
 
 	my $peddy_fh = new IO::File;
diff --git a/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl b/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl
deleted file mode 100644
index 6ef53f117e86e6eb20e3a4fb1d963914a352ee45..0000000000000000000000000000000000000000
--- a/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/perl -w
-
-=head1 NAME
-
-trio_whole_exome_parse_peddy_ped_csv_no_batch.pl
-
-=head1 AUTHOR
-
-Alison Meynert (alison.meynert@igmm.ed.ac.uk)
-
-=head1 DESCRIPTION
-
-Checks the parent-child and parent-parent relationships from peddy output.
-
-=cut
-
-use strict;
-
-use Getopt::Long;
-use IO::File;
-
-my $usage = qq{USAGE:
-$0 [--help]
-  --output  Output directory
-  --ped     Pedigree file for project
-  --project Project id
-  --version Analysis run version (v1, v2, etc)
-};
-
-my $help = 0;
-my $ped_file;
-my $project_id;
-my $version;
-my $out_dir;
-
-GetOptions(
-    'help'      => \$help,
-    'project=s' => \$project_id,
-    'ped=s'     => \$ped_file,
-    'output=s'  => \$out_dir,
-    'version=s' => \$version
-) or die $usage;
-
-if ($help || !$project_id || !$ped_file || !$out_dir || !$version)
-{
-    print $usage;
-    exit(0);
-}
-
-# Read in the pedigree file
-my $in_fh = new IO::File;
-$in_fh->open($ped_file, "r") or die "Could not open $ped_file\n$!";
-
-my %ped;
-while (my $line = <$in_fh>)
-{
-	chomp $line;
-	my ($family_id, $individual_id, $father_id, $mother_id, $sex, $aff) = split(/\t/, $line);
-	$ped{$family_id}{'count'}++;
-	$ped{$family_id}{$individual_id}{'father'} = $father_id;
-	$ped{$family_id}{$individual_id}{'mother'} = $mother_id;
-	if ($aff == 2)
-	{
-		$ped{$family_id}{'aff'} = $individual_id;
-	}
-}
-
-$in_fh->close();
-
-my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id);
-my $out_fh = new IO::File;
-$out_fh->open($out_file, "w") or die "Could not open $out_file\n$!";
-
-printf $out_fh "project_id\tsample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n";
-
-foreach my $family_id (sort keys %ped)
-{
-	my @peddy_glob = glob(sprintf("$out_dir/*_%s_%s_%s/%s_%s/qc/peddy/*.ped_check.csv", 
-		$version, $project_id, $family_id, $ped{$family_id}{'aff'}, $family_id));
-	next if (scalar(@peddy_glob) == 0);
-
-	my $peddy_fh = new IO::File;
-	$peddy_fh->open($peddy_glob[0], "r") or die "Could not open $peddy_glob[0]\n$!";
-
-	my @headers;
-	my %info;
-	my @sample_pairs;
-	while (my $line = <$peddy_fh>)
-	{
-		chomp $line;
-		if ($line =~ /^sample_a/)
-		{
-			@headers = split(/,/, $line);
-		}
-		else
-		{
-			my @data = split(/,/, $line);
-			push(@sample_pairs, sprintf("%s\t%s", $data[0], $data[1]));
-			for (my $i = 2; $i < scalar(@headers); $i++)
-			{
-				$info{$headers[$i]}{sprintf("%s\t%s", $data[0], $data[1])} = $data[$i];
-			}
-		}
-	}
-
-	$peddy_fh->close();
-
-	foreach my $sample_pair (@sample_pairs)
-	{
-		my ($sample_a, $sample_b) = split(/\t/, $sample_pair);
-
-		$sample_a =~ /(.+)_$family_id/;
-		my $sample_a_nofam = $1;
-		$sample_b =~ /(.+)_$family_id/;
-		my $sample_b_nofam = $1;
-
-		if ($ped{$family_id}{$sample_a_nofam}{'father'} eq $sample_b_nofam ||
-		    $ped{$family_id}{$sample_a_nofam}{'mother'} eq $sample_b_nofam ||
-		    $ped{$family_id}{$sample_b_nofam}{'father'} eq $sample_a_nofam ||
-		    $ped{$family_id}{$sample_b_nofam}{'mother'} eq $sample_a_nofam)
-		{
-			$info{'pedigree_parents'}{$sample_pair} = 'True';
-		}
-
-		$info{'parent_error'}{$sample_pair} = $info{'pedigree_parents'}{$sample_pair} eq $info{'predicted_parents'}{$sample_pair} ? 'False' : 'True';
-
-		printf $out_fh "$project_id\t$sample_pair\t%s\t%s\t%s\n", 
-		    $info{'pedigree_parents'}{$sample_pair}, 
-		    $info{'predicted_parents'}{$sample_pair},
-		    $info{'parent_error'}{$sample_pair};
-	}
-}
-
-$out_fh->close();
-
-
diff --git a/vcf_config.json.backup b/vcf_config.json.backup
index 59464bf2fe878e1df7d9f0d50bab569d787f737f..e36dc5e45814196cb7cedae06138b57043aba205 100644
--- a/vcf_config.json.backup
+++ b/vcf_config.json.backup
@@ -1,146 +1,12 @@
 {
   "collections": [
     {
-      "id": "1000genomes_phase3_GRCh37",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "strict_name_match": 1,
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/ALL.chr###CHR###.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "sample_prefix": "1000GENOMES:phase_3:"
-    },
-    {
-      "id": "1000genomes_phase3_GRCh38",
-      "species": "homo_sapiens",
-      "assembly": "GRCh38",
-      "type": "remote",
-      "strict_name_match": 1,
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/ALL.chr###CHR###_GRCh38.genotypes.20170504.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "sample_prefix": "1000GENOMES:phase_3:"
-    },
-    {
-      "id": "gnomADg_GRCh37",
-      "description": "Genome Aggregation Database genomes r2.1",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/gnomad/r2.1/genomes/gnomad.genomes.r2.1.sites.chr###CHR###_noVEP.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X"
-      ],
-      "population_prefix": "gnomADg:",
-      "population_display_group": {
-        "display_group_name": "gnomAD genomes",
-        "display_group_priority": 1.5
-      },
-      "populations": {
-        "9900000": {
-          "name": "gnomADg:ALL",
-          "_raw_name": "",
-          "description": "All gnomAD genomes individuals"
-        },
-        "9900001": {
-          "name": "afr",
-          "description": "African/African American"
-        },
-        "9900002": {
-          "name": "amr",
-          "description": "Latino"
-        },
-        "9900003": {
-          "name": "asj",
-          "description": "Ashkenazi Jewish"
-        },
-        "9900004": {
-          "name": "eas",
-          "description": "East Asian"
-        },
-        "9900005": {
-          "name": "fin",
-          "description": "Finnish"
-        },
-        "9900006": {
-          "name": "nfe",
-          "description": "Non-Finnish European"
-        },
-        "9900007": {
-          "name": "oth",
-          "description": "Other"
-        }
-      }
-    },
-    {
-      "id": "gnomADe_GRCh37",
-      "description": "Genome Aggregation Database exomes r2.1",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/gnomad/r2.1/exomes/gnomad.exomes.r2.1.sites.chr###CHR###_noVEP.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "population_prefix": "gnomADe:",
-      "population_display_group": {
-        "display_group_name": "gnomAD exomes",
-        "display_group_priority": 1.4
-      },
-      "populations": {
-        "9900010": {
-          "name": "gnomADe:ALL",
-          "_raw_name": "",
-          "description": "All gnomAD exomes individuals"
-        },
-        "9900011": {
-          "name": "afr",
-          "description": "African/African American"
-        },
-        "9900012": {
-          "name": "amr",
-          "description": "Latino"
-        },
-        "9900013": {
-          "name": "asj",
-          "description": "Ashkenazi Jewish"
-        },
-        "9900014": {
-          "name": "eas",
-          "description": "East Asian"
-        },
-        "9900015": {
-          "name": "fin",
-          "description": "Finnish"
-        },
-        "9900016": {
-          "name": "nfe",
-          "description": "Non-Finnish European"
-        },
-        "9900017": {
-          "name": "oth",
-          "description": "Other"
-        },
-        "9900018": {
-          "name": "sas",
-          "description": "South Asian"
-        }
-      }
-    },
-    {
-      "id": "gnomADg_r3.0_GRCh38",
-      "description": "Genome Aggregation Database genomes r3.0",
+      "id": "gnomADg_r3.1.1_GRCh38",
+      "description": "Genome Aggregation Database genomes r3.1.1",
       "species": "homo_sapiens",
       "assembly": "GRCh38",
       "type": "local",
-      "filename_template": "/home/u035/u035/shared/resources/gnomad/r3.0/genomes/gnomad.genomes.r3.0.sites.chr###CHR###_trimmed_info.vcf.bgz",
+      "filename_template": "/home/u035/u035/shared/resources/gnomad/r3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr###CHR###.vcf.bgz",
       "chromosomes": [
         "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
         "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
@@ -196,12 +62,12 @@
       }
     },
     {
-      "id": "gnomADe_GRCh38",
-      "description": "Genome Aggregation Database exomes r2.1",
+      "id": "gnomADe_r2.1.1_GRCh38",
+      "description": "Genome Aggregation Database exomes r2.1.1 liftover to GRCh38",
       "species": "homo_sapiens",
       "assembly": "GRCh38",
       "type": "local",
-      "filename_template": "/home/u035/u035/shared/resources/gnomad/r2.1/exomes/gnomad.exomes.r2.1.sites.grch38.chr###CHR###_noVEP.vcf.gz",
+      "filename_template": "/home/u035/u035/shared/resources/gnomad/r2.1.1/exomes/gnomad.exomes.r2.1.1.sites.###CHR###.liftover_grch38.vcf.bgz",
       "chromosomes": [
         "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
         "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
@@ -250,488 +116,6 @@
           "description": "South Asian"
         }
       }
-    },
-    {
-      "id": "topmed_GRCh37",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/TOPMED_GRCh37.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "source_name": "TOPMed",
-      "population_display_group": {
-        "display_group_name": "TOPMed",
-        "display_group_priority": 2.5
-      },
-      "populations": {
-        "9990000": {
-          "name": "TOPMed",
-          "_raw_name": "TOPMed",
-          "_af": "TOPMED",
-          "description": "Trans-Omics for Precision Medicine (TOPMed) Program"
-        }
-      }
-    },
-    {
-      "id": "topmed_GRCh38",
-      "species": "homo_sapiens",
-      "assembly": "GRCh38",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/TOPMED_GRCh38_20180418.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "source_name": "TOPMed",
-      "population_display_group": {
-        "display_group_name": "TOPMed",
-        "display_group_priority": 2.5
-      },
-      "populations": {
-        "9990000": {
-          "name": "TOPMed",
-          "_raw_name": "TOPMed",
-          "_af": "TOPMED",
-          "description": "Trans-Omics for Precision Medicine (TOPMed) Program"
-        }
-      }
-    },
-    {
-      "id": "uk10k_GRCh37",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/UK10K_COHORT.20160215.sites.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "source_name": "UK10K",
-      "population_prefix": "UK10K:",
-      "population_display_group": {
-        "display_group_name": "UK10K",
-        "display_group_priority": 2.6
-      },
-      "populations": {
-        "9999000": {
-          "name": "ALSPAC",
-          "_raw_name": "ALSPAC",
-          "description": "ALSPAC cohort"
-        },
-        "9999001": {
-          "name": "TWINSUK",
-          "_raw_name": "TWINSUK_NODUP",
-          "description": "TWINSUK cohort excluding 67 samples where a monozygotic or dyzygotic twin was included in the release"
-        }
-      }
-    },
-    {
-      "id": "uk10k_GRCh38",
-      "species": "homo_sapiens",
-      "assembly": "GRCh38",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/UK10K_COHORT.20160215.sites.GRCh38.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "source_name": "UK10K",
-      "population_prefix": "UK10K:",
-      "population_display_group": {
-        "display_group_name": "UK10K",
-        "display_group_priority": 2.6
-      },
-      "populations": {
-        "9999000": {
-          "name": "ALSPAC",
-          "_raw_name": "ALSPAC",
-          "description": "ALSPAC cohort"
-        },
-        "9999001": {
-          "name": "TWINSUK",
-          "_raw_name": "TWINSUK_NODUP",
-          "description": "TWINSUK cohort excluding 67 samples where a monozygotic or dyzygotic twin was included in the release"
-        }
-      }
-    },
-    { "id": "esp_GRCh37",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/ESP6500SI-V2-SSA137.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "source_name": "NHLBI Exome Sequencing Project",
-      "population_display_group": {
-        "display_group_name": "NHLBI Exome Sequencing Project",
-        "display_group_priority": 2.7
-      },
-      "population_prefix": "ESP6500:",
-      "populations": {
-        "9910000": {
-          "name": "AA",
-          "description": "African American",
-          "_ac": "AA_AC"
-        },
-        "9910001": {
-          "name": "EA",
-          "description": "European American",
-          "_ac": "EA_AC"
-        }
-      }
-    },
-    { "id": "esp_GRCh38",
-      "species": "homo_sapiens",
-      "assembly": "GRCh38",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/ESP6500SI-V2-SSA137_GRCh38.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
-      ],
-      "source_name": "NHLBI Exome Sequencing Project",
-      "population_display_group": {
-        "display_group_name": "NHLBI Exome Sequencing Project",
-        "display_group_priority": 2.7
-      },
-      "population_prefix": "ESP6500:",
-      "populations": {
-        "9910000": {
-          "name": "AA",
-          "description": "African American",
-          "_ac": "AA_AC"
-        },
-        "9910001": {
-          "name": "EA",
-          "description": "European American",
-          "_ac": "EA_AC"
-        }
-      }
-    },
-    {
-      "id": "nextgen_cow_irbt",
-      "species": "bos_taurus",
-      "assembly": "ARS-UCD1.2",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/bos_taurus/ARS-UCD1.2/variation_genotype/IRBT_ARS-UCD1_2.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "X"
-      ],
-      "population_prefix": "NextGen:"
-    },
-    {
-      "id": "nextgen_sheep_iroa",
-      "species": "ovis_aries",
-      "assembly": "Oar_v3.1",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/ovis_aries/Oar_v3.1/variation_genotype/IROA.population_sites.OARv3_1.20140307.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "X"
-      ],
-      "population_prefix": "NextGen:"
-    },
-    {
-      "id": "nextgen_sheep_mooa",
-      "species": "ovis_aries",
-      "assembly": "Oar_v3.1",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/ovis_aries/Oar_v3.1/variation_genotype/MOOA.population_sites.OARv3_1.20140328.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "X"
-      ],
-      "population_prefix": "NextGen:"
-    },
-    {
-      "id": "sheep_genome_consortium",
-      "species": "ovis_aries",
-      "assembly": "Oar_v3.1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/data_files/ovis_aries/Oar_v3.1/variation_genotype/###CHR###.filtered_intersect.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "X", "MT"
-      ],
-      "population_prefix": "ISGC:",
-      "sample_prefix": "ISGC:"
-    },
-    {
-      "id": "nextgen_goat",
-      "species": "capra_hircus",
-      "assembly": "ARS1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/data_files/capra_hircus/ARS1/variation_genotype/NextGenCapraHircusRemappedARS1.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29"
-      ],
-      "population_prefix": "NextGen:",
-      "sample_prefix": "NextGen:"
-    },
-    {
-      "id": "mouse_genome_project_snp",
-      "species": "mus_musculus",
-      "assembly": "GRCm38",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/mus_musculus/GRCm38/variation_genotype/mgp.v3.snps.sorted.rsIDdbSNPv137.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "X", "Y"
-      ],
-      "sample_prefix": "MGP:"
-    },
-    {
-      "id": "mouse_genome_project_indel",
-      "species": "mus_musculus",
-      "assembly": "GRCm38",
-      "type": "remote",
-      "filename_template": "ftp://ftp.ensembl.org/pub/data_files/mus_musculus/GRCm38/variation_genotype/mgp.v3.indels.sorted.rsIDdbSNPv137.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
-        "15", "16", "17", "18", "19", "X", "Y"
-      ],
-      "sample_prefix": "MGP:"
-    },
-    {
-      "id": "dog_EVA_PRJEB24066",
-      "species": "canis_familiaris",
-      "assembly": "CanFam3.1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ebi.ac.uk/pub/databases/eva/PRJEB24066/dogs.557.publicSamples.ann.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
-        "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "X", "MT"
-      ],
-      "sample_prefix": "PRJEB24066:"
-    },
-    {
-      "id": "horse_EVA_PRJEB9799",
-      "species": "equus_caballus",
-      "assembly": "EquCab3.0",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/data_files/equus_caballus/EquCab3.0/variation_genotype/PRJEB9799_EquCab3_0.vcf.gz",
-      "chromosomes": [
-        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
-        "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "X", "MT"
-      ],
-      "sample_prefix": "PRJEB9799:"
-    },
-    {
-      "id": "atlantic_salmon_EVA_PRJEB34225",
-      "source_name": "PRJEB34225",
-      "description": "Variants from EVA study PRJEB34225",
-      "species": "salmo_salar",
-      "assembly": "ICSASG_v2",
-      "type": "remote",
-      "use_as_source" : 1,
-      "use_seq_region_synonyms": 1,
-      "strict_name_match" : 0,
-      "filename_template" : "ftp://ftp.ebi.ac.uk/pub/databases/eva/PRJEB34225/Salmon_SNP_80_samples_freebayes_EBI_noINFO.vcf.gz",
-      "chromosomes": [
-        "ssa01", "ssa02", "ssa03", "ssa04", "ssa05", "ssa06", "ssa07", "ssa08", "ssa09", "ssa10", "ssa11", "ssa12", "ssa13", "ssa14", "ssa15", "ssa16", "ssa17", "ssa18", "ssa19", "ssa20", "ssa21", "ssa22", "ssa23", "ssa24", "ssa25", "ssa26", "ssa27", "ssa28", "ssa29"
-      ]
-    },
-    {
-      "id": "vervet_EVA_PRJEB22989",
-      "source_name": "PRJEB22989",
-      "species": "chlorocebus_sabaeus",
-      "assembly": "ChlSab1.1",
-      "type": "remote",
-      "use_as_source" : 1,
-      "strict_name_match" : 0,
-      "filename_template" : "ftp://ftp.ebi.ac.uk/pub/databases/eva/PRJEB22988/Svardal_et_al_2017_vervet_monkey_SNPs_imputed_phased.vcf.gz"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "bos_taurus",
-      "assembly": "ARS-UCD1.2",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.bos_taurus.ARS-UCD1.2.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "canis_familiaris",
-      "assembly": "CanFam3.1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.canis_familiaris.CanFam3.1.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "capra_hircus",
-      "assembly": "ARS1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.capra_hircus.ARS1.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "chlorocebus_sabaeus",
-      "assembly": "ChlSab1.1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.chlorocebus_sabaeus.ChlSab1.1.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "82_fish.gerp_conservation_score",
-      "species": "danio_rerio",
-      "assembly": "GRCz11",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/82_fish.gerp_conservation_score/gerp_conservation_scores.danio_rerio.GRCz11.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "equus_caballus",
-      "assembly": "EquCab3.0",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.equus_caballus.EquCab3.0.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "felis_catus",
-      "assembly": "Felis_catus_9.0",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/100_mammals.gerp_conservation_score/gerp_conservation_scores.felis_catus.Felis_catus_9.0.bw",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.felis_catus.Felis_catus_9.0.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "81_amniotes.gerp_conservation_score",
-      "species": "gallus_gallus",
-      "assembly": "GRCg6a",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/81_amniotes.gerp_conservation_score/gerp_conservation_scores.gallus_gallus.GRCg6a.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "homo_sapiens",
-      "assembly": "GRCh37",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/grch37/release-100/compara/conservation_scores/37_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh37.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "homo_sapiens",
-      "assembly": "GRCh38",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "macaca_mulatta",
-      "assembly": "Mmul_10",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.macaca_mulatta.Mmul_10.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "81_amniotes.gerp_conservation_score",
-      "species": "meleagris_gallopavo",
-      "assembly": "UMD2",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/81_amniotes.gerp_conservation_score/gerp_conservation_scores.meleagris_gallopavo.UMD2.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "mus_musculus",
-      "assembly": "GRCm38",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.mus_musculus.GRCm38.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "nomascus_leucogenys",
-      "assembly": "Nleu_3.0",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.nomascus_leucogenys.Nleu_3.0.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "ovis_aries",
-      "assembly": "Oar_v3.1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.ovis_aries.Oar_v3.1.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "pan_troglodytes",
-      "assembly": "Pan_tro_3.0",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.pan_troglodytes.Pan_tro_3.0.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "pongo_abelii",
-      "assembly": "PPYG2",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.pongo_abelii.PPYG2.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "rattus_norvegicus",
-      "assembly": "Rnor_6.0",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.rattus_norvegicus.Rnor_6.0.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "103_mammals.gerp_conservation_score",
-      "species": "sus_scrofa",
-      "assembly": "Sscrofa11.1",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.sus_scrofa.Sscrofa11.1.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "81_amniotes.gerp_conservation_score",
-      "species": "taeniopygia_guttata",
-      "assembly": "bTaeGut1_v1.p",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/81_amniotes.gerp_conservation_score/gerp_conservation_scores.taeniopygia_guttata.bTaeGut1_v1.p.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "82_fish.gerp_conservation_score",
-      "species": "tetraodon_nigroviridis",
-      "assembly": "TETRAODON8",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/82_fish.gerp_conservation_score/gerp_conservation_scores.tetraodon_nigroviridis.TETRAODON8.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "82_fish.gerp_conservation_score",
-      "species": "salmo_salar",
-      "assembly": "ICSASG_v2",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/release-100/compara/conservation_scores/82_fish.gerp_conservation_score/gerp_conservation_scores.salmo_salar.ICSASG_v2.bw",
-      "annotation_type": "gerp"
-    },
-    {
-      "id": "CADD_GRCh38_whole_genome_SNVs",
-      "species": "homo_sapiens",
-      "assembly": "GRCh38",
-      "type": "remote",
-      "filename_template" : "ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_annotation/CADD_GRCh38_whole_genome_SNVs.tsv.gz",
-      "annotation_type": "cadd"
     }
   ]
 }
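
Each collection above resolves its per-chromosome files by substituting entries from "chromosomes" into the ###CHR### placeholder of "filename_template"; for "local" collections the expanded paths must exist on disk alongside a tabix index. A quick sanity-check sketch using only the keys visible in this config (the config path is hypothetical, and the .tbi/.csi index requirement is an assumption based on the bgzipped .bgz paths above):

    #!/usr/bin/env perl
    use strict;
    use warnings;
    use JSON::PP qw(decode_json); # core module since Perl 5.14

    # Slurp and parse the collections config (path is an assumption).
    open(my $fh, '<', 'vcf_config.json') or die "vcf_config.json: $!";
    my $config = decode_json(do { local $/; <$fh> });
    close $fh;

    for my $coll (@{ $config->{collections} }) {
        next unless ($coll->{type} // '') eq 'local';
        for my $chr (@{ $coll->{chromosomes} // [] }) {
            # Expand the per-chromosome placeholder used throughout this file.
            (my $path = $coll->{filename_template}) =~ s/###CHR###/$chr/g;
            warn "$coll->{id}: missing $path\n" unless -e $path;
            warn "$coll->{id}: missing index for $path\n"
                unless -e "$path.tbi" or -e "$path.csi";
        }
    }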