Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
trio-whole-exome
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
igmmbioinformatics
trio-whole-exome
Commits
3d2e76b8
Commit
3d2e76b8
authored
4 years ago
by
ameyner2
Browse files
Options
Downloads
Patches
Plain Diff
Mike updates
parent
9e11a751
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
compare_indi_vars_by_version.py
+149
-0
149 additions, 0 deletions
compare_indi_vars_by_version.py
gather_NHS_WES_trio_results.sh
+2
-1
2 additions, 1 deletion
gather_NHS_WES_trio_results.sh
process_NHS_WES_trio.sh
+11
-11
11 additions, 11 deletions
process_NHS_WES_trio.sh
with
162 additions
and
12 deletions
compare_indi_vars_by_version.py
0 → 100644
+
149
−
0
View file @
3d2e76b8
#
# Author: MH
# last modified: SEPT 25, 2020
import
sys
import
os
import
fnmatch
from
collections
import
defaultdict
V1_VARIANTS
=
defaultdict
(
list
)
# key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
V2_VARIANTS
=
defaultdict
(
list
)
# key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
def
go
(
v1_dir
,
v2_dir
):
v1_files
=
[]
for
root
,
dirnames
,
filenames
in
os
.
walk
(
v1_dir
):
for
filename
in
fnmatch
.
filter
(
filenames
,
'
*_DEC_FLT.csv
'
):
v1_files
.
append
(
os
.
path
.
join
(
root
,
filename
))
print
"
Found %s *_DEC_FLT.csv files in %s
"
%
(
len
(
v1_files
),
v1_dir
)
for
v1_file
in
v1_files
:
parsed
=
[
x
.
strip
()
for
x
in
v1_file
.
strip
().
split
(
'
/
'
)]
dname
=
parsed
[
8
]
fname
=
parsed
[
9
]
fname_parsed
=
[
y
.
strip
()
for
y
in
fname
.
strip
().
split
(
'
_
'
)]
indi_id
=
fname_parsed
[
0
]
fam_id
=
fname_parsed
[
1
]
key
=
"
%s_%s
"
%
(
indi_id
,
fam_id
)
V1_VARIANTS
[
key
]
=
[]
# print "Processing V1: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
# now open the file and read the variants
in_han
=
open
(
v1_file
,
'
r
'
)
for
line
in
in_han
:
if
line
.
startswith
(
"
Internal reference number or ID
"
):
continue
data
=
[
z
.
strip
()
for
z
in
line
.
strip
().
split
(
'
,
'
)]
internal_id
=
data
[
0
]
chr
=
data
[
1
]
pos
=
data
[
2
]
ref
=
data
[
4
]
alt
=
data
[
5
]
value
=
'
%s:%s:%s:%s:%s
'
%
(
internal_id
,
chr
,
pos
,
ref
,
alt
)
# print value
V1_VARIANTS
[
key
].
append
(
value
)
# print " Found %s variants for %s" % (len(V1_VARIANTS[key]),key)
# now process V2
v2_files
=
[]
for
root
,
dirnames
,
filenames
in
os
.
walk
(
v2_dir
):
for
filename
in
fnmatch
.
filter
(
filenames
,
'
*_DEC_FLT.csv
'
):
v2_files
.
append
(
os
.
path
.
join
(
root
,
filename
))
print
"
Found %s *_DEC_FLT.csv files in %s
"
%
(
len
(
v2_files
),
v2_dir
)
for
v2_file
in
v2_files
:
parsed
=
[
x
.
strip
()
for
x
in
v2_file
.
strip
().
split
(
'
/
'
)]
dname
=
parsed
[
8
]
fname
=
parsed
[
9
]
fname_parsed
=
[
y
.
strip
()
for
y
in
fname
.
strip
().
split
(
'
_
'
)]
indi_id
=
fname_parsed
[
0
]
fam_id
=
fname_parsed
[
1
]
key
=
"
%s_%s
"
%
(
indi_id
,
fam_id
)
V2_VARIANTS
[
key
]
=
[]
# print "Processing V2: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
# now open the file and read the variants
in_han
=
open
(
v2_file
,
'
r
'
)
for
line
in
in_han
:
if
line
.
startswith
(
"
Internal reference number or ID
"
):
continue
data
=
[
z
.
strip
()
for
z
in
line
.
strip
().
split
(
'
,
'
)]
internal_id
=
data
[
0
]
chr
=
data
[
1
]
pos
=
data
[
2
]
ref
=
data
[
4
]
alt
=
data
[
5
]
value
=
'
%s:%s:%s:%s:%s
'
%
(
internal_id
,
chr
,
pos
,
ref
,
alt
)
# print value
V2_VARIANTS
[
key
].
append
(
value
)
# print " Found %s variants for %s" % (len(V2_VARIANTS[key]),key)
# check that in both folders we have the same individuals (i.e. keys)
v1_keys
=
V1_VARIANTS
.
keys
()
v2_keys
=
V2_VARIANTS
.
keys
()
for
v1_key
in
v1_keys
:
if
v1_key
not
in
v2_keys
:
print
"
v1_key = %s not found in v2_keys
"
%
(
v1_key
)
raise
SystemExit
for
v2_key
in
v2_keys
:
if
v2_key
not
in
v1_keys
:
print
"
v2_key = %s not found in v1_keys
"
%
(
v2_key
)
raise
SystemExit
print
"
Match individuals between the two vesrions: OK
"
sys
.
stdout
.
flush
()
# now compare the variants per individual
for
key
in
v1_keys
:
print
""
print
"
Processing individual = %s
"
%
(
key
)
v1_variants
=
V1_VARIANTS
[
key
]
v2_variants
=
V2_VARIANTS
[
key
]
for
v1_var
in
v1_variants
:
if
v1_var
in
v2_variants
:
print
"
%s: %s found in both versions
"
%
(
key
,
v1_var
)
v2_variants
.
remove
(
v1_var
)
else
:
print
"
<<<< %s: %s found only in V1
"
%
(
key
,
v1_var
)
# in v2_variants, remaining ar only those for which no match in V1 was found
for
v2_var
in
v2_variants
:
print
"
>>>> %s: %s found only in V2
"
%
(
key
,
v2_var
)
print
"
-----------------------------
"
if
__name__
==
'
__main__
'
:
if
len
(
sys
.
argv
)
==
3
:
go
(
sys
.
argv
[
1
],
sys
.
argv
[
2
])
else
:
print
(
"
Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2
"
)
raise
SystemExit
This diff is collapsed.
Click to expand it.
gather_NHS_WES_trio_results.sh
+
2
−
1
View file @
3d2e76b8
...
@@ -10,7 +10,7 @@
...
@@ -10,7 +10,7 @@
### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh ###
### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh ###
BASE
=
/scratch/u035/project/trio_whole_exome/analysis
BASE
=
/scratch/u035/project/trio_whole_exome/analysis
WORK_DIR
=
${
BASE
}
/
${
PROJECT_ID
}
WORK_DIR
=
${
BASE
}
/
${
PROJECT_ID
}
NHS_DIR
=
${
WORK_DIR
}
/
${
PLATE_ID
}
_results
NHS_DIR
=
${
WORK_DIR
}
/
${
PLATE_ID
}
_
${
VERSION_N
}
_
results
# other files to be used
# other files to be used
...
@@ -20,6 +20,7 @@ CHILD_IDS=${WORK_DIR}/PRO_IDs.txt
...
@@ -20,6 +20,7 @@ CHILD_IDS=${WORK_DIR}/PRO_IDs.txt
echo
"PLATE_ID =
${
PLATE_ID
}
"
# the PCR plate ID of the batch being currently processed, e.g. 16862
echo
"PLATE_ID =
${
PLATE_ID
}
"
# the PCR plate ID of the batch being currently processed, e.g. 16862
echo
"PROJECT_ID =
${
PROJECT_ID
}
"
# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo
"PROJECT_ID =
${
PROJECT_ID
}
"
# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo
"VERSION_N =
${
VERSION_N
}
"
# the version of the alignment and genotyping analysis
# check if ${NHS_DIR} already exists - if not, exit and ask to be created
# check if ${NHS_DIR} already exists - if not, exit and ask to be created
...
...
This diff is collapsed.
Click to expand it.
process_NHS_WES_trio.sh
+
11
−
11
View file @
3d2e76b8
#!/bin/bash
#!/bin/bash
#PBS -l walltime=24:00:00
#PBS -l walltime=24:00:00
#PBS -l ncpus=1,mem=16gb
#PBS -l ncpus=1,mem=16gb
#PBS -q
uv2000
#PBS -q
sgp
#PBS -N process_trio
#PBS -N process_trio
#PBS -j oe
#PBS -j oe
...
@@ -57,7 +57,7 @@ echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM fil
...
@@ -57,7 +57,7 @@ echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM fil
echo
"BATCH_ID =
${
BATCH_ID
}
"
# the ID of the batch being processed e.g. 11870_Germain_Lorna
echo
"BATCH_ID =
${
BATCH_ID
}
"
# the ID of the batch being processed e.g. 11870_Germain_Lorna
echo
"PLATE_ID =
${
PLATE_ID
}
"
# the PCR plate ID of the batch being currently processed, e.g. 16862
echo
"PLATE_ID =
${
PLATE_ID
}
"
# the PCR plate ID of the batch being currently processed, e.g. 16862
echo
"PROJECT_ID =
${
PROJECT_ID
}
"
# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo
"PROJECT_ID =
${
PROJECT_ID
}
"
# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo
"VERSION_N =
${
VERSION_N
}
"
# the version of the alignment and genotyping analysis
...
@@ -266,10 +266,10 @@ echo ""
...
@@ -266,10 +266,10 @@ echo ""
#################################################################################################################################################
#################################################################################################################################################
#############
### run coverage for each proband (DD genes) ###
### run coverage for each proband (DD genes)
###
### format: ${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam ###
### format: ${SOURCE_DIR}/????-??-??_${
VERSION_N}_${
BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam ###
#################################################################################################################################################
#################################################################################################################################################
#############
#################################
#################################
...
@@ -282,7 +282,7 @@ echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} .
...
@@ -282,7 +282,7 @@ echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} .
# make sure we are reading the data from the exact batch & plate ID
# make sure we are reading the data from the exact batch & plate ID
BAM_FILE
=
${
SOURCE_DIR
}
/????-??-??_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
PROBAND_ID
}
_
${
FAMILY_ID
}
/
${
PROBAND_ID
}
_
${
FAMILY_ID
}
-ready
.bam
BAM_FILE
=
${
SOURCE_DIR
}
/????-??-??_
${
VERSION_N
}
_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
PROBAND_ID
}
_
${
FAMILY_ID
}
/
${
PROBAND_ID
}
_
${
FAMILY_ID
}
-ready
.bam
OUT_FILE
=
${
COV_DIR
}
/
${
PROBAND_ID
}
_
${
FAMILY_ID
}
.DD15
OUT_FILE
=
${
COV_DIR
}
/
${
PROBAND_ID
}
_
${
FAMILY_ID
}
.DD15
...
@@ -353,7 +353,7 @@ DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt
...
@@ -353,7 +353,7 @@ DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt
IN_G2P_FILE
=
${
G2P_DIR
}
/
${
PLATE_ID
}
_
${
FAMILY_ID
}
_LOG_DIR/
${
PLATE_ID
}
_
${
FAMILY_ID
}
.report.txt
IN_G2P_FILE
=
${
G2P_DIR
}
/
${
PLATE_ID
}
_
${
FAMILY_ID
}
_LOG_DIR/
${
PLATE_ID
}
_
${
FAMILY_ID
}
.report.txt
IN_VASE_FILE
=
${
VASE_DIR
}
/
${
PLATE_ID
}
_
${
FAMILY_ID
}
.ready.denovo.vcf
IN_VASE_FILE
=
${
VASE_DIR
}
/
${
PLATE_ID
}
_
${
FAMILY_ID
}
.ready.denovo.vcf
FAM_IGV_DIR
=
${
IGV_DIR
}
/
${
PLATE_ID
}
_
${
FAMILY_ID
}
FAM_IGV_DIR
=
${
IGV_DIR
}
/
${
PLATE_ID
}
_
${
FAMILY_ID
}
FAM_BAM_DIR
=
${
SOURCE_DIR
}
/????-??-??_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
FAM_BAM_DIR
=
${
SOURCE_DIR
}
/????-??-??_
${
VERSION_N
}
_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
## call the python scrpit
## call the python scrpit
...
@@ -432,9 +432,9 @@ done
...
@@ -432,9 +432,9 @@ done
echo
"...kid_id =
${
kid_id
}
, par_1_id =
${
par_1_id
}
, par_2_id =
${
par_2_id
}
"
echo
"...kid_id =
${
kid_id
}
, par_1_id =
${
par_1_id
}
, par_2_id =
${
par_2_id
}
"
# gather the trio BAM files
# gather the trio BAM files
kid_bam
=
${
SOURCE_DIR
}
/????-??-??_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
kid_id
}
/
${
kid_id
}
-ready
.bam
kid_bam
=
${
SOURCE_DIR
}
/????-??-??_
${
VERSION_N
}
_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
kid_id
}
/
${
kid_id
}
-ready
.bam
par_1_bam
=
${
SOURCE_DIR
}
/????-??-??_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
par_1_id
}
/
${
par_1_id
}
-ready
.bam
par_1_bam
=
${
SOURCE_DIR
}
/????-??-??_
${
VERSION_N
}
_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
par_1_id
}
/
${
par_1_id
}
-ready
.bam
par_2_bam
=
${
SOURCE_DIR
}
/????-??-??_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
par_2_id
}
/
${
par_2_id
}
-ready
.bam
par_2_bam
=
${
SOURCE_DIR
}
/????-??-??_
${
VERSION_N
}
_
${
BATCH_ID
}
_
${
PLATE_ID
}
_
${
FAMILY_ID
}
/
${
par_2_id
}
/
${
par_2_id
}
-ready
.bam
echo
"...kid_bam =
${
kid_bam
}
..."
echo
"...kid_bam =
${
kid_bam
}
..."
echo
"...par_1_bam =
${
par_1_bam
}
..."
echo
"...par_1_bam =
${
par_1_bam
}
..."
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment