#	convert_DEC_to_v10.py

#	given a DECIPHER bulk upload file v9
#	convert it to v10
#
#       Author: MH
#       last modified: APR 25, 2022


import sys
import os
import csv
import xlsxwriter


# header row of the v10 'Sequence Variants' worksheet, in column order
_V10_HEADER = ('Patient internal reference number or ID','Shared','Assembly','HGVS code','Chromosome','Genomic start','Ref sequence','Alt sequence','Gene name','Transcript','Is intergenic','Genotype','Inheritance','Pathogenicity','Pathogenicity evidence','Contribution','Genotype groups')

# first cell of the v9 header line; rows starting with it are not variants
_V9_HEADER_FIRST_CELL = 'Internal reference number or ID'

# the highest v9 column index that is read is 19 (the genotype)
_V9_MIN_COLUMNS = 20

# v9 inheritance values that are written in a shortened form in v10;
# any other value is copied through unchanged
_V9_TO_V10_INHERITANCE = {
    'Maternally inherited, constitutive in mother': 'Maternally inherited',
    'Paternally inherited, constitutive in father': 'Paternally inherited',
}


def _v9_row_to_v10(row):
    """Return the v10 worksheet row (a 17-tuple) for one v9 CSV row.

    `row` is a list of strings as produced by csv.reader and must have at
    least 20 entries.  Values are copied from fixed v9 column positions;
    'Shared' and 'Assembly' are constants, and the HGVS, pathogenicity,
    evidence, contribution and genotype-groups columns are left empty.
    """
    inheritance = _V9_TO_V10_INHERITANCE.get(row[15], row[15])
    return (
        row[0],         # Patient internal reference number or ID
        'NHS-SCE',      # Shared
        'GRCh38',       # Assembly
        '',             # HGVS code
        row[1],         # Chromosome
        row[2],         # Genomic start
        row[4],         # Ref sequence
        row[5],         # Alt sequence
        row[7],         # Gene name
        row[6],         # Transcript
        row[8],         # Is intergenic
        row[19],        # Genotype
        inheritance,    # Inheritance
        '',             # Pathogenicity
        '',             # Pathogenicity evidence
        '',             # Contribution
        '',             # Genotype groups
    )


def go(inout_dir,id):
    """Convert one DECIPHER v9 bulk upload CSV into a v10 .xlsx workbook.

    Reads <inout_dir>/<id>_DEC_FLT.csv and writes
    <inout_dir>/<id>_DECIPHER_v10.xlsx, which holds a single
    'Sequence Variants' worksheet: the v10 header row followed by one row
    per input variant, in input order.

    inout_dir -- the folder where the bulk upload files are stored
    id        -- file name prefix, in the format <indiv_id>_<family_id>

    Raises IOError if the input file cannot be opened, and IndexError if a
    non-blank input row has fewer than 20 columns.
    """
    in_file = '%s/%s_DEC_FLT.csv' % (inout_dir,id)
    out_file = '%s/%s_DECIPHER_v10.xlsx' % (inout_dir,id)

    # Read and validate the whole input first, so that a missing or
    # malformed v9 file is reported before any workbook is created.
    variants = []
    # newline='' lets the csv module do its own line-ending handling, which
    # it needs for quoted fields that contain line breaks
    with open(in_file, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            if not row:                                 # csv.reader yields [] for a blank line
                continue
            if row[0] == _V9_HEADER_FIRST_CELL:         # ignore the header line
                continue
            if len(row) < _V9_MIN_COLUMNS:
                raise IndexError(
                    '%s line %d: expected at least %d columns, found %d'
                    % (in_file, reader.line_num, _V9_MIN_COLUMNS, len(row))
                )
            variants.append(_v9_row_to_v10(row))

    # The context manager closes the workbook on exit, including when a
    # write raises.
    with xlsxwriter.Workbook(out_file) as workbook:
        worksheet = workbook.add_worksheet('Sequence Variants')
        worksheet.write_row(0,0,_V10_HEADER)
        # worksheet row 0 is the header, so the variants start at row 1
        for row_idx, data in enumerate(variants, 1):
            worksheet.write_row(row_idx,0,data)


if __name__ == '__main__':
    # command line: convert_DEC_to_v10.py <decipher_dir> <indiv_id>_<family_id>
    if len(sys.argv) != 3:
        # sys.exit() with a string prints it to stderr and exits with status 1,
        # so a calling script can tell that nothing was converted
        sys.exit("Suggested use: time python convert_DEC_to_v10.py decipher_dir <indiv_id>_<family_id>")

    go(sys.argv[1],sys.argv[2])