Skip to content
Snippets Groups Projects
NHS_WES_extract_trio_FAM_PRO_ID.py 3.93 KiB
Newer Older
#	input:	the work folder which contains a PED subfolder where all family PED files were copied
#	output:	only for trio families
#		FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt
#
#       Author: MH
#       last modified: SEPT 25, 2019



import sys
import os



def go(work_dir):

    out_fam_file = work_dir + '/FAM_IDs.txt'
    out_pro_file = work_dir + '/PRO_IDs.txt'
    out_f_p_file = work_dir + '/FAM_PRO.txt'

    out_fam_han = open(out_fam_file,'w')
    out_pro_han = open(out_pro_file,'w')
    out_f_p_han = open(out_f_p_file,'w')

    cntr_fam = 0
    cntr_pro = 0
    cntr_f_p = 0




    # go over the PED folder in the working dir and process each PED file
    ped_dir = work_dir + '/PED'
    print ""
    print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)

    for file in os.listdir(ped_dir):					# per each PED file
        if file.endswith(".ped"):
#            print(os.path.join(ped_dir, file))
                        
            print "  %s" % (file) 
            in_file = os.path.join(ped_dir, file)  

            CHILD_ID = 0
            FAM_ID = 0            

            in_han = open(in_file,'r')
            for line in in_han:
                data = [x.strip() for x in line.strip().split('\t')]

                x_plate,x_fam = data[0].split('_')			# in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID
                y_indi,y_fam = data[1].split('_')			# in the internal PED file, indi_id is indiID_familyID, split


#                print "data[0]=%s,data[1]=%s,x_plate=%s,x_fam=%s,y_indi=%s,y_fam=%s" % (data[0],data[1],x_plate,x_fam,y_indi,y_fam)

                if FAM_ID == 0:
                    FAM_ID = x_fam
                elif FAM_ID != x_fam:
                    print "ERROR: more than one FAMILY_ID in %s" % (file)
                    raise SystemExit 
                     
                if data[2] != '0' and data[3] != '0':			# this is the child in the trio
                    if CHILD_ID == 0:
                        CHILD_ID = y_indi
                    else:						# seen another child
                        print "WARNING: already have seen a child (possibly a quad) in %s" % (file)
                        CHILD_ID = 0
                        break 

                    CHILD_SEX = int(data[4])
                    if (CHILD_SEX == 1) or (CHILD_SEX == 2):
                        pass
                    else:
                        print "ERROR: proband sex unknown"
                        print line
                        raise SystemExit

                    if int(data[5]) != 2:
                        print "ERROR: child in a trio not affected"
                        print line
                        raise SystemExit  

            if FAM_ID == 0:
                print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
                raise SystemExit
            if CHILD_ID == 0:
                print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file)  
            else:
                out_fam_han.write('%s\n' % (FAM_ID))
                cntr_fam += 1
                out_pro_han.write('%s\n' % (CHILD_ID))
                cntr_pro += 1
                out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID))
                cntr_f_p += 1 
                  
    out_fam_han.close()
    out_pro_han.close()
    out_f_p_han.close()



    print ""
    print "Trio Families:"
    print "   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
    print "   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
    print "   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
    print ""





if __name__ == '__main__':
    if len(sys.argv) == 2:
        go(sys.argv[1])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_extract_trio_FAM_PRO_ID.py /scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}"
        raise SystemExit