Skip to content
Snippets Groups Projects
extract_solo_FAM_PRO_ID.py 4.17 KiB
Newer Older
#	input:	the work folder which contains a PED subfolder where all family PED files were copied
#	output:	only for singletons
#		solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt
#
#       Author: MH
#       last modified: MARCH 04, 2022



import sys
import os



def go(work_dir):

    out_fam_file = work_dir + '/solo_FAM_IDs.txt'
    out_pro_file = work_dir + '/solo_PRO_IDs.txt'
    out_f_p_file = work_dir + '/solo_FAM_PRO.txt'

    out_fam_han = open(out_fam_file,'w')
    out_pro_han = open(out_pro_file,'w')
    out_f_p_han = open(out_f_p_file,'w')

    cntr_fam = 0
    cntr_pro = 0
    cntr_f_p = 0


    # go over the PED folder in the working dir and process each PED file
    ped_dir = work_dir + '/PED'
    print ""
    print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)

    for file in os.listdir(ped_dir):					# per each PED file
        if file.endswith(".ped"):

            print "  %s" % (file)
            in_file = os.path.join(ped_dir, file)

            # check how many lines in the PED file - if more than one cannot be a singleton, ignore
            num_lines = 0
            in_han = open(in_file,'r')
            for line in in_han:
                num_lines += 1
            in_han.close()

            if num_lines > 1:
                continue
            if num_lines == 0:
                print "ERROR: empty PED file %s" % (file)
                raise SystemExit

            # if here, the PED file contains exactly one line
            # check if all fine: parents IDs = 0, kid with known sex and is affected
            CHILD_ID = 0
            FAM_ID = 0

            in_han = open(in_file,'r')
            for line in in_han:
                data = [x.strip() for x in line.strip().split('\t')]

                x_plate,x_fam = data[0].split('_')			# in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID
                y_indi,y_fam = data[1].split('_')			# in the internal PED file, indi_id is indiID_familyID, split

                if x_fam != y_fam:
                    print "ERROR: FAMILY_ID mismatch in %s" % (file)
                    print line
                    raise SystemExit

                FAM_ID = x_fam
                CHILD_ID = y_indi

                # check both parent IDs == 0
                if (data[2] != '0') or (data[3] != '0'):
                    print "ERROR: found parent ID for a singleton child in %s" % (file)
                    print line
                    raise SystemExit

                # make sure the sex of the child is known
                CHILD_SEX = int(data[4])
                if (CHILD_SEX == 1) or (CHILD_SEX == 2):
                    pass
                else:
                    print "ERROR: proband sex unknown in %s" % (file)
                    print line
                    raise SystemExit

                # check kid is affected
                if int(data[5]) != 2:
                    print "ERROR: singleton child not affected"
                    print line
                    raise SystemExit

            if FAM_ID == 0:
                print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
                raise SystemExit
            if CHILD_ID == 0:
                print "ERROR: Cannot find CHILD_ID in %s" % (file)
                raise SystemExit
            else:
                out_fam_han.write('%s\n' % (FAM_ID))
                cntr_fam += 1
                out_pro_han.write('%s\n' % (CHILD_ID))
                cntr_pro += 1
                out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID))
                cntr_f_p += 1

    out_fam_han.close()
    out_pro_han.close()
    out_f_p_han.close()



    print ""
    print "Singleton Families:"
    print "   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
    print "   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
    print "   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
    print ""





if __name__ == '__main__':
    if len(sys.argv) == 2:
        go(sys.argv[1])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>"
        raise SystemExit