# input: the work folder which contains a PED subfolder where all family PED files were copied # output: only for singletons # solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt # # Author: MH # last modified: MARCH 04, 2022 import sys import os def go(work_dir): out_fam_file = work_dir + '/solo_FAM_IDs.txt' out_pro_file = work_dir + '/solo_PRO_IDs.txt' out_f_p_file = work_dir + '/solo_FAM_PRO.txt' out_fam_han = open(out_fam_file,'w') out_pro_han = open(out_pro_file,'w') out_f_p_han = open(out_f_p_file,'w') cntr_fam = 0 cntr_pro = 0 cntr_f_p = 0 # go over the PED folder in the working dir and process each PED file ped_dir = work_dir + '/PED' print "" print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir) for file in os.listdir(ped_dir): # per each PED file if file.endswith(".ped"): print " %s" % (file) in_file = os.path.join(ped_dir, file) # check how many lines in the PED file - if more than one cannot be a singleton, ignore num_lines = 0 in_han = open(in_file,'r') for line in in_han: num_lines += 1 in_han.close() if num_lines > 1: continue if num_lines == 0: print "ERROR: empty PED file %s" % (file) raise SystemExit # if here, the PED file contains exactly one line # check if all fine: parents IDs = 0, kid with known sex and is affected CHILD_ID = 0 FAM_ID = 0 in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split if x_fam != y_fam: print "ERROR: FAMILY_ID mismatch in %s" % (file) print line raise SystemExit FAM_ID = x_fam CHILD_ID = y_indi # check both parent IDs == 0 if (data[2] != '0') or (data[3] != '0'): print "ERROR: found parent ID for a singleton child in %s" % (file) print line raise SystemExit # make sure the sex of the child is known CHILD_SEX = int(data[4]) if (CHILD_SEX == 1) or (CHILD_SEX == 2): pass else: print "ERROR: proband sex unknown in %s" % (file) print line raise SystemExit # check kid is affected if int(data[5]) != 2: print "ERROR: singleton child not affected" print line raise SystemExit if FAM_ID == 0: print "ERROR: Cannot find the FAMILY_ID in %s" % (file) raise SystemExit if CHILD_ID == 0: print "ERROR: Cannot find CHILD_ID in %s" % (file) raise SystemExit else: out_fam_han.write('%s\n' % (FAM_ID)) cntr_fam += 1 out_pro_han.write('%s\n' % (CHILD_ID)) cntr_pro += 1 out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID)) cntr_f_p += 1 out_fam_han.close() out_pro_han.close() out_f_p_han.close() print "" print "Singleton Families:" print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) print "" if __name__ == '__main__': if len(sys.argv) == 2: go(sys.argv[1]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" raise SystemExit