# input: the work folder which contains a PED subfolder where all family PED files were copied # output: only for trio families # FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt # # Author: MH # last modified: SEPT 25, 2019 import sys import os def go(work_dir): out_fam_file = work_dir + '/FAM_IDs.txt' out_pro_file = work_dir + '/PRO_IDs.txt' out_f_p_file = work_dir + '/FAM_PRO.txt' out_fam_han = open(out_fam_file,'w') out_pro_han = open(out_pro_file,'w') out_f_p_han = open(out_f_p_file,'w') cntr_fam = 0 cntr_pro = 0 cntr_f_p = 0 # go over the PED folder in the working dir and process each PED file ped_dir = work_dir + '/PED' print "" print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir) for file in os.listdir(ped_dir): # per each PED file if file.endswith(".ped"): # print(os.path.join(ped_dir, file)) print " %s" % (file) in_file = os.path.join(ped_dir, file) CHILD_ID = 0 FAM_ID = 0 in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] x_plate,x_fam = data[0].split('_') # in the internal PED file, family_id is plateID_familyID, will keep only clean family_id, which corresponds to DECIPHER ID y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split # print "data[0]=%s,data[1]=%s,x_plate=%s,x_fam=%s,y_indi=%s,y_fam=%s" % (data[0],data[1],x_plate,x_fam,y_indi,y_fam) if FAM_ID == 0: FAM_ID = x_fam elif FAM_ID != x_fam: print "ERROR: more than one FAMILY_ID in %s" % (file) raise SystemExit if data[2] != '0' and data[3] != '0': # this is the child in the trio if CHILD_ID == 0: CHILD_ID = y_indi else: # seen another child print "WARNING: already have seen a child (possibly a quad) in %s" % (file) CHILD_ID = 0 break CHILD_SEX = int(data[4]) if (CHILD_SEX == 1) or (CHILD_SEX == 2): pass else: print "ERROR: proband sex unknown" print line raise SystemExit if int(data[5]) != 2: print "ERROR: child in a trio not affected" print line raise SystemExit if FAM_ID == 0: print "ERROR: Cannot find the FAMILY_ID in %s" % (file) raise SystemExit if CHILD_ID == 0: print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file) else: out_fam_han.write('%s\n' % (FAM_ID)) cntr_fam += 1 out_pro_han.write('%s\n' % (CHILD_ID)) cntr_pro += 1 out_f_p_han.write('%s\t%s\n' % (FAM_ID,CHILD_ID)) cntr_f_p += 1 out_fam_han.close() out_pro_han.close() out_f_p_han.close() print "" print "Trio Families:" print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) print "" if __name__ == '__main__': if len(sys.argv) == 2: go(sys.argv[1]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_extract_trio_FAM_PRO_ID.py /scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}" raise SystemExit