#!/usr/bin/env python3 ## parse_tab2n.py ss_randall-200_v_qfo_BL50.txt ss_randall-200_v_qfo_BP62.txt ss_randall-200_v_qfo_VT10.txt ... ## read a blast tabular output file with multiple query results ## and report the median percent identity and alignment length of the top hit for each query import sys import statistics ## print out command line if (len(sys.argv) < 5): print("# " + " ".join(sys.argv)) else: print("# " + " ".join(sys.argv[0:5])+' ...') ## blastp -outfmt 7/ ssearch -m8C fields field_names = ('qseqid','sseqid','percid','alen','mism','gaps','qstart','qend','sstart','send','evalue','bits') ## integer fields int_fields = ('alen','mism','gaps','qstart','qend','sstart','send') ## floating point fields flt_fields = ('percid','evalue','bits') ## this version of read_result_file() in parse_tab2.py reads multiple ## results from a file, and returns the first result for each query sequence def read_nresults_file (result_file): results = [] with open(result_file,'r') as rfd: new_result = False for line in rfd: if line.startswith('# Query:'): new_result = True continue elif line.startswith('#'): continue data = dict(zip(field_names,line.strip('\n').split('\t'))) for f in int_fields: data[f] = int(data[f]) for f in flt_fields: data[f] = float(data[f]) if (new_result): results.append(data) new_result=False return results mat_info = {} mat_names = [] for rfile in sys.argv[1:]: ## extract matrix name from 'ss_rand5-200_v_qfo_BL50.txt' prefix = rfile.split('.')[0] matrix = prefix.split('_')[-1] ## read dat mat_data = read_nresults_file(rfile) if (len(mat_data)>0): ## should have multiple results from multiple queries alen_list = [x['alen'] for x in mat_data] alen_med = statistics.median(alen_list) percid_list = [x['percid'] for x in mat_data] percid_med = statistics.median(percid_list) eval_list = [x['evalue'] for x in mat_data] eval_med = statistics.median(eval_list) mat_info[matrix] = {'alen':alen_med, 'percid':percid_med, 'nsamp':len(percid_list), 'eval':eval_med} mat_names.append(matrix) else: sys.stderr.write("no data from %s\n",rfile) s_mat_list = ['VT10','VT20','VT40','VT80','VT160','BP62','BL50'] bl_mat_list = ['PAM30','PAM70','BLOSUM80','BLOSUM62'] if (mat_names[0] in s_mat_list): disp_mat_list = s_mat_list else: disp_mat_list = bl_mat_list ## print out header for table print('\t'.join(['matrix','percid','alen','evalue','N'])) for m in disp_mat_list: if (m in mat_info): print('\t'.join([m,'{:.2f}'.format(mat_info[m]['percid']),'{}'.format(mat_info[m]['alen']),'{:.2g}'.format(mat_info[m]['eval']),'{}'.format(mat_info[m]['nsamp'])]))