#!/usr/bin/env python ############################################################################################# # # # This program is free software; you can redistribute it and/or modify # # it under the terms of the GNU General Public License as published by # # the Free Software Foundation; either version 3 of the License, or # # (at your option) any later version. # # # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU General Public License for more details. # # # # You should have received a copy of the GNU General Public License # # along with this program; if not, see http://www.gnu.org/licenses # # or write to the Free Software Foundation, Inc., 51 Franklin Street, # # Fifth Floor, Boston, MA 02110-1301 USA # # # ############################################################################################# import bz2 import time import sys from splicesites.utils import create_dataset, create_modsel from esvm.utils import calcroc from esvm.experiment import crossvalidation from esvm.mldata import init_datasetfile from numpy.linalg import norm import numpy def write_results(f, results): """ Write out results """ f.write('Kernel\tParameters\tC\tauROC\n'); for i in xrange(len(results)): C=results[i][0] k_param=results[i][1] param_name=k_param[1]['name'] kernel=k_param[0] if kernel.endswith('2'): kernel=kernel[:-1] kernel_parameters= param_name + '=' + `k_param[1][param_name]` perf = 100*results[i][2] f.write('%s\t' % kernel) f.write('%s\t' % kernel_parameters) f.write('C=%2.2f\t' % C) f.write('%2.1f%%\n' % perf) def normalize(examples, subtract_mean=False, divide_std=False, rescale=False, norm_one=False): """ Scale GC data to ... (be on a ball? just const? 0 mean, std 1?) """ if subtract_mean: # mean = 0.0 mean=numpy.mean(examples, axis=1) for i in xrange(examples.shape[1]): examples[:,i]-=mean if divide_std: # std = 1.0 std=numpy.std(examples, axis=1) for i in xrange(examples.shape[1]): examples[:,i]/=(std+1e-10) if rescale: # scale to have on average 1 on linear kernel diagonal scale=numpy.sqrt(numpy.mean(numpy.diag(numpy.mat(examples).T*numpy.mat(examples)))) examples/=scale if norm_one: # ball/circle for i in xrange(examples.shape[1]): examples[:,i]/=norm(examples[:,i]) return examples ; def run_single_experiment(results, num_fold_cv, kernelname, kparam, C, examples, labels): """ Run a single experiment, i.e. for a fixed kernel and parameters do num_fold cross-validation """ param_name=kparam['name'] print 'Running C =', C, kernelname.title(), 'Kernel with', param_name, '=', kparam[param_name] (all_outputs, all_split) = crossvalidation(num_fold_cv, kernelname, kparam, C, examples, labels) results.append( (C, (kernelname, kparam), calcroc(all_outputs,labels)) ) def splice_example(Cs, gcfilename,seqfilename,seq2filename, plot=False): """ For the data files, apply the set of kernels """ # hyperparameters num_fold_cv = 5 # The area under the receiver operating characteristic results=[] # Read datasets # GC features fp = init_datasetfile(gcfilename,'vec') (gc_examples,gc_labels) = fp.readlines() gc_examples = normalize(gc_examples, subtract_mean=True) if plot: from pylab import scatter,show color=['b','r'] scatter(gc_examples[0,], gc_examples[1,], s=400*(gc_labels+2), c=''.join([ color[(int(i)+1)/2] for i in gc_labels]), alpha=0.1) show() # 2 sequence features fp = init_datasetfile(seq2filename,'mseq') (dna2_examples,dna2_labels) = fp.readlines() # DNA sequences fp = init_datasetfile(seqfilename,'seq') (dna_examples,dna_labels) = fp.readlines() #Define experiments to carry out experiments=( # Linear kernel on GC content ('linear', {'scale':1.0, 'name':'scale'}, (gc_examples, gc_labels)), # Polynomial kernel on GC content ( 'poly', {'degree':3, 'name':'degree', 'inhomogene':True, 'normal':True}, (gc_examples, gc_labels)), ( 'poly', {'degree':5, 'name':'degree', 'inhomogene':True, 'normal':True}, (gc_examples, gc_labels)), # Gaussian kernel on GC content ('gauss', {'width':100.0, 'name':'width'}, (gc_examples, gc_labels)), ('gauss', {'width':1.0, 'name':'width'}, (gc_examples, gc_labels)), ('gauss', {'width':0.01, 'name':'width'}, (gc_examples, gc_labels)), # Spectrum kernel on 2 dna sequences ('spec2', {'degree':1, 'name':'degree'}, (dna2_examples, dna2_labels)), ('spec2', {'degree':3, 'name':'degree'}, (dna2_examples, dna2_labels)), ('spec2', {'degree':5, 'name':'degree'}, (dna2_examples, dna2_labels)), # Cumulative Spectrum kernel on 2 dna sequences ('cumspec2', {'degree':1, 'name':'degree'}, (dna2_examples, dna2_labels)), ('cumspec2', {'degree':3, 'name':'degree'}, (dna2_examples, dna2_labels)), ('cumspec2', {'degree':5, 'name':'degree'}, (dna2_examples, dna2_labels)), # Weighted degree kernel on dna sequences ('wd', {'degree':1,'shift':0, 'name':'degree'}, (dna_examples, dna_labels)), ('wd', {'degree':3,'shift':0, 'name':'degree'}, (dna_examples, dna_labels)), ('wd', {'degree':5,'shift':0, 'name':'degree'}, (dna_examples, dna_labels)) ) if Cs is None: for C in (0.01, 0.1, 1, 2, 5, 10): for e in experiments: run_single_experiment(results, num_fold_cv, e[0], e[1], C, e[2][0], e[2][1]) else: for i in xrange(len(experiments)): e=experiments[i] run_single_experiment(results, num_fold_cv, e[0], e[1], Cs[i], e[2][0], e[2][1]) return results def get_best_results(results): methods=('linear', 'poly', 'gauss', 'spec2', 'cumspec2', 'wd') best_result=[] for m in methods: params=set() for r in results: if r[1][0]==m: params.add(tuple(zip(r[1][1].keys(),r[1][1].values()))) for p in params: m_result=0.0 m_best=None for r in results: if r[1][0]==m and r[1][1]==dict(p) and r[2]>m_result: m_result=r[2] m_best=r best_result.append(m_best) return best_result if __name__ == '__main__': if len(sys.argv) > 1: if sys.argv[1] == 'create_data': create_dataset() sys.exit(0) elif sys.argv[1] == 'mselect': results = splice_example(None, 'C_elegans_acc_modsel_gc.csv','C_elegans_acc_modsel_seq.csv','C_elegans_acc_modsel_seq2.csv') #results = splice_example(None, 'C_elegans_acc_gc.csv','C_elegans_acc_seq.csv','C_elegans_acc_seq2.csv') import pickle pickle.dump(results, file('mselect_result.pickle','w')) sys.exit(0) elif sys.argv[1] == 'get_best': import pickle results=pickle.load(file('mselect_result.pickle')) best_result=get_best_results(results) write_results(sys.stdout, best_result) #print 'Cs=[', #for e in best_result: # print e[0], ",", #print ']' sys.exit(0) else: print "unknown argument" sys.exit(1) # without any argument starttime = time.time() Cs = [ 5, 10, 10, \ 5, 0.01, 10, \ 10, 10, 0.01, \ 10, 10, 10, \ 1, 1, 2 ] # run the experiment results = splice_example(Cs, 'C_elegans_acc_gc.csv','C_elegans_acc_seq.csv','C_elegans_acc_seq2.csv', False) stoptime = time.time() elapsedtime = time.strftime('Elapsed time (HH.MM:SS): %H.%M:%S',time.gmtime(stoptime-starttime)) print elapsedtime write_results(file('results.txt','w'), results) for curline in file('results.txt').readlines(): print curline.strip() if __name__ == '__main__': if len(sys.argv) > 1: if sys.argv[1] == 'create_data': create_dataset() elif sys.argv[1] == 'create_modsel': create_modsel() sys.exit() #main() print 'results in results.txt'