ToPS
|
00001 /* 00002 * TrainSimilarityBasedSequenceWeighting.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ProbabilisticModel.hpp" 00026 #include "SimilarityBasedSequenceWeighting.hpp" 00027 #include "TrainSimilarityBasedSequenceWeighting.hpp" 00028 #include "ConfigurationReader.hpp" 00029 #include "ContextTree.hpp" 00030 #include "VariableLengthMarkovChain.hpp" 00031 #include "Symbol.hpp" 00032 #include "SequenceFactory.hpp" 00033 #include "util.hpp" 00034 namespace tops { 00035 00036 double test (ProbabilisticModelPtr m, std::string q, int max) 00037 { 00038 if(q.size() >= max) 00039 { 00040 SequenceFactory f(m->alphabet()); 00041 std::stringstream s; 00042 for (int i = 0; i < q.size(); i++) 00043 { 00044 if(i == 0) 00045 s << q[i]; 00046 else 00047 s << " " << q[i]; 00048 } 00049 Sequence x = f.createSequence(s.str()); 00050 double p = exp(m->evaluate(x, 0, x.size()-1)); 00051 std::cerr << q << " " << p << std::endl; 00052 return p; 00053 } 00054 else 00055 { 00056 double sum = 0.0; 00057 for(int i = 0 ; i < m->alphabet()->size(); i++) 00058 { 00059 std::stringstream s; 00060 s <<q<< m->alphabet()->getSymbol(i)->name() ; 00061 sum += test(m, s.str(), max); 00062 } 00063 return sum; 00064 } 00065 } 00066 00067 double calculate_normalizer (int skip_length,int skip_offset, int max_length, std::map<std::string, double > & counter, int alphabet_size) 00068 { 00069 int npatterns_differ_1 = 0; 00070 npatterns_differ_1 = (alphabet_size - 1) * (max_length - skip_length); 00071 if(skip_length < 0) 00072 npatterns_differ_1 = (alphabet_size - 1) * (max_length ); 00073 double sum = 0.0; 00074 std::map<std::string, double>::const_iterator it; 00075 std::map<std::string, double>::const_iterator it2; 00076 for(it = counter.begin(); it != counter.end(); it++) 00077 { 00078 sum += it->second; 00079 00080 int diff = 0; 00081 int np_differ_1 = 0; 00082 00083 for(it2 = counter.begin(); it2 != counter.end(); it2++) 00084 { 00085 std::string a = it->first; 00086 std::string b = it2->first; 00087 for(int i = 0; i < max_length; i++) 00088 if((i >= skip_offset) && (i <= skip_offset+skip_length)){ 00089 if(a[i] != b[i]) 00090 diff+=2; 00091 }else if(a[i] != b[i]) 00092 diff++; 00093 00094 if(diff == 1) { 00095 //sum += it2->second * 0.001; 00096 np_differ_1 ++; 00097 } 00098 } 00099 sum += 0.001*it->second*(npatterns_differ_1 - np_differ_1); 00100 } 00101 return sum; 00102 } 00103 00104 00105 00106 00107 00108 00109 ProbabilisticModelPtr TrainSimilarityBasedSequenceWeighting::create( 00110 ProbabilisticModelParameters & parameters, double & loglikelihood, 00111 int & sample_size) const { 00112 ProbabilisticModelParameterValuePtr trainpar = 00113 parameters.getMandatoryParameterValue("training_set"); 00114 ProbabilisticModelParameterValuePtr alphapar = 00115 parameters.getMandatoryParameterValue("alphabet"); 00116 00117 ProbabilisticModelParameterValuePtr offsetpar = parameters.getOptionalParameterValue("skip_offset"); 00118 ProbabilisticModelParameterValuePtr skiplengthpar = parameters.getOptionalParameterValue("skip_length"); 00119 ProbabilisticModelParameterValuePtr skipseqpar = parameters.getOptionalParameterValue("skip_sequence"); 00120 00121 if ((trainpar == NULL) || (alphapar == NULL) ) { 00122 std::cerr << help() << std::endl; 00123 exit(-1); 00124 } 00125 int skip_length = -1; 00126 int skip_offset = -1; 00127 std::string skip_seq; 00128 if((offsetpar != NULL) && (skiplengthpar != NULL)) 00129 { 00130 skip_offset = offsetpar->getInt(); 00131 skip_length = skiplengthpar->getInt(); 00132 skip_seq = skiplengthpar -> getString(); 00133 00134 } 00135 00136 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00137 alphabet ->initializeFromVector(alphapar->getStringVector()); 00138 SequenceEntryList sample_set; 00139 readSequencesFromFile(sample_set, alphabet, trainpar->getString()); 00140 std::map<std::string, double> counter; 00141 int min_length = 999999999; 00142 for(int i = 0; i < (int) sample_set.size(); i++) 00143 { 00144 std::stringstream o; 00145 for(int j = 0; j < (int)(sample_set[i]->getSequence()).size();j++) 00146 { 00147 o << alphabet->getSymbol((sample_set[i]->getSequence())[j])->name(); 00148 } 00149 if(counter.find(o.str()) == counter.end()) 00150 { 00151 counter[o.str()] = 1; 00152 } 00153 else 00154 { 00155 counter[o.str()] += 1; 00156 } 00157 if((int)o.str().size() < min_length) 00158 min_length = o.str().size(); 00159 } 00160 std::string q; 00161 double normalize = calculate_normalizer(skip_length,skip_offset, min_length, counter, alphabet->size()); 00162 00163 ProbabilisticModelParameters pars; 00164 pars.add("alphabet", alphapar); 00165 pars.add("counter", DoubleMapParameterValuePtr(new DoubleMapParameterValue(counter))); 00166 pars.add("normalizer", DoubleParameterValuePtr(new DoubleParameterValue(normalize))); 00167 if(skiplengthpar != NULL && offsetpar != NULL) { 00168 pars.add("skip_length", skiplengthpar); 00169 pars.add("skip_offset", offsetpar); 00170 pars.add("skip_sequence", skipseqpar); 00171 } 00172 ProbabilisticModelPtr m = SimilarityBasedSequenceWeightingPtr(new SimilarityBasedSequenceWeighting()); 00173 m->initialize(pars); 00174 00175 // std::string x; 00176 // std::cerr << "SUM: " << test(m, x, min_length)<< std::endl; 00177 00178 loglikelihood = 0.0; 00179 sample_size = 0; 00180 for (int i = 0; i < (int) sample_set.size(); i++) { 00181 loglikelihood 00182 += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1); 00183 sample_size += (sample_set[i]->getSequence()).size(); 00184 } 00185 return m; 00186 } 00187 ProbabilisticModelPtr TrainSimilarityBasedSequenceWeighting::create( 00188 ProbabilisticModelParameters & parameters) const { 00189 double loglike; 00190 int samplesize; 00191 return create(parameters, loglike, samplesize); 00192 00193 } 00194 00195 } 00196 ;