ToPS
|
00001 /* 00002 * SimilarityBasedSequenceWeighting.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "SimilarityBasedSequenceWeighting.hpp" 00026 //#include "SimilarityBasedSequenceWeightingCreator.hpp" 00027 #include "TrainDiscreteIIDModel.hpp" 00028 #include "Symbol.hpp" 00029 #include <iostream> 00030 #include <cmath> 00031 #include <sstream> 00032 #include <vector> 00033 #include <iterator> 00034 00035 namespace tops { 00036 00037 00038 00039 double SimilarityBasedSequenceWeighting::prefix_sum_array_compute(int begin, int end, int phase) { 00040 if ((begin < (int) _scores.size()) && (begin >= 0)){ 00041 return _scores[begin]; 00042 } 00043 return -HUGE; 00044 } 00045 double SimilarityBasedSequenceWeighting::prefix_sum_array_compute(int begin, int end) 00046 { 00047 return prefix_sum_array_compute(begin, end, 0); 00048 } 00049 00050 bool SimilarityBasedSequenceWeighting::initialize_prefix_sum_array(const Sequence & s, int phase) 00051 { 00052 if(ProbabilisticModel::initialize_prefix_sum_array(s)) 00053 return true; 00054 _scores.resize(s.size()); 00055 for (int i = 0; i < (int) s.size(); i++) { 00056 _scores[i] = evaluate(s, i, s.size() - 1); 00057 } 00058 return true; 00059 } 00060 00061 bool SimilarityBasedSequenceWeighting::initialize_prefix_sum_array(const Sequence & s) 00062 { 00063 return initialize_prefix_sum_array(s, 0); 00064 } 00065 00066 00067 00068 00069 std::string SimilarityBasedSequenceWeighting::str() const 00070 { 00071 std::stringstream out; 00072 00073 DoubleMapParameterValuePtr p = DoubleMapParameterValuePtr(new DoubleMapParameterValue(_counter)) ; 00074 DoubleParameterValuePtr x = DoubleParameterValuePtr(new DoubleParameterValue(_normalizer)); 00075 IntParameterValuePtr so = IntParameterValuePtr(new IntParameterValue(_skip_offset)); 00076 IntParameterValuePtr sl = IntParameterValuePtr(new IntParameterValue(_skip_length)); 00077 StringParameterValuePtr ss = StringParameterValuePtr(new StringParameterValue(_skip_sequence)); 00078 out << "model_name=\"SimilarityBasedSequenceWeighting\""<< std::endl; 00079 out << ProbabilisticModel::alphabet()->str() << std::endl; 00080 out << "counter = " << p->str() << std::endl; 00081 out << "normalizer = " << x->str() << std::endl; 00082 if(_skip_offset >= 0) 00083 { 00084 out << "skip_offset = " << so->str() << std::endl; 00085 out << "skip_length = " << sl->str() << std::endl; 00086 out << "skip_sequence = " << ss->str() << std::endl; 00087 } 00088 00089 return out.str(); 00090 } 00091 00093 double SimilarityBasedSequenceWeighting::evaluate(const Sequence & s, unsigned int begin, unsigned int end) const { 00094 if (end >= s.size()) 00095 return -HUGE; 00096 if(begin < 0) 00097 return -HUGE; 00098 int length = (_counter.begin()->first).size(); 00099 std::stringstream qstream; 00100 for(int i = begin ; (i <= (int)end) && (i < (int) (begin + length)) ; i++){ 00101 qstream << alphabet()->getSymbol(s[i])->name(); 00102 } 00103 std::string q = qstream.str(); 00104 int psize = 0; 00105 std::map<std::string, double>::const_iterator it; 00106 double sum = 0; 00107 for(it = _counter.begin(); it != _counter.end(); it++) 00108 { 00109 std::string q2 = it->first; 00110 psize = q2.size(); 00111 int diff = 0; 00112 if(q.size () != q2.size()) 00113 return -HUGE; 00114 bool valid = true; 00115 for(int i = 0; i < (int)q2.size(); i++) 00116 { 00117 if((i >= _skip_offset) && (i < _skip_offset+_skip_length)){ 00118 if(q[i] != _skip_sequence[i-_skip_offset]){ 00119 valid = false; 00120 break; 00121 } 00122 }else if(q[i] != q2[i]) 00123 diff++; 00124 } 00125 if(!valid) 00126 return -HUGE; 00127 if(diff == 1){ 00128 sum += 0.001 * it->second; 00129 } else if (diff==0){ 00130 sum += it->second; 00131 } 00132 } 00133 if(close(sum , 0.0, 1e-10)) 00134 return -HUGE; 00135 return log(sum/(_normalizer)); 00136 } 00137 void SimilarityBasedSequenceWeighting::initialize (const ProbabilisticModelParameters & p ) 00138 { 00139 ProbabilisticModelParameterValuePtr alphabetpar = p.getOptionalParameterValue("alphabet"); 00140 ProbabilisticModelParameterValuePtr counterpar = p.getOptionalParameterValue("counter"); 00141 ProbabilisticModelParameterValuePtr lengthpar = p.getOptionalParameterValue("length"); 00142 ProbabilisticModelParameterValuePtr normalizerp = p.getOptionalParameterValue("normalizer"); 00143 00144 ProbabilisticModelParameterValuePtr offsetpar = p.getOptionalParameterValue("skip_offset"); 00145 ProbabilisticModelParameterValuePtr skiplengthpar = p.getOptionalParameterValue("skip_length"); 00146 ProbabilisticModelParameterValuePtr skipseqpar = p.getOptionalParameterValue("skip_sequence"); 00147 00148 if (alphabetpar != NULL) 00149 { 00150 AlphabetPtr alpha = AlphabetPtr(new Alphabet()); 00151 alpha->initializeFromVector(alphabetpar->getStringVector()); 00152 setAlphabet(alpha); 00153 } 00154 _counter = counterpar->getDoubleMap(); 00155 _normalizer = normalizerp->getDouble(); 00156 if((skiplengthpar == NULL) || (offsetpar == NULL) ) 00157 { 00158 _skip_offset = -1; 00159 _skip_length = -1; 00160 } 00161 else 00162 { 00163 _skip_offset = offsetpar->getInt(); 00164 _skip_length = skiplengthpar->getInt(); 00165 _skip_sequence = skipseqpar->getString(); 00166 } 00167 } 00168 00169 ProbabilisticModelParameters SimilarityBasedSequenceWeighting::parameters() const 00170 { 00171 ProbabilisticModelParameters p; 00172 p.add("model_name", StringParameterValuePtr(new StringParameterValue("SimilarityBasedSequenceWeighting"))); 00173 p.add("alphabet", alphabet()->getParameterValue()); 00174 p.add("counter", DoubleMapParameterValuePtr(new DoubleMapParameterValue(_counter))); 00175 p.add("normalizer", DoubleParameterValuePtr(new DoubleParameterValue(_normalizer))); 00176 if(_skip_offset >= 0){ 00177 p.add("skip_length", IntParameterValuePtr(new IntParameterValue(_skip_length))); 00178 p.add("skip_offset", IntParameterValuePtr(new IntParameterValue(_skip_offset))); 00179 } 00180 return p; 00181 } 00182 }