ToPS
SimilarityBasedSequenceWeighting.cpp
00001 /*
00002  *       SimilarityBasedSequenceWeighting.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "SimilarityBasedSequenceWeighting.hpp"
00026 //#include "SimilarityBasedSequenceWeightingCreator.hpp"
00027 #include "TrainDiscreteIIDModel.hpp"
00028 #include "Symbol.hpp"
00029 #include <iostream>
00030 #include <cmath>
00031 #include <sstream>
00032 #include <vector>
00033 #include <iterator>
00034 
00035 namespace tops {
00036 
00037 
00038 
00039   double SimilarityBasedSequenceWeighting::prefix_sum_array_compute(int begin, int end, int phase) {
00040     if ((begin < (int) _scores.size()) && (begin >= 0)){
00041       return _scores[begin];
00042     }
00043     return -HUGE;
00044   }
00045   double SimilarityBasedSequenceWeighting::prefix_sum_array_compute(int begin, int end)
00046   {
00047     return prefix_sum_array_compute(begin, end, 0);
00048   }
00049 
00050   bool SimilarityBasedSequenceWeighting::initialize_prefix_sum_array(const Sequence & s, int phase)
00051   {
00052     if(ProbabilisticModel::initialize_prefix_sum_array(s))
00053       return true;
00054     _scores.resize(s.size());
00055     for (int i = 0; i < (int) s.size(); i++)  {
00056       _scores[i] = evaluate(s, i, s.size() - 1);
00057     }
00058     return true;
00059   }
00060 
00061   bool SimilarityBasedSequenceWeighting::initialize_prefix_sum_array(const Sequence & s)
00062   {
00063     return initialize_prefix_sum_array(s, 0);
00064   }
00065 
00066 
00067 
00068 
00069   std::string SimilarityBasedSequenceWeighting::str() const
00070   {
00071     std::stringstream out;
00072 
00073     DoubleMapParameterValuePtr p = DoubleMapParameterValuePtr(new DoubleMapParameterValue(_counter)) ;
00074     DoubleParameterValuePtr x = DoubleParameterValuePtr(new DoubleParameterValue(_normalizer));
00075     IntParameterValuePtr so = IntParameterValuePtr(new IntParameterValue(_skip_offset));
00076     IntParameterValuePtr sl = IntParameterValuePtr(new IntParameterValue(_skip_length));
00077     StringParameterValuePtr ss = StringParameterValuePtr(new StringParameterValue(_skip_sequence));
00078     out << "model_name=\"SimilarityBasedSequenceWeighting\""<< std::endl;
00079     out << ProbabilisticModel::alphabet()->str() << std::endl;
00080     out << "counter = " <<  p->str() << std::endl;
00081     out << "normalizer = " << x->str() << std::endl;
00082     if(_skip_offset >= 0)
00083       {
00084         out << "skip_offset = " <<  so->str() << std::endl;
00085         out << "skip_length = " << sl->str() << std::endl;
00086         out << "skip_sequence = " << ss->str() << std::endl;
00087       }
00088 
00089     return out.str();
00090   }
00091 
00093   double SimilarityBasedSequenceWeighting::evaluate(const Sequence & s, unsigned int begin, unsigned int end) const {
00094     if (end >= s.size())
00095       return -HUGE;
00096     if(begin < 0)
00097       return -HUGE;
00098     int length =     (_counter.begin()->first).size();
00099     std::stringstream qstream;
00100     for(int i = begin ; (i <= (int)end) && (i < (int) (begin + length)) ; i++){
00101       qstream << alphabet()->getSymbol(s[i])->name();
00102     }
00103     std::string q = qstream.str();
00104     int psize = 0;
00105     std::map<std::string, double>::const_iterator it;
00106     double sum = 0;
00107     for(it = _counter.begin(); it != _counter.end();  it++)
00108       {
00109           std::string q2 = it->first;
00110           psize = q2.size();
00111         int diff = 0;
00112         if(q.size () != q2.size())
00113           return -HUGE;
00114         bool valid = true;
00115         for(int i = 0; i < (int)q2.size(); i++)
00116           {
00117             if((i >= _skip_offset) && (i < _skip_offset+_skip_length)){
00118                 if(q[i] != _skip_sequence[i-_skip_offset]){
00119                     valid = false;
00120                     break;
00121                 }
00122             }else if(q[i] != q2[i])
00123               diff++;
00124           }
00125         if(!valid)
00126             return -HUGE;
00127         if(diff == 1){
00128           sum += 0.001 * it->second;
00129         } else if (diff==0){
00130           sum += it->second;
00131         }
00132       }
00133     if(close(sum , 0.0, 1e-10))
00134         return -HUGE;
00135     return log(sum/(_normalizer));
00136   }
00137   void SimilarityBasedSequenceWeighting::initialize (const ProbabilisticModelParameters & p )
00138   {
00139     ProbabilisticModelParameterValuePtr alphabetpar = p.getOptionalParameterValue("alphabet");
00140     ProbabilisticModelParameterValuePtr counterpar = p.getOptionalParameterValue("counter");
00141     ProbabilisticModelParameterValuePtr lengthpar = p.getOptionalParameterValue("length");
00142     ProbabilisticModelParameterValuePtr normalizerp = p.getOptionalParameterValue("normalizer");
00143 
00144     ProbabilisticModelParameterValuePtr offsetpar = p.getOptionalParameterValue("skip_offset");
00145     ProbabilisticModelParameterValuePtr skiplengthpar = p.getOptionalParameterValue("skip_length");
00146     ProbabilisticModelParameterValuePtr skipseqpar = p.getOptionalParameterValue("skip_sequence");
00147 
00148     if (alphabetpar != NULL)
00149       {
00150         AlphabetPtr alpha = AlphabetPtr(new Alphabet());
00151         alpha->initializeFromVector(alphabetpar->getStringVector());
00152         setAlphabet(alpha);
00153       }
00154     _counter = counterpar->getDoubleMap();
00155     _normalizer  = normalizerp->getDouble();
00156     if((skiplengthpar == NULL) || (offsetpar == NULL) )
00157       {
00158         _skip_offset = -1;
00159         _skip_length = -1;
00160       }
00161     else
00162       {
00163         _skip_offset = offsetpar->getInt();
00164         _skip_length = skiplengthpar->getInt();
00165         _skip_sequence = skipseqpar->getString();
00166       }
00167   }
00168 
00169   ProbabilisticModelParameters SimilarityBasedSequenceWeighting::parameters() const
00170   {
00171     ProbabilisticModelParameters p;
00172     p.add("model_name", StringParameterValuePtr(new StringParameterValue("SimilarityBasedSequenceWeighting")));
00173     p.add("alphabet", alphabet()->getParameterValue());
00174     p.add("counter", DoubleMapParameterValuePtr(new DoubleMapParameterValue(_counter)));
00175     p.add("normalizer", DoubleParameterValuePtr(new DoubleParameterValue(_normalizer)));
00176     if(_skip_offset >= 0){
00177       p.add("skip_length", IntParameterValuePtr(new IntParameterValue(_skip_length)));
00178       p.add("skip_offset", IntParameterValuePtr(new IntParameterValue(_skip_offset)));
00179     }
00180     return p;
00181   }
00182 }