ToPS
InhomogeneousMarkovChain.cpp
00001 /*
00002  *       InhomogeneousMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any loater version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "InhomogeneousMarkovChain.hpp"
00026 #include "Alphabet.hpp"
00027 #include "ProbabilisticModelParameter.hpp"
00028 #include "Symbol.hpp"
00029 #include "VariableLengthMarkovChainCreator.hpp"
00030 
00031 namespace tops{
00033   double InhomogeneousMarkovChain::evaluatePosition(const Sequence & s, int i, int t) const{
00034 
00035     assert((t >= 0) && (t < (int)_context_trees.size()));
00036     ContextTreeNodePtr context = _context_trees[t]->getContext(s,i);
00037     if(context == NULL) {
00038         return -HUGE;
00039     }
00040     DiscreteIIDModelPtr distr = context->getDistribution();
00041     if(distr == NULL)
00042         return -HUGE;
00043     return  distr->evaluatePosition(s,i);
00044   }
00045 
00047   int InhomogeneousMarkovChain::choosePosition(const Sequence & s, int i, int t) const{
00048     assert((t >= 0) && (t < (int)_context_trees.size()));
00049     return  _context_trees[t]->getContext(s,i)->getDistribution()->choosePosition(s,i);
00050   }
00051 
00052   std::string InhomogeneousMarkovChain::str() const {
00053     std::stringstream out;
00054     out << "model_name = \"" << model_name() << "\"" << std::endl;
00055 
00056     for(int i = 0; i < (int)_context_trees.size(); i++)
00057       {
00058         out << "p" << i << " = (";
00059         printDistribution( _context_trees[i], _context_trees[i]->getRoot(), out, alphabet()) ;
00060         out << ")" << std::endl;
00061       }
00062     out << "position_specific_distribution = (";
00063     if (_context_trees.size() >= 1){
00064       out << "\"p0\"";
00065       for(int i = 1; i < (int)_context_trees.size(); i++)
00066         {
00067           out <<",\"p" << i << "\"";
00068         }
00069     }
00070     out << ")\n";
00071     out << "phased =" << _phased << std::endl;
00072     out << alphabet()->str();
00073     return out.str();
00074   }
00075 
00076   int InhomogeneousMarkovChain::size() const
00077   {
00078     int size= 0;
00079     for(int i  = 0; i < (int)_context_trees.size(); i++)
00080       size += _context_trees[i]->getNumberOfNodes();
00081     return size;
00082   }
00083 
00084   void InhomogeneousMarkovChain::printDistribution(ContextTreePtr tree, ContextTreeNodePtr node, std::stringstream & out, AlphabetPtr alphabet) const
00085     {
00086       ContextTreeNodePtr current = node;
00087       std::vector <std::string> aux;
00088       bool root=true;
00089       if(current != tree->getRoot() )
00090         root = false;
00091       while(current != tree->getRoot())
00092         {
00093           aux.push_back(alphabet->getSymbol(current->symbol())->name() );
00094           current = tree->getContext(current->getParent());
00095         }
00096       for(int k = 0; k < (int)alphabet->size(); k++)
00097         {
00098           //      if(close(exp(node->getDistribution()->log_probability_of(k)), 0.0, 1e-100))
00099           //        continue;
00100           out << "\"" << alphabet->getSymbol(k)->name() << "\" | " ;
00101           if(root)
00102             out << "\"\" " ;
00103           else
00104             {
00105               out << "\"";
00106               out << aux[aux.size() -1];
00107               for(int i = aux.size()-2; i >=0; i--)
00108                 out << " " << aux[i] ;
00109               out << "\"";
00110             }
00111           out << ": " ;
00112           out << exp(node->getDistribution()->log_probability_of(k)) << ";";
00113           if(node->isLeaf())
00114             out << " # leaf";
00115           out << std::endl;
00116 
00117         }
00118       if(!node->isLeaf())
00119         for(int l = 0; l < node->alphabet_size(); l++)
00120           if(node->getChild(l) != NULL)
00121             printDistribution(tree, node->getChild(l), out, alphabet);
00122     }
00123   void InhomogeneousMarkovChain::removeSequenceFromModel(const Sequence & s,  int phase){
00124     for(int i = 0; i < (int)s.size()-1; i++)
00125       phase = mod(phase + 1, maximumTimeValue()+1);
00126     ContextTreeNodePtr c =  _context_trees[phase]->getRoot();
00127     ContextTreeNodePtr p;
00128     int j;
00129     for(j = s.size()-2; j >=0; j--){
00130       if(c->isLeaf())
00131         break;
00132       p = c;
00133       c = c->getChild(s[j]);
00134       if(c == NULL)
00135         {
00136           c = p;
00137           break;
00138         }
00139     }
00140     for(; j >= 0; j--) {
00141       for(int l = 0; l < (int)alphabet()->size(); l++) {
00142         ContextTreeNodePtr n = _context_trees[phase]->createContext();
00143         DoubleVector prob;
00144         for(int i = 0; i < n->alphabet_size(); i++)
00145           prob.push_back(exp(c ->getDistribution()->log_probability_of(i)));
00146         DiscreteIIDModelPtr distr = DiscreteIIDModelPtr(new DiscreteIIDModel(prob));
00147         n->setDistribution(distr);
00148         c->setChild(n, l);
00149       }
00150       c = c->getChild(s[j]);
00151     }
00152 
00153     ContextTreeNodePtr current = _context_trees[phase]->getContext(s,s.size()-1);
00154 
00155 
00156     current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE);
00157     double sum = 0;
00158     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00159         {
00160             sum += exp(current->getDistribution()->log_probability_of(symbol));
00161         }
00162     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00163         {
00164             if(symbol != s[s.size()-1]) {
00165                 double x = exp(current->getDistribution()->log_probability_of(symbol));
00166     current->getDistribution()->log_probability_of(symbol, log(x/sum));
00167     if(close(x, 0, 1e-10))
00168       current->getDistribution()->log_probability_of(symbol, -HUGE);
00169 
00170             }
00171         }
00172 
00173 
00174     std::vector <ContextTreeNodePtr> children = current->getChildren();
00175     if(current->isLeaf())
00176       return;
00177     std::vector <ContextTreeNodePtr> stack;
00178     for(int i = 0 ; i < (int)children.size(); i++)
00179         stack.push_back(children[i]);
00180 #if 1
00181     while(stack.size() > 0) {
00182       current = stack.back();
00183       stack.pop_back();
00184       if(current == NULL)
00185         continue;
00186       assert(current->getDistribution() != NULL);
00187 
00188 
00189 
00190     current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE);
00191     double sum = 0;
00192     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00193         {
00194             sum += exp(current->getDistribution()->log_probability_of(symbol));
00195         }
00196     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00197         {
00198             if(symbol != s[s.size()-1]) {
00199                 double x = exp(current->getDistribution()->log_probability_of(symbol));
00200                 current->getDistribution()->log_probability_of(symbol, log(x/sum));
00201     if(close(x, 0, 1e-10))
00202       current->getDistribution()->log_probability_of(symbol, -HUGE);
00203 
00204             }
00205         }
00206       if(current->isLeaf())
00207         continue;
00208       children = current->getChildren();
00209       for(int i = 0 ; i < (int)children.size(); i++)
00210         stack.push_back(children[i]);
00211     }
00212 #endif
00213 
00214 
00215 
00216 
00217   }
00218   ProbabilisticModelParameters InhomogeneousMarkovChain::parameters() const {
00219     ProbabilisticModelParameters par;
00220     par.add("model_name", StringParameterValuePtr(new StringParameterValue(model_name())));
00221     StringVector parnames;
00222     for(int i = 0; i <(int)_context_trees.size();i++)
00223       {
00224         std::stringstream out;
00225         out << "p" << i;
00226         parnames.push_back(out.str());
00227       }
00228     par.add("position_specific_distribution" ,
00229             StringVectorParameterValuePtr(new StringVectorParameterValue(parnames)));
00230     for(int i = 0; i <(int)_context_trees.size();i++)
00231       {
00232         std::stringstream out;
00233         out << "p" << i;
00234         par.add(out.str(), _context_trees[i]->getParameterValue());
00235       }
00236     par.add("phased", IntParameterValuePtr(new IntParameterValue(_phased)));
00237     par.add("alphabet", alphabet()->getParameterValue());
00238     return par;
00239   }
00240   void InhomogeneousMarkovChain::initialize(const ProbabilisticModelParameters & parameters)  {
00241     ProbabilisticModelParameterValuePtr symbols = parameters.getMandatoryParameterValue("alphabet");
00242     ProbabilisticModelParameterValuePtr position_specific_model_names = parameters.getMandatoryParameterValue("position_specific_distribution");
00243     ProbabilisticModelParameterValuePtr p = parameters.getMandatoryParameterValue("phased");
00244 
00245     bool is_phased = (p->getInt() == 1);
00246 
00247 
00248     AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00249     alphabet->initializeFromVector(symbols->getStringVector());
00250     std::vector <ContextTreePtr> position_specific_trees;
00251     StringVector model_names = position_specific_model_names->getStringVector();
00252     position_specific_trees.resize(model_names.size());
00253     for(int i = 0; i < (int)model_names.size(); i++)
00254       {
00255         ProbabilisticModelParameters vlmc;
00256         ProbabilisticModelParameterValuePtr probs = parameters.getMandatoryParameterValue(model_names[i]);
00257         if (probs == NULL)
00258           {
00259             std::cerr << "ERROR:  Could not find " << model_names[i] << std::endl;
00260           }
00261 
00262         vlmc.add("alphabet", symbols);
00263         vlmc.add("probabilities", probs);
00264 
00265         VariableLengthMarkovChainCreator vlmcCreator;
00266         VariableLengthMarkovChainPtr mc = vlmcCreator.createVLMC(vlmc);
00267         position_specific_trees[i] = mc->getTree();
00268       }
00269     phased(is_phased);
00270     setAlphabet(alphabet);
00271     setPositionSpecificDistribution(position_specific_trees);
00272   }
00273 }
00274