ToPS
VariableLengthMarkovChain.cpp
00001 /*
00002  *       VariableLengthMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "VariableLengthMarkovChain.hpp"
00026 #include "VariableLengthMarkovChainCreator.hpp"
00027 #include "Symbol.hpp"
00028 #include <boost/algorithm/string.hpp>
00029 namespace tops {
00030 
00031     std::string VariableLengthMarkovChain::print_graph () const {
00032         std::stringstream out;
00033         AlphabetPtr alphabet = ProbabilisticModel::alphabet();
00034         int nnodes = _tree->getNumberOfNodes();
00035         for (int i = 0; i < nnodes; i++) {
00036             ContextTreeNodePtr current = _tree->getContext(i);
00037             ContextTreeNodePtr root = _tree->getRoot();
00038             std::vector <std::string> aux;
00039             while(current != root) {
00040                 aux.push_back(alphabet->getSymbol(current->symbol())->name() );
00041                 current = _tree->getContext(current->getParent());
00042             }
00043             std::stringstream node_id;
00044             for (int j = aux.size() - 1; j >= 0; j--)
00045                 node_id  << aux[j] ;
00046             out << _tree->getContext(i)->id() << " " <<  node_id.str() << std::endl;
00047         }
00048         out << "#" << std::endl;
00049         for (int i = 0; i < nnodes; i++) {
00050             out << _tree->getContext(i)->id() << " " <<  _tree->getContext(_tree->getContext(i)->getParent())->id()  << std::endl;
00051         }
00052         return out.str();
00053     }
00054 
00055 
00056   void VariableLengthMarkovChain::printDistribution(ContextTreePtr tree, ContextTreeNodePtr node, std::stringstream & out, AlphabetPtr alphabet) const {
00057     ContextTreeNodePtr current = node;
00058     std::vector <std::string> aux;
00059     bool root=true;
00060     if(current != tree->getRoot() )
00061       root = false;
00062     while(current != tree->getRoot())
00063       {
00064         aux.push_back(alphabet->getSymbol(current->symbol())->name() );
00065         current = tree->getContext(current->getParent());
00066       }
00067     for(int k = 0; k < (int)alphabet->size(); k++)
00068       {
00069         out << "\"" << alphabet->getSymbol(k)->name() << "\" | " ;
00070         if(root)
00071           out << "\"\" " ;
00072         else
00073           {
00074             out << "\"";
00075             out << aux[aux.size() -1];
00076             for(int i = aux.size()-2; i >=0; i--)
00077               out << " " << aux[i] ;
00078             out << "\"";
00079           }
00080         out << ": " ;
00081         out << exp(node->getDistribution()->log_probability_of(k)) << ";";
00082         if(node->isLeaf())
00083           out << " # leaf";
00084         out << std::endl;
00085       }
00086     if(!node->isLeaf())
00087       for(int l = 0; l < node->alphabet_size(); l++)
00088         if(node->getChild(l) != NULL)
00089           printDistribution(tree, node->getChild(l), out, alphabet);
00090   }
00091 
00092 
00093   double VariableLengthMarkovChain::evaluatePosition(const Sequence & s, unsigned int i)const{
00094     ContextTreeNodePtr c = _tree->getContext(s,i);
00095     if (c == NULL)
00096       return -HUGE;
00097     else
00098       return c->getDistribution()->evaluatePosition(s,i);
00099   }
00100 
00101   double VariableLengthMarkovChain::choosePosition(const Sequence & s, int i)const{
00102     ContextTreeNodePtr c = _tree->getContext(s,i);
00103     if (c == NULL)
00104       return -HUGE;
00105     else
00106       return c->getDistribution()->choosePosition(s,i);
00107   }
00108 
00109   ProbabilisticModelCreatorPtr VariableLengthMarkovChain::getFactory() const {
00110     return VariableLengthMarkovChainCreatorPtr (new VariableLengthMarkovChainCreator());
00111   }
00112 
00113   int VariableLengthMarkovChain::size() const {
00114     return (alphabet()->size() -1)* _tree->getNumberOfNodes();
00115   }
00116 
00117   std::string VariableLengthMarkovChain::str() const {
00118     std::stringstream out;
00119     if(_tree == NULL){
00120       std::cerr << "This Markov chain model was not defined !" << std::endl;
00121       return out.str();
00122     }
00123 
00124     out << "model_name = \"" << model_name() << "\"" << std::endl;
00125       out << "probabilities = (";
00126       printDistribution( _tree, _tree->getRoot(), out, alphabet()) ;
00127       out << ")" << std::endl;
00128       out << alphabet()->str() ;
00129       return out.str();
00130   }
00131 
00132   ProbabilisticModelParameters VariableLengthMarkovChain::parameters() const
00133   {
00134     ProbabilisticModelParameters p;
00135     p.add("model_name", StringParameterValuePtr(new StringParameterValue("VariableLengthMarkovChain")));
00136     p.add("probabilities", _tree->getParameterValue());
00137     p.add("alphabet", alphabet()->getParameterValue());
00138     return p;
00139   }
00140 
00141   void VariableLengthMarkovChain::initialize(const ProbabilisticModelParameters & p) {
00142     ProbabilisticModelParameterValuePtr probs = p.getMandatoryParameterValue("probabilities");
00143     ProbabilisticModelParameterValuePtr symbols = p.getMandatoryParameterValue("alphabet");
00144     if((probs == NULL) || (symbols == NULL))
00145       {
00146         return;
00147       }
00148     AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00149     alphabet->initializeFromVector(symbols->getStringVector());
00150     ContextTreePtr tree =
00151       ContextTreePtr(new ContextTree(alphabet));
00152     ContextTreeNodePtr root = tree->createContext();
00153 
00154     std::string  root_str("");
00155     setTree(tree);
00156     setAlphabet(alphabet);
00157 
00158     map<std::string,DoubleVector> probmap;
00159     std::map <std::string,double>::const_iterator it;
00160     std::map <std::string,std::vector<double> >::const_iterator it2;
00161     for(it = (probs->getDoubleMap()).begin();
00162         it !=(probs->getDoubleMap()).end();
00163         it++)
00164       {
00165         std::vector <std::string> splited;
00166         boost::regex separator("\\|");
00167         tops::split_regex(it->first, splited, separator);
00168         if(splited.size() == 1) {
00169           splited.push_back("");
00170         }
00171 
00172         std::string context (splited[1]);
00173         std::string symbol ( splited[0]);
00174 
00175         if(probmap.find(context) == probmap.end())
00176           {
00177             int id = alphabet->getSymbol(symbol)->id();
00178             DoubleVector probs;
00179             probs.resize(alphabet->size());
00180             probmap[context]=probs;
00181             if(id < (int)probmap[context].size())
00182               (probmap[context])[id] = it->second;
00183           }
00184         else
00185           {
00186             int id = alphabet->getSymbol(symbol)->id();
00187             if(id < (int)probmap[context].size())
00188               (probmap[context])[id] = it->second;
00189           }
00190       }
00191 
00192     for(it2 = probmap.begin(); it2 != probmap.end(); it2++) {
00193         std::vector <std::string> history;
00194         std::string context = it2->first;
00195         std::vector<double> p = it2->second;
00196         boost::split(history, context, boost::is_any_of(" "));
00197         if(context.size() <= 0)
00198           {
00199             ProbabilisticModelParameters fddpar ;
00200             fddpar.add("probabilities", DoubleVectorParameterValuePtr(new DoubleVectorParameterValue(p)));
00201             fddpar.add("alphabet", alphabet->getParameterValue());
00202             DiscreteIIDModelPtr distr =
00203               DiscreteIIDModelPtr(new DiscreteIIDModel());
00204             distr->initialize(fddpar);
00205             root->setDistribution(distr);
00206           }
00207         else
00208           {
00209             ContextTreeNodePtr current  = root;
00210             // history with indices -1, -2, -3,...
00211             for(int i = 0; i < (int)history.size() ;i++)
00212               {
00213                 int id = alphabet->getSymbol(history[i])->id();
00214                 if(id < 0)
00215                   {
00216                     std::cerr << "ERROR: Symbol, '" << history[i] << "', not in the alphabet !" << std::endl;
00217                     exit(-1);
00218                   }
00219 
00220                 if(current->getChild(id) == NULL)
00221                   {
00222                     ContextTreeNodePtr node2 = tree->createContext();
00223                     current->setChild(node2, id);
00224                     if(i == ((int)history.size() - 1))
00225                       {
00226                         ProbabilisticModelParameters fddpar ;
00227                         fddpar.add("probabilities", DoubleVectorParameterValuePtr(new DoubleVectorParameterValue(p)));
00228                         fddpar.add("alphabet", alphabet->getParameterValue());
00229                         DiscreteIIDModelPtr distr =
00230                           DiscreteIIDModelPtr(new DiscreteIIDModel());
00231                         distr->initialize(fddpar);
00232                         node2->setDistribution(distr);
00233                       }
00234                     else
00235                       {
00236                         DoubleVector probs2;
00237                         for(int i = 0; i < (int)alphabet->size()-1; i++)
00238                           probs2.push_back(1.0/(double)alphabet->size());
00239                         ProbabilisticModelParameters fddpar ;
00240                         fddpar.add("probabilities", DoubleVectorParameterValuePtr(new DoubleVectorParameterValue(probs2)));
00241                         fddpar.add("alphabet", alphabet->getParameterValue());
00242                         DiscreteIIDModelPtr distr =
00243                           DiscreteIIDModelPtr(new DiscreteIIDModel());
00244                         distr->initialize(fddpar);
00245                         node2->setDistribution(distr);
00246                       }
00247                   }
00248                 current = current->getChild(id);
00249               }
00250           }
00251       }
00252 
00253   }
00254 
00255   void VariableLengthMarkovChain::removeSequenceFromModel(const Sequence & s,  int phase){
00256       ContextTreeNodePtr c =  _tree->getRoot();
00257     ContextTreeNodePtr p;
00258     int j;
00259     for(j = s.size()-2; j >=0; j--){
00260       if(c->isLeaf())
00261         break;
00262       p = c;
00263       c = c->getChild(s[j]);
00264       if(c == NULL)
00265         {
00266           c = p;
00267           break;
00268         }
00269     }
00270     for(; j >= 0; j--) {
00271       for(int l = 0; l < (int)alphabet()->size(); l++) {
00272         ContextTreeNodePtr n = _tree->createContext();
00273         DoubleVector prob;
00274         for(int i = 0; i < n->alphabet_size(); i++)
00275           prob.push_back(exp(c ->getDistribution()->log_probability_of(i)));
00276         DiscreteIIDModelPtr distr = DiscreteIIDModelPtr(new DiscreteIIDModel(prob));
00277         n->setDistribution(distr);
00278         c->setChild(n, l);
00279       }
00280       c = c->getChild(s[j]);
00281     }
00282 
00283     ContextTreeNodePtr current = _tree->getContext(s,s.size()-1);
00284 
00285 
00286     current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE);
00287     double sum = 0;
00288     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00289         {
00290             sum += exp(current->getDistribution()->log_probability_of(symbol));
00291         }
00292     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00293         {
00294             if(symbol != s[s.size()-1]) {
00295                 double x = exp(current->getDistribution()->log_probability_of(symbol));
00296                 current->getDistribution()->log_probability_of(symbol, log(x/sum));
00297             }
00298         }
00299 
00300 
00301     std::vector <ContextTreeNodePtr> children = current->getChildren();
00302     if(current->isLeaf())
00303       return;
00304     std::vector <ContextTreeNodePtr> stack;
00305     for(int i = 0 ; i < (int)children.size(); i++)
00306         stack.push_back(children[i]);
00307 #if 1
00308     while(stack.size() > 0) {
00309       current = stack.back();
00310       stack.pop_back();
00311       if(current == NULL)
00312         continue;
00313       assert(current->getDistribution() != NULL);
00314 
00315 
00316 
00317     current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE);
00318     double sum = 0;
00319     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00320         {
00321             sum += exp(current->getDistribution()->log_probability_of(symbol));
00322         }
00323     for(int symbol = 0; symbol < alphabet()->size(); symbol++)
00324         {
00325             if(symbol != s[s.size()-1]) {
00326                 double x = exp(current->getDistribution()->log_probability_of(symbol));
00327                 current->getDistribution()->log_probability_of(symbol, log(x/sum));
00328             }
00329         }
00330       if(current->isLeaf())
00331         continue;
00332       children = current->getChildren();
00333       for(int i = 0 ; i < (int)children.size(); i++)
00334         stack.push_back(children[i]);
00335     }
00336 #endif
00337 
00338 
00339 
00340 
00341   }
00342 
00343 }