ToPS: HiddenMarkovModel.cpp Source File

00001 /*
00002  *       HiddenMarkovModel.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      �gor Bon�dio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include <boost/numeric/ublas/matrix.hpp>
00026 #include "Alphabet.hpp"
00027 #include "HiddenMarkovModel.hpp"
00028 #include "ProbabilisticModelParameter.hpp"
00029 #include "Symbol.hpp"
00030 #include <iostream>
00031 #include <cmath>
00032 #include <sstream>
00033 #include <vector>
00034 #include <iterator>
00035 #include <stdio.h>
00036 
00037 namespace tops {
00038   std::string HiddenMarkovModel::getStateName(int state) const{
00039       return getState(state)->getName()->name();
00040     }
00041 
00042   Sequence &  HiddenMarkovModel::chooseObservation ( Sequence & h, int i, int state) const
00043   {
00044     if((state >= 0) && (!getState(state)->isSilent() ))
00045       return getState(state)->emission()->chooseWithHistory(h,i,1);
00046     return h;
00047   }
00048   int HiddenMarkovModel::chooseState(int state) const
00049   {
00050     return getState(state) ->transitions()->choose();
00051   }
00052   int HiddenMarkovModel::chooseFirstState() const
00053   {
00054     return _initial_probability->choose();
00055   }
00056 
00057   double HiddenMarkovModel::forward(const Sequence & sequence, Matrix &a) const
00058   {
00059     int nstates = _states.size();
00060     int size = sequence.size();
00061     Matrix alpha (nstates, size);
00062     for(int k = 0; k < nstates; k++)
00063       alpha(k,0) = _initial_probability->log_probability_of(k) + getState(k)->emission()->log_probability_of(sequence[0]);
00064 
00065     for (int t = 0; t < size-1; t++)
00066       {
00067         for(int i = 0; i < nstates; i++)     {
00068           int j  = 0;
00069           if(j < nstates){
00070             alpha(i,t+1) =  alpha(j, t) + getState(j)->transitions()->log_probability_of(i);
00071             for( j = 1; j < nstates; j++)
00072               alpha(i, t+1) = log_sum(alpha(i,t+1), alpha(j, t) + getState(j)->transitions()->log_probability_of(i));
00073           }
00074           alpha(i,t+1) += getState(i)->emission()->log_probability_of(sequence[t+1]);
00075 
00076         }
00077       }
00078     a = alpha;
00079     double sum =  alpha(0, size-1);
00080     for(int k = 1; k < nstates; k++)
00081       sum = log_sum(sum, alpha(k, size-1));
00082     return sum;
00083   }
00084 
00086   double HiddenMarkovModel::backward(const Sequence & sequence, Matrix &b) const
00087   {
00088     int nstates = _states.size();
00089     int size = sequence.size();
00090     Matrix beta (nstates, size);
00091     for(int k = 0; k < nstates; k++)
00092       beta(k,size-1) = 0.0;
00093     for (int t = size-2; t >= 0; t--)
00094       {
00095 
00096         for(int i = 0; i < nstates; i++)
00097           {
00098             int j = 0;
00099             if(j < nstates) {
00100               beta(i,t) =  getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()->log_probability_of(sequence[t+1]) + beta(j, t+1);
00101               for(j = 1; j < nstates; j++)
00102                 beta(i, t) = log_sum(beta(i, t), getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()->log_probability_of(sequence[t+1]) + beta(j, t+1));
00103             }
00104           }
00105       }
00106     b = beta;
00107     double sum = -HUGE;
00108     for(int k = 0; k < nstates; k++)
00109       sum = log_sum(sum, beta(k, 0) + _initial_probability->log_probability_of(k) + getState(k)->emission()->log_probability_of(sequence[0]));
00110     return sum;
00111 
00112   }
00113 
00115   double HiddenMarkovModel::viterbi (const Sequence &sequence, Sequence &path, Matrix & viterbi) const
00116   {
00117     typedef boost::numeric::ublas::matrix<int> MatrixInt;
00118     int nstates = _states.size();
00119     int size =  sequence.size();
00120 
00121     Matrix gamma (nstates, size+1);
00122     MatrixInt psi (nstates, size+1);
00123 
00124     for(int k = 0; k < nstates; k++)
00125       gamma(k,0) = _initial_probability->log_probability_of(k) + getState(k)->emission()->log_probability_of(sequence[0]);
00126 
00127     for (int i = 0; i < size-1; i++)
00128       {
00129         for(int k = 0; k < nstates; k++)     {
00130           gamma(k,i+1) =  gamma(0, i) +getState(0)->transitions()->log_probability_of(k);
00131           psi(k,i+1) = 0;
00132           for(int p = 1; p < nstates; p++) { // p is the predecessor
00133             double v = gamma(p, i) + getState(p)->transitions()->log_probability_of(k); //
00134             if(gamma(k, i+1) < v)
00135               {
00136                 gamma(k, i+1) = v;
00137                 psi(k,i+1) = p;
00138               }
00139           }
00140           gamma(k,i+1) += getState(k)->emission()->log_probability_of(sequence[i+1]);
00141         }
00142       }
00143 
00144     viterbi = gamma;
00145     int L = size-1;
00146     path.resize(L+1);
00147     double max = gamma(0, L);
00148     path[L] = 0;
00149     for(int k = 1; k < nstates; k++)
00150       if(max < gamma(k, L)) {
00151         max = gamma(k,L);
00152         path[L] = k;
00153       }
00154     for(int i = L; i >= 1; i--) {
00155       path[i-1] = psi(path[i], i);
00156     }
00157     for(int i = 0; i < L; i++) {
00158       path[i] = path[i+1];
00159     }
00160     for(int i = 0; i < L; i++) {
00161       path[i] = getState(path[i+1])->getName()->id();
00162     }
00163     return max;
00164 
00165   }
00166 
00167   void HiddenMarkovModel::scale(std::vector<double> & in,  int t)
00168   {
00169     double sum = 0.0;
00170     for(int i = 0; i < (int)in.size(); i++)
00171       {
00172         sum += in[i];
00173       }
00174     _ctFactors[t] = sum;
00175     for(int i = 0; i < (int) in.size(); i++)
00176       {
00177         in[i] = in[i]/sum;
00178       }
00179   }
00180 
00181 
00182   void HiddenMarkovModel::trainBaumWelch(SequenceList & sample, int maxiterations, double diff_threshold)
00183   {
00184     int nstates = _states.size();
00185     int alphabet_size = alphabet()->size();
00186 
00187 
00188     double diff  = 10.0;
00189     if(maxiterations < 0)
00190       maxiterations = 500;
00191 
00192     for(int s = 0; s < (int)sample.size(); s++)
00193       {
00194         double last =10000;
00195         for(int iterations = 0; iterations < maxiterations; iterations++)
00196           {
00197 
00198             Matrix A (nstates, nstates);
00199             Matrix E (nstates,alphabet_size);
00200             Matrix pi (nstates, 1);
00201 
00202             Matrix alpha(nstates, sample[s].size());
00203             Matrix beta(nstates, sample[s].size());
00204 
00205             double P = forward(sample[s], alpha);
00206             backward(sample[s], beta);
00207 
00208             double sum = alpha(0, 0) + beta(0, 0);
00209             for(int i = 1; i < nstates; i++)
00210               sum = log_sum(sum, alpha(i,0) + beta(i,0));
00211 
00212             for(int i = 0; i < nstates; i++){
00213               pi(i, 0) = alpha(i, 0) + beta(i, 0) - sum;
00214             }
00215 
00216 
00217             for(int i = 0; i < nstates; i++)
00218               {
00219                 for(int j = 0; j < nstates; j++)
00220                   {
00221                     int t = 0;
00222                     double sum = -HUGE;
00223                     if(t < (int)sample[s].size()-1) {
00224                       sum = alpha(i, t) + getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()-> log_probability_of(sample[s][t+1]) + beta(j, t+1);
00225                       for( t = 1; t < (int)sample[s].size()-1; t++)
00226                         {
00227                           sum = log_sum(sum, alpha(i, t) + getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()-> log_probability_of(sample[s][t+1]) + beta(j, t+1));
00228                         }
00229                     }
00230                     A(i,j) =  sum;
00231                   }
00232                 for(int sigma = 0; sigma < alphabet_size; sigma++)
00233                   {
00234                     int t = 0;
00235                     double sum = -HUGE;
00236                     bool first = true;
00237                     for(t = 0; t < (int)sample[s].size(); t++)
00238                       {
00239                         if((sigma == sample[s][t]) && first){
00240                           sum =  alpha(i, t) + beta(i,t);
00241                           first = false;
00242                         }else if(sigma == sample[s][t]) {
00243                           sum  = log_sum(sum, alpha(i,t) + beta(i,t));
00244                         }
00245                       }
00246                     E(i, sigma) =  sum ;
00247                   }
00248               }
00249 
00250 
00251 
00252             Matrix sumA(1, nstates);
00253             Matrix sumE(1, alphabet_size);
00254             for(int k = 0; k < nstates; k++)
00255               {
00256                 int l = 0;
00257                 if(l < nstates) {
00258                   sumA(0, k) = A(k, l);
00259                   for(l = 1; l < nstates; l++)
00260                     sumA(0, k) = log_sum(sumA(0,k), A(k,l));
00261                 }
00262                 int b = 0;
00263                 if(b < alphabet_size) {
00264                   sumE(0, k) = E(k,b);
00265                   for( b = 1; b < alphabet_size; b++)
00266                     sumE(0, k) = log_sum(sumE(0, k), E(k,b));
00267                 }
00268               }
00269             std::vector <double> probs;
00270             probs.resize(nstates);
00271             for(int k = 0; k < nstates; k++)
00272               {
00273                 _initial_probability->log_probability_of(k,pi(k,0) );
00274                 for(int l = 0; l < nstates; l++)
00275                   {
00276                     A(k,l) = A(k,l) - sumA(0,k);
00277                     getState(k)->transitions()->log_probability_of(l, A(k,l));
00278                   }
00279                 for(int b = 0; b <alphabet_size; b++){
00280                   E(k,b) = E(k,b) - sumE(0,k);
00281                   getState(k)->emission()->log_probability_of(b, E(k,b));
00282                 }
00283               }
00284 
00285 
00286             diff = fabs(last - P);
00287             if(diff < diff_threshold)
00288                 break;
00289 #if 0
00290             std::cerr << "iteration: " << iterations << std::endl;
00291             fprintf(stderr, "LL: %lf\n" , P );
00292             std::cerr << "Diff: " << diff << std::endl;
00293 #endif
00294             last = P;
00295           }
00296       }
00297 
00298 
00299 
00300   }
00301 
00302   std::string HiddenMarkovModel::str () const
00303   {
00304     int nstates = _states.size();
00305     std::stringstream out;
00306     out << "model_name = \"" << model_name() << "\"" << std::endl;
00307     out << "state_names = (" ;
00308     out << "\"" << getStateName(0) << "\"";
00309     for(int i = 1; i < (int)getStateNames()->size(); i++)
00310       out << ",\"" << getStateName(i) << "\"";
00311     out << ")" << std::endl;
00312 
00313     out << "observation_symbols = (" ;
00314     out << "\"" << alphabet()->getSymbol(0)->name() << "\"";
00315     for(int i = 1; i < (int)alphabet()->size(); i++)
00316       out << ",\"" << alphabet()->getSymbol(i)->name() << "\"";
00317     out << ")" << std::endl;
00318 
00319     out << "transitions = (" ;
00320     out << "\"" << getStateName(0) << "\" | \"" << getStateName(0) << "\": " << exp(getState(0)->transitions()->log_probability_of(0)) ;
00321     for(int i = 0; i < nstates; i++)
00322       for(int j = 0; j < nstates; j++)
00323         if((i != 0) || (j != 0))
00324           out << ";\n \"" << getStateName(j) << "\" | \"" << getStateName(i) << "\": " << exp(getState(i)->transitions()->log_probability_of(j));
00325     out << ")" << std::endl;
00326 
00327 
00328     out << "emission_probabilities = (" ;
00329     out << "\"" <<  alphabet()->getSymbol(0)->name() << "\" | \"" << getStateName(0) << "\": " << exp(getState(0)->emission()->log_probability_of(0)) ;
00330     for(int i = 0; i < nstates; i++)
00331       for(int j = 0; j < (int)alphabet()->size(); j++)
00332         if((i != 0) || (j != 0))
00333           out << ";\n \"" << alphabet()->getSymbol(j)->name() << "\" | \"" <<  getStateName(i) << "\": " << exp(getState(i)->emission()->log_probability_of(j));
00334     out << ")" << std::endl;
00335 
00336     double sum = 0;
00337     std::vector <double> probs;
00338     probs.resize(nstates);
00339     for(int i = 0; i < nstates; i++){
00340       probs[i] = exp(_initial_probability->log_probability_of(i));
00341       sum += probs[i];
00342     }
00343 
00344 
00345     out << "initial_probabilities = (";
00346     out << "\"" << getStateName(0) << "\":  " << probs[0]/sum ;
00347     for(int i = 1; i < nstates; i++)
00348       out << ";\n \"" << getStateName(i) << "\": " << probs[i]/sum;
00349     out << ")" << std::endl;
00350     return out.str();
00351   }
00352 
00353   void HiddenMarkovModel::initialize(const ProbabilisticModelParameters & parameters) {
00354     ProbabilisticModelParameterValuePtr state_names = parameters.getMandatoryParameterValue("state_names");
00355     ProbabilisticModelParameterValuePtr observation_symbols = parameters.getMandatoryParameterValue("observation_symbols");
00356     ProbabilisticModelParameterValuePtr initial_probabilities = parameters.getMandatoryParameterValue("initial_probabilities");
00357     ProbabilisticModelParameterValuePtr transitions = parameters.getMandatoryParameterValue("transitions");
00358     ProbabilisticModelParameterValuePtr emissions = parameters.getMandatoryParameterValue("emission_probabilities");
00359 
00360     std::vector<HMMStatePtr> state_list;
00361     AlphabetPtr states = AlphabetPtr(new Alphabet());
00362     AlphabetPtr observations = AlphabetPtr(new Alphabet());
00363     states->initializeFromVector(state_names->getStringVector());
00364     observations->initializeFromVector(observation_symbols->getStringVector());
00365 
00366     std::map<std::string,double>::const_iterator it;
00367     std::map<std::string,std::vector<double> >::const_iterator it2;
00368 
00369     DiscreteIIDModelPtr pi = DiscreteIIDModelPtr(new DiscreteIIDModel());
00370     pi->initializeFromMap(initial_probabilities->getDoubleMap(), states);
00371 
00372     std::map<std::string,double> emisspar = emissions->getDoubleMap();
00373     std::map<std::string,double> transpar = transitions->getDoubleMap();
00374 
00375     std::map<std::string,DoubleVector> emiss;
00376     std::map<std::string,DoubleVector> trans;
00377 
00378 
00379     for(it = emisspar.begin(); it != emisspar.end(); it++)
00380       {
00381         std::vector<std::string> splited;
00382         boost::regex separator("\\|");
00383         split_regex(it->first, splited, separator);
00384         if(splited.size() == 1)
00385           splited.push_back("");
00386         std::string symbol(splited[0]);
00387         std::string state(splited[1]);
00388 
00389 
00390         std::map<std::string,DoubleVector>::iterator eit;
00391         eit = emiss.find(state);
00392         if(eit == emiss.end())
00393           {
00394             int id = observations->getSymbol(symbol)->id();
00395             emiss[state].resize(observations->size());
00396             if((id >= 0) && (id < (int)(emiss[state]).size()))
00397               (emiss[state])[id] = it->second;
00398           }
00399         else
00400           {
00401             int id = observations->getSymbol(symbol)->id();
00402             if((id >= 0) && (id < (int)(eit->second).size()))
00403               (eit->second)[id] = it->second;
00404           }
00405       }
00406 
00407 
00408     for(it = transpar.begin(); it != transpar.end(); it++)
00409       {
00410         std::vector<std::string> splited;
00411         boost::regex separator("\\|");
00412         split_regex(it->first, splited, separator);
00413         if(splited.size() == 1)
00414           splited.push_back("");
00415 
00416         std::string to(splited[0]);
00417         std::string from(splited[1]);
00418         if(trans.find(from) == trans.end())
00419           {
00420             int id = states->getSymbol(to)->id();
00421             DoubleVector probs;
00422             probs.resize(states->size());
00423             trans[from]=probs;
00424             if(id < (int)trans[from].size())
00425               trans[from][id] = it->second;
00426           }
00427         else
00428           {
00429             int id = states->getSymbol(to)->id();
00430             if(id < (int)trans[from].size())
00431               trans[from][id] = it->second;
00432           }
00433       }
00434     for(unsigned int i = 0; i < states->size(); i++)
00435       {
00436         SymbolPtr state_name = states->getSymbol(i);
00437         DiscreteIIDModelPtr e ;
00438         DiscreteIIDModelPtr t ;
00439         it2 = emiss.find(state_name->name());
00440         if(it2 != emiss.end())
00441           e = DiscreteIIDModelPtr(new DiscreteIIDModel(it2->second));
00442         it2 = trans.find(state_name->name());
00443         if(it2 != trans.end())
00444           t = DiscreteIIDModelPtr(new DiscreteIIDModel(it2->second));
00445         else {
00446           std::cerr << "ERROR: Could not configure the state " << state_name->name() << "!" << std::endl;
00447           exit(-1);
00448         }
00449         HMMStatePtr statePtr = HMMStatePtr(new HMMState(state_list.size(), state_name, e, t));
00450         state_list.push_back(statePtr);
00451       }
00452 
00453     setStates(state_list, states);
00454     setInitialProbability(pi);
00455     setObservationSymbols(observations);
00456   }
00457 
00458   ProbabilisticModelParameters HiddenMarkovModel::parameters() const {
00459     ProbabilisticModelParameters answer;
00460     int nstates = _states.size();
00461     answer.add("model_name", StringParameterValuePtr(new StringParameterValue(model_name().c_str())));
00462     answer.add("state_names", _state_names->getParameterValue());
00463     answer.add("observation_symbols", alphabet()->getParameterValue());
00464     std::map <std::string, double> trans;
00465     std::stringstream out;
00466     out << getStateName(0) << "|" << getStateName(0) ;
00467     trans[out.str()] =  exp(getState(0)->transitions()->log_probability_of(0)) ;
00468     for(int i = 0; i < nstates; i++)
00469       for(int j = 0; j < nstates; j++)
00470         if((i != 0) || (j != 0)) {
00471           std::stringstream out2;
00472           out2 << getStateName(j) << "|" << getStateName(i) ;
00473           trans[out2.str()] = exp(getState(i)->transitions()->log_probability_of(j));
00474         }
00475     answer.add("transitions", DoubleMapParameterValuePtr(new DoubleMapParameterValue(trans)));
00476 
00477     std::map <std::string, double> emission;
00478     std::stringstream out3;
00479     out3 <<   alphabet()->getSymbol(0)->name() << "|" << getStateName(0) ;
00480     emission[out3.str()] =  exp(getState(0)->emission()->log_probability_of(0));
00481     for(int i = 0; i < nstates; i++)
00482       for(int j = 0; j < (int)alphabet()->size(); j++)
00483         if((i != 0) || (j != 0)){
00484           std::stringstream out4;
00485           out4 <<alphabet()->getSymbol(j)->name() << "|" <<  getStateName(i) ;
00486           emission[out4.str()] =  exp(getState(i)->emission()->log_probability_of(j));
00487         }
00488     answer.add("emission_probabilities", DoubleMapParameterValuePtr(new DoubleMapParameterValue(emission)));
00489     double sum = 0;
00490     std::vector <double> probs;
00491     probs.resize(nstates);
00492     for(int i = 0; i < nstates; i++){
00493       probs[i] = exp(_initial_probability->log_probability_of(i));
00494       sum += probs[i];
00495     }
00496     std::map <std::string, double> initial;
00497     std::stringstream out5;
00498     out5 <<  getStateName(0);
00499     initial[out5.str()] =  probs[0]/sum;
00500     for(int i = 0; i < nstates; i++)
00501       for(int j = 0; j < (int)alphabet()->size(); j++)
00502         if((i != 0) || (j != 0)){
00503           std::stringstream out6;
00504           out6 << getStateName(i);
00505           initial[out6.str()] = probs[i]/sum;
00506         }
00507     answer.add("initial_probabilities", DoubleMapParameterValuePtr(new DoubleMapParameterValue(initial)));
00508     return answer;
00509 
00510   }
00511 
00512 
00513   void HiddenMarkovModel::setInitialProbability(DiscreteIIDModelPtr initial) {
00514     _initial_probability = initial;
00515   }
00516   void HiddenMarkovModel::setObservationSymbols(AlphabetPtr obs) {
00517     tops::ProbabilisticModel::setAlphabet(obs);
00518   }
00519   void HiddenMarkovModel::setStates(std::vector<HMMStatePtr> states, AlphabetPtr state_names) {
00520     _states = states;
00521     _state_names = state_names;
00522   }
00523 
00524 }