ToPS
|
00001 /* 00002 * HiddenMarkovModel.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include <boost/numeric/ublas/matrix.hpp> 00026 #include "Alphabet.hpp" 00027 #include "HiddenMarkovModel.hpp" 00028 #include "ProbabilisticModelParameter.hpp" 00029 #include "Symbol.hpp" 00030 #include <iostream> 00031 #include <cmath> 00032 #include <sstream> 00033 #include <vector> 00034 #include <iterator> 00035 #include <stdio.h> 00036 00037 namespace tops { 00038 std::string HiddenMarkovModel::getStateName(int state) const{ 00039 return getState(state)->getName()->name(); 00040 } 00041 00042 Sequence & HiddenMarkovModel::chooseObservation ( Sequence & h, int i, int state) const 00043 { 00044 if((state >= 0) && (!getState(state)->isSilent() )) 00045 return getState(state)->emission()->chooseWithHistory(h,i,1); 00046 return h; 00047 } 00048 int HiddenMarkovModel::chooseState(int state) const 00049 { 00050 return getState(state) ->transitions()->choose(); 00051 } 00052 int HiddenMarkovModel::chooseFirstState() const 00053 { 00054 return _initial_probability->choose(); 00055 } 00056 00057 double HiddenMarkovModel::forward(const Sequence & sequence, Matrix &a) const 00058 { 00059 int nstates = _states.size(); 00060 int size = sequence.size(); 00061 Matrix alpha (nstates, size); 00062 for(int k = 0; k < nstates; k++) 00063 alpha(k,0) = _initial_probability->log_probability_of(k) + getState(k)->emission()->log_probability_of(sequence[0]); 00064 00065 for (int t = 0; t < size-1; t++) 00066 { 00067 for(int i = 0; i < nstates; i++) { 00068 int j = 0; 00069 if(j < nstates){ 00070 alpha(i,t+1) = alpha(j, t) + getState(j)->transitions()->log_probability_of(i); 00071 for( j = 1; j < nstates; j++) 00072 alpha(i, t+1) = log_sum(alpha(i,t+1), alpha(j, t) + getState(j)->transitions()->log_probability_of(i)); 00073 } 00074 alpha(i,t+1) += getState(i)->emission()->log_probability_of(sequence[t+1]); 00075 00076 } 00077 } 00078 a = alpha; 00079 double sum = alpha(0, size-1); 00080 for(int k = 1; k < nstates; k++) 00081 sum = log_sum(sum, alpha(k, size-1)); 00082 return sum; 00083 } 00084 00086 double HiddenMarkovModel::backward(const Sequence & sequence, Matrix &b) const 00087 { 00088 int nstates = _states.size(); 00089 int size = sequence.size(); 00090 Matrix beta (nstates, size); 00091 for(int k = 0; k < nstates; k++) 00092 beta(k,size-1) = 0.0; 00093 for (int t = size-2; t >= 0; t--) 00094 { 00095 00096 for(int i = 0; i < nstates; i++) 00097 { 00098 int j = 0; 00099 if(j < nstates) { 00100 beta(i,t) = getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()->log_probability_of(sequence[t+1]) + beta(j, t+1); 00101 for(j = 1; j < nstates; j++) 00102 beta(i, t) = log_sum(beta(i, t), getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()->log_probability_of(sequence[t+1]) + beta(j, t+1)); 00103 } 00104 } 00105 } 00106 b = beta; 00107 double sum = -HUGE; 00108 for(int k = 0; k < nstates; k++) 00109 sum = log_sum(sum, beta(k, 0) + _initial_probability->log_probability_of(k) + getState(k)->emission()->log_probability_of(sequence[0])); 00110 return sum; 00111 00112 } 00113 00115 double HiddenMarkovModel::viterbi (const Sequence &sequence, Sequence &path, Matrix & viterbi) const 00116 { 00117 typedef boost::numeric::ublas::matrix<int> MatrixInt; 00118 int nstates = _states.size(); 00119 int size = sequence.size(); 00120 00121 Matrix gamma (nstates, size+1); 00122 MatrixInt psi (nstates, size+1); 00123 00124 for(int k = 0; k < nstates; k++) 00125 gamma(k,0) = _initial_probability->log_probability_of(k) + getState(k)->emission()->log_probability_of(sequence[0]); 00126 00127 for (int i = 0; i < size-1; i++) 00128 { 00129 for(int k = 0; k < nstates; k++) { 00130 gamma(k,i+1) = gamma(0, i) +getState(0)->transitions()->log_probability_of(k); 00131 psi(k,i+1) = 0; 00132 for(int p = 1; p < nstates; p++) { // p is the predecessor 00133 double v = gamma(p, i) + getState(p)->transitions()->log_probability_of(k); // 00134 if(gamma(k, i+1) < v) 00135 { 00136 gamma(k, i+1) = v; 00137 psi(k,i+1) = p; 00138 } 00139 } 00140 gamma(k,i+1) += getState(k)->emission()->log_probability_of(sequence[i+1]); 00141 } 00142 } 00143 00144 viterbi = gamma; 00145 int L = size-1; 00146 path.resize(L+1); 00147 double max = gamma(0, L); 00148 path[L] = 0; 00149 for(int k = 1; k < nstates; k++) 00150 if(max < gamma(k, L)) { 00151 max = gamma(k,L); 00152 path[L] = k; 00153 } 00154 for(int i = L; i >= 1; i--) { 00155 path[i-1] = psi(path[i], i); 00156 } 00157 for(int i = 0; i < L; i++) { 00158 path[i] = path[i+1]; 00159 } 00160 for(int i = 0; i < L; i++) { 00161 path[i] = getState(path[i+1])->getName()->id(); 00162 } 00163 return max; 00164 00165 } 00166 00167 void HiddenMarkovModel::scale(std::vector<double> & in, int t) 00168 { 00169 double sum = 0.0; 00170 for(int i = 0; i < (int)in.size(); i++) 00171 { 00172 sum += in[i]; 00173 } 00174 _ctFactors[t] = sum; 00175 for(int i = 0; i < (int) in.size(); i++) 00176 { 00177 in[i] = in[i]/sum; 00178 } 00179 } 00180 00181 00182 void HiddenMarkovModel::trainBaumWelch(SequenceList & sample, int maxiterations, double diff_threshold) 00183 { 00184 int nstates = _states.size(); 00185 int alphabet_size = alphabet()->size(); 00186 00187 00188 double diff = 10.0; 00189 if(maxiterations < 0) 00190 maxiterations = 500; 00191 00192 for(int s = 0; s < (int)sample.size(); s++) 00193 { 00194 double last =10000; 00195 for(int iterations = 0; iterations < maxiterations; iterations++) 00196 { 00197 00198 Matrix A (nstates, nstates); 00199 Matrix E (nstates,alphabet_size); 00200 Matrix pi (nstates, 1); 00201 00202 Matrix alpha(nstates, sample[s].size()); 00203 Matrix beta(nstates, sample[s].size()); 00204 00205 double P = forward(sample[s], alpha); 00206 backward(sample[s], beta); 00207 00208 double sum = alpha(0, 0) + beta(0, 0); 00209 for(int i = 1; i < nstates; i++) 00210 sum = log_sum(sum, alpha(i,0) + beta(i,0)); 00211 00212 for(int i = 0; i < nstates; i++){ 00213 pi(i, 0) = alpha(i, 0) + beta(i, 0) - sum; 00214 } 00215 00216 00217 for(int i = 0; i < nstates; i++) 00218 { 00219 for(int j = 0; j < nstates; j++) 00220 { 00221 int t = 0; 00222 double sum = -HUGE; 00223 if(t < (int)sample[s].size()-1) { 00224 sum = alpha(i, t) + getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()-> log_probability_of(sample[s][t+1]) + beta(j, t+1); 00225 for( t = 1; t < (int)sample[s].size()-1; t++) 00226 { 00227 sum = log_sum(sum, alpha(i, t) + getState(i)->transitions()->log_probability_of(j) + getState(j)->emission()-> log_probability_of(sample[s][t+1]) + beta(j, t+1)); 00228 } 00229 } 00230 A(i,j) = sum; 00231 } 00232 for(int sigma = 0; sigma < alphabet_size; sigma++) 00233 { 00234 int t = 0; 00235 double sum = -HUGE; 00236 bool first = true; 00237 for(t = 0; t < (int)sample[s].size(); t++) 00238 { 00239 if((sigma == sample[s][t]) && first){ 00240 sum = alpha(i, t) + beta(i,t); 00241 first = false; 00242 }else if(sigma == sample[s][t]) { 00243 sum = log_sum(sum, alpha(i,t) + beta(i,t)); 00244 } 00245 } 00246 E(i, sigma) = sum ; 00247 } 00248 } 00249 00250 00251 00252 Matrix sumA(1, nstates); 00253 Matrix sumE(1, alphabet_size); 00254 for(int k = 0; k < nstates; k++) 00255 { 00256 int l = 0; 00257 if(l < nstates) { 00258 sumA(0, k) = A(k, l); 00259 for(l = 1; l < nstates; l++) 00260 sumA(0, k) = log_sum(sumA(0,k), A(k,l)); 00261 } 00262 int b = 0; 00263 if(b < alphabet_size) { 00264 sumE(0, k) = E(k,b); 00265 for( b = 1; b < alphabet_size; b++) 00266 sumE(0, k) = log_sum(sumE(0, k), E(k,b)); 00267 } 00268 } 00269 std::vector <double> probs; 00270 probs.resize(nstates); 00271 for(int k = 0; k < nstates; k++) 00272 { 00273 _initial_probability->log_probability_of(k,pi(k,0) ); 00274 for(int l = 0; l < nstates; l++) 00275 { 00276 A(k,l) = A(k,l) - sumA(0,k); 00277 getState(k)->transitions()->log_probability_of(l, A(k,l)); 00278 } 00279 for(int b = 0; b <alphabet_size; b++){ 00280 E(k,b) = E(k,b) - sumE(0,k); 00281 getState(k)->emission()->log_probability_of(b, E(k,b)); 00282 } 00283 } 00284 00285 00286 diff = fabs(last - P); 00287 if(diff < diff_threshold) 00288 break; 00289 #if 0 00290 std::cerr << "iteration: " << iterations << std::endl; 00291 fprintf(stderr, "LL: %lf\n" , P ); 00292 std::cerr << "Diff: " << diff << std::endl; 00293 #endif 00294 last = P; 00295 } 00296 } 00297 00298 00299 00300 } 00301 00302 std::string HiddenMarkovModel::str () const 00303 { 00304 int nstates = _states.size(); 00305 std::stringstream out; 00306 out << "model_name = \"" << model_name() << "\"" << std::endl; 00307 out << "state_names = (" ; 00308 out << "\"" << getStateName(0) << "\""; 00309 for(int i = 1; i < (int)getStateNames()->size(); i++) 00310 out << ",\"" << getStateName(i) << "\""; 00311 out << ")" << std::endl; 00312 00313 out << "observation_symbols = (" ; 00314 out << "\"" << alphabet()->getSymbol(0)->name() << "\""; 00315 for(int i = 1; i < (int)alphabet()->size(); i++) 00316 out << ",\"" << alphabet()->getSymbol(i)->name() << "\""; 00317 out << ")" << std::endl; 00318 00319 out << "transitions = (" ; 00320 out << "\"" << getStateName(0) << "\" | \"" << getStateName(0) << "\": " << exp(getState(0)->transitions()->log_probability_of(0)) ; 00321 for(int i = 0; i < nstates; i++) 00322 for(int j = 0; j < nstates; j++) 00323 if((i != 0) || (j != 0)) 00324 out << ";\n \"" << getStateName(j) << "\" | \"" << getStateName(i) << "\": " << exp(getState(i)->transitions()->log_probability_of(j)); 00325 out << ")" << std::endl; 00326 00327 00328 out << "emission_probabilities = (" ; 00329 out << "\"" << alphabet()->getSymbol(0)->name() << "\" | \"" << getStateName(0) << "\": " << exp(getState(0)->emission()->log_probability_of(0)) ; 00330 for(int i = 0; i < nstates; i++) 00331 for(int j = 0; j < (int)alphabet()->size(); j++) 00332 if((i != 0) || (j != 0)) 00333 out << ";\n \"" << alphabet()->getSymbol(j)->name() << "\" | \"" << getStateName(i) << "\": " << exp(getState(i)->emission()->log_probability_of(j)); 00334 out << ")" << std::endl; 00335 00336 double sum = 0; 00337 std::vector <double> probs; 00338 probs.resize(nstates); 00339 for(int i = 0; i < nstates; i++){ 00340 probs[i] = exp(_initial_probability->log_probability_of(i)); 00341 sum += probs[i]; 00342 } 00343 00344 00345 out << "initial_probabilities = ("; 00346 out << "\"" << getStateName(0) << "\": " << probs[0]/sum ; 00347 for(int i = 1; i < nstates; i++) 00348 out << ";\n \"" << getStateName(i) << "\": " << probs[i]/sum; 00349 out << ")" << std::endl; 00350 return out.str(); 00351 } 00352 00353 void HiddenMarkovModel::initialize(const ProbabilisticModelParameters & parameters) { 00354 ProbabilisticModelParameterValuePtr state_names = parameters.getMandatoryParameterValue("state_names"); 00355 ProbabilisticModelParameterValuePtr observation_symbols = parameters.getMandatoryParameterValue("observation_symbols"); 00356 ProbabilisticModelParameterValuePtr initial_probabilities = parameters.getMandatoryParameterValue("initial_probabilities"); 00357 ProbabilisticModelParameterValuePtr transitions = parameters.getMandatoryParameterValue("transitions"); 00358 ProbabilisticModelParameterValuePtr emissions = parameters.getMandatoryParameterValue("emission_probabilities"); 00359 00360 std::vector<HMMStatePtr> state_list; 00361 AlphabetPtr states = AlphabetPtr(new Alphabet()); 00362 AlphabetPtr observations = AlphabetPtr(new Alphabet()); 00363 states->initializeFromVector(state_names->getStringVector()); 00364 observations->initializeFromVector(observation_symbols->getStringVector()); 00365 00366 std::map<std::string,double>::const_iterator it; 00367 std::map<std::string,std::vector<double> >::const_iterator it2; 00368 00369 DiscreteIIDModelPtr pi = DiscreteIIDModelPtr(new DiscreteIIDModel()); 00370 pi->initializeFromMap(initial_probabilities->getDoubleMap(), states); 00371 00372 std::map<std::string,double> emisspar = emissions->getDoubleMap(); 00373 std::map<std::string,double> transpar = transitions->getDoubleMap(); 00374 00375 std::map<std::string,DoubleVector> emiss; 00376 std::map<std::string,DoubleVector> trans; 00377 00378 00379 for(it = emisspar.begin(); it != emisspar.end(); it++) 00380 { 00381 std::vector<std::string> splited; 00382 boost::regex separator("\\|"); 00383 split_regex(it->first, splited, separator); 00384 if(splited.size() == 1) 00385 splited.push_back(""); 00386 std::string symbol(splited[0]); 00387 std::string state(splited[1]); 00388 00389 00390 std::map<std::string,DoubleVector>::iterator eit; 00391 eit = emiss.find(state); 00392 if(eit == emiss.end()) 00393 { 00394 int id = observations->getSymbol(symbol)->id(); 00395 emiss[state].resize(observations->size()); 00396 if((id >= 0) && (id < (int)(emiss[state]).size())) 00397 (emiss[state])[id] = it->second; 00398 } 00399 else 00400 { 00401 int id = observations->getSymbol(symbol)->id(); 00402 if((id >= 0) && (id < (int)(eit->second).size())) 00403 (eit->second)[id] = it->second; 00404 } 00405 } 00406 00407 00408 for(it = transpar.begin(); it != transpar.end(); it++) 00409 { 00410 std::vector<std::string> splited; 00411 boost::regex separator("\\|"); 00412 split_regex(it->first, splited, separator); 00413 if(splited.size() == 1) 00414 splited.push_back(""); 00415 00416 std::string to(splited[0]); 00417 std::string from(splited[1]); 00418 if(trans.find(from) == trans.end()) 00419 { 00420 int id = states->getSymbol(to)->id(); 00421 DoubleVector probs; 00422 probs.resize(states->size()); 00423 trans[from]=probs; 00424 if(id < (int)trans[from].size()) 00425 trans[from][id] = it->second; 00426 } 00427 else 00428 { 00429 int id = states->getSymbol(to)->id(); 00430 if(id < (int)trans[from].size()) 00431 trans[from][id] = it->second; 00432 } 00433 } 00434 for(unsigned int i = 0; i < states->size(); i++) 00435 { 00436 SymbolPtr state_name = states->getSymbol(i); 00437 DiscreteIIDModelPtr e ; 00438 DiscreteIIDModelPtr t ; 00439 it2 = emiss.find(state_name->name()); 00440 if(it2 != emiss.end()) 00441 e = DiscreteIIDModelPtr(new DiscreteIIDModel(it2->second)); 00442 it2 = trans.find(state_name->name()); 00443 if(it2 != trans.end()) 00444 t = DiscreteIIDModelPtr(new DiscreteIIDModel(it2->second)); 00445 else { 00446 std::cerr << "ERROR: Could not configure the state " << state_name->name() << "!" << std::endl; 00447 exit(-1); 00448 } 00449 HMMStatePtr statePtr = HMMStatePtr(new HMMState(state_list.size(), state_name, e, t)); 00450 state_list.push_back(statePtr); 00451 } 00452 00453 setStates(state_list, states); 00454 setInitialProbability(pi); 00455 setObservationSymbols(observations); 00456 } 00457 00458 ProbabilisticModelParameters HiddenMarkovModel::parameters() const { 00459 ProbabilisticModelParameters answer; 00460 int nstates = _states.size(); 00461 answer.add("model_name", StringParameterValuePtr(new StringParameterValue(model_name().c_str()))); 00462 answer.add("state_names", _state_names->getParameterValue()); 00463 answer.add("observation_symbols", alphabet()->getParameterValue()); 00464 std::map <std::string, double> trans; 00465 std::stringstream out; 00466 out << getStateName(0) << "|" << getStateName(0) ; 00467 trans[out.str()] = exp(getState(0)->transitions()->log_probability_of(0)) ; 00468 for(int i = 0; i < nstates; i++) 00469 for(int j = 0; j < nstates; j++) 00470 if((i != 0) || (j != 0)) { 00471 std::stringstream out2; 00472 out2 << getStateName(j) << "|" << getStateName(i) ; 00473 trans[out2.str()] = exp(getState(i)->transitions()->log_probability_of(j)); 00474 } 00475 answer.add("transitions", DoubleMapParameterValuePtr(new DoubleMapParameterValue(trans))); 00476 00477 std::map <std::string, double> emission; 00478 std::stringstream out3; 00479 out3 << alphabet()->getSymbol(0)->name() << "|" << getStateName(0) ; 00480 emission[out3.str()] = exp(getState(0)->emission()->log_probability_of(0)); 00481 for(int i = 0; i < nstates; i++) 00482 for(int j = 0; j < (int)alphabet()->size(); j++) 00483 if((i != 0) || (j != 0)){ 00484 std::stringstream out4; 00485 out4 <<alphabet()->getSymbol(j)->name() << "|" << getStateName(i) ; 00486 emission[out4.str()] = exp(getState(i)->emission()->log_probability_of(j)); 00487 } 00488 answer.add("emission_probabilities", DoubleMapParameterValuePtr(new DoubleMapParameterValue(emission))); 00489 double sum = 0; 00490 std::vector <double> probs; 00491 probs.resize(nstates); 00492 for(int i = 0; i < nstates; i++){ 00493 probs[i] = exp(_initial_probability->log_probability_of(i)); 00494 sum += probs[i]; 00495 } 00496 std::map <std::string, double> initial; 00497 std::stringstream out5; 00498 out5 << getStateName(0); 00499 initial[out5.str()] = probs[0]/sum; 00500 for(int i = 0; i < nstates; i++) 00501 for(int j = 0; j < (int)alphabet()->size(); j++) 00502 if((i != 0) || (j != 0)){ 00503 std::stringstream out6; 00504 out6 << getStateName(i); 00505 initial[out6.str()] = probs[i]/sum; 00506 } 00507 answer.add("initial_probabilities", DoubleMapParameterValuePtr(new DoubleMapParameterValue(initial))); 00508 return answer; 00509 00510 } 00511 00512 00513 void HiddenMarkovModel::setInitialProbability(DiscreteIIDModelPtr initial) { 00514 _initial_probability = initial; 00515 } 00516 void HiddenMarkovModel::setObservationSymbols(AlphabetPtr obs) { 00517 tops::ProbabilisticModel::setAlphabet(obs); 00518 } 00519 void HiddenMarkovModel::setStates(std::vector<HMMStatePtr> states, AlphabetPtr state_names) { 00520 _states = states; 00521 _state_names = state_names; 00522 } 00523 00524 }