ToPS
|
00001 /* 00002 * PairHiddenMarkovModel.cpp 00003 * 00004 * Copyright 2011 Vitor Onuchic <vitoronuchic@gmail.com> 00005 * Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00006 * Ígor Bonádio <ibonadio@ime.usp.br> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "PairHiddenMarkovModel.hpp" 00026 00027 namespace tops { 00028 00032 00033 float PairHiddenMarkovModel::posteriorProbabilities (const Sequence &seq1, const Sequence &seq2, SparseMatrixPtr &sparsePPMatch,SparseMatrixPtr &sparsePPGap1, SparseMatrixPtr &sparsePPGap2) 00034 { 00035 int nstates = _states.size(); 00036 int length1 = seq1.size(); 00037 int length2 = seq2.size(); 00038 fMatrix ppMatch(length1+1,length2+1); 00039 fMatrix ppGap1(length1+1,length2+1); 00040 fMatrix ppGap2(length1+1,length2+1); 00041 vector<Matrix> alpha; // forward 00042 vector<Matrix> beta; // backward 00043 00044 double full = forward(seq1, seq2, alpha); 00045 backward(seq1, seq2, beta); 00046 for(int i = 0; i <= length1; i++){ 00047 for(int j = 0; j <= length2; j++){ 00048 ppMatch(i,j) = 0; 00049 ppGap1(i,j) = 0; 00050 ppGap2(i,j) = 0; 00051 for(int k = 0; k < nstates; k++) 00052 _states[k]->postProbSum(ppMatch,ppGap1,ppGap2,alpha[k],beta[k],full,i,j); 00053 } 00054 } 00055 float ea = expectedAccuracy(length1+1,length2+1,ppMatch); 00056 sparsePPMatch = SparseMatrixPtr(new SparseMatrix(length1+1,length2+1,ppMatch)); 00057 sparsePPGap1 = SparseMatrixPtr(new SparseMatrix(length1+1,length2+1,ppGap1)); 00058 sparsePPGap2 = SparseMatrixPtr(new SparseMatrix(length1+1,length2+1,ppGap2)); 00059 00060 return ea; 00061 } 00062 00063 void MatchState::postProbSum(fMatrix &ppMatch, fMatrix &ppGap1, fMatrix &ppGap2, Matrix &alpha, Matrix &beta, double full, int i, int j){ 00064 ppMatch(i,j) += exp(alpha(i,j) + beta(i,j) - full); 00065 } 00066 void Gap1State::postProbSum(fMatrix &ppMatch, fMatrix &ppGap1, fMatrix &ppGap2, Matrix &alpha, Matrix &beta, double full, int i, int j){ 00067 ppGap1(i,j) += exp(alpha(i,j) + beta(i,j) - full); 00068 } 00069 void Gap2State::postProbSum(fMatrix &ppMatch, fMatrix &ppGap1, fMatrix &ppGap2, Matrix &alpha, Matrix &beta, double full, int i, int j){ 00070 ppGap2(i,j) += exp(alpha(i,j) + beta(i,j) - full); 00071 } 00072 00073 /*float PairHiddenMarkovModel::expectedAccuracyWithGaps(SparseMatrixPtr postProbs, SparseMatrixPtr gap1Probs, SparseMatrixPtr gap2Probs){ 00074 int size1 = postProbs->nrows(); 00075 int size2 = postProbs->ncols(); 00076 SparseMatrixPtr ea = SparseMatrixPtr(new SparseMatrix(size1,size2)); 00077 SparseMatrixPtr ptrs = SparseMatrixPtr(new SparseMatrix(size1,size2)); 00078 for(int i = 0; i < size1; i++){ 00079 for(int j = 0; j < size2; j++){ 00080 if(i == 0 && j == 0){ 00081 continue; 00082 } 00083 if(i == 0){ 00084 ea->add(i,j,ea->get(i,j-1)+gap1Probs->get(i,j)); 00085 ptrs->add(i,j,2); 00086 continue; 00087 } 00088 if(j == 0){ 00089 ea->add(i,j,ea->get(i-1,j)+gap2Probs->get(i,j)); 00090 ptrs->add(i,j,1); 00091 continue; 00092 } 00093 ea->add(i,j,ea->get(i-1,j)+gap2Probs->get(i,j)); 00094 ptrs->add(i,j,1); 00095 if(ea->get(i,j) <= ea->get(i,j-1)+gap1Probs->get(i,j)){ 00096 ea->add(i,j,ea->get(i,j-1)+gap1Probs->get(i,j)); 00097 ptrs->add(i,j,2); 00098 } 00099 if(ea->get(i,j) <= ea->get(i-1,j-1)+postProbs->get(i,j)){ 00100 ea->add(i,j,ea->get(i-1,j-1)+postProbs->get(i,j)); 00101 ptrs->add(i,j,3); 00102 } 00103 } 00104 } 00105 int i = size1-1; 00106 int j = size2-1; 00107 int allen = 0; 00108 while(i > 0 || j > 0){ 00109 allen++; 00110 float ptr = ptrs->get(i,j); 00111 if(ptr == 1) 00112 i--; 00113 if(ptr == 2) 00114 j--; 00115 if(ptr == 3){ 00116 i--; 00117 j--; 00118 } 00119 } 00120 float mea = ea->get(size1-1,size2-1)/allen; 00121 return mea; 00122 }*/ 00123 00124 float PairHiddenMarkovModel::expectedAccuracy(int size1, int size2, fMatrix &postProbs){ 00125 int allen = size1; 00126 if(size2 < allen) 00127 allen = size2; 00128 fMatrix ea(size1,size2); 00129 00130 for(int i = 0; i < size1; i++) 00131 ea(i,0) = 0.0; 00132 for(int j = 1; j < size2; j++) 00133 ea(0,j) = 0.0; 00134 00135 for(int i = 1; i < size1; i++){ 00136 for(int j = 1; j < size2; j++){ 00137 ea(i,j) = ea(i-1,j); 00138 if(ea(i,j) <= ea(i,j-1)){ 00139 ea(i,j) = ea(i,j-1); 00140 } 00141 if(ea(i,j) <= ea(i-1,j-1)+postProbs(i,j)){ 00142 ea(i,j) = ea(i-1,j-1)+postProbs(i,j); 00143 } 00144 } 00145 } 00146 return ea(size1-1,size2-1)/allen;; 00147 } 00148 00149 /* double PairHiddenMarkovModel::viterbi(const Sequence & seq1, const Sequence & seq2, Sequence & statePath, Sequence & alignment1, Sequence & alignment2, vector<Matrix> &a) 00150 { 00151 int nstates = _states.size(); 00152 int nsilentstates = _silent_states.size(); 00153 int length1 = seq1.size(); 00154 int length2 = seq2.size(); 00155 vector<Matrix> alpha; 00156 Sequence al1; 00157 Sequence al2; 00158 Sequence path; 00159 vector<IntMatrix> pathMatrix; 00160 alpha.resize(nstates); 00161 pathMatrix.resize(nstates); 00162 for(int i = 0; i < nstates; i++){ 00163 (alpha[i]).resize(length1+1,length2+1); 00164 (pathMatrix[i]).resize(length1+1,length2+1); 00165 } 00166 00167 //Initialization 00168 for(int k = 0; k < nstates; k++){ 00169 if(k == (int)_begin_id){ 00170 alpha[k](0,0) = 0.0; 00171 continue; 00172 } 00173 else 00174 alpha[k](0,0) = -HUGE; 00175 } 00176 00177 //Recursion 00178 for (int i = 0; i <= length1; i++){ 00179 for(int j = 0; j <= length2; j++){ 00180 for(int k = 0; k < nstates; k++){ 00181 if(i == 0 && j == 0) 00182 continue; 00183 if(k == (int)_begin_id){ 00184 alpha[k](i,j) = -HUGE; 00185 if(i > 0 && j > 0) 00186 pathMatrix[k](i,j) = -1; 00187 continue; 00188 } 00189 int ne1 = _states[k]->eSeq1(); 00190 int ne2 = _states[k]->eSeq2(); 00191 if(ne1 == 0 && ne2 == 0) 00192 continue; 00193 00194 alpha[k](i,j) = -HUGE; 00195 if(i - ne1 < 0 || j - ne2 < 0){ 00196 pathMatrix[k](i,j) = -1; 00197 continue; 00198 } 00199 00200 double aux = -HUGE; 00201 for(int l = 0; l < (int)(_states[k]->iTransitions()).size(); l++){ 00202 int id = _states[k]->getITransId(l); 00203 aux = alpha[id](i-ne1,j-ne2) + _states[id]->transitions()->log_probability_of(k); 00204 if(aux > alpha[k](i,j)){ 00205 alpha[k](i,j) = aux; 00206 pathMatrix[k](i,j) = id; 00207 } 00208 } 00209 if(ne1 > 0 && ne2 > 0) 00210 alpha[k](i,j) += _states[k]->emission()->log_probability_of_pair(seq1[i-1],seq2[j-1]); 00211 else if(ne1 > 0 && ne2 == 0) 00212 alpha[k](i,j) += _states[k]->emission()->log_probability_of_pair(seq1[i-1],_gap_id); 00213 else 00214 alpha[k](i,j) += _states[k]->emission()->log_probability_of_pair(_gap_id,seq2[j-1]); 00215 } 00216 for(int k = 0; k < nsilentstates; k++){ 00217 if(i == 0 && j == 0) 00218 continue; 00219 int id1 = getSilentId(k); 00220 alpha[id1](i,j) = -HUGE; 00221 if(id1 == (int)_begin_id) 00222 continue; 00223 for(int l = 0; l < (int)(_states[id1]->iTransitions()).size(); l++){ 00224 int id2 = _states[id1]->getITransId(l); 00225 double aux = alpha[id2](i,j) + _states[id2]->transitions()->log_probability_of(id1); 00226 if(aux > alpha[id1](i,j)){ 00227 alpha[id1](i,j) = aux; 00228 pathMatrix[id1](i,j) = id2; 00229 } 00230 } 00231 } 00232 } 00233 } 00234 00235 //Termination 00236 double score = -HUGE; 00237 int p; 00238 for(int i = 0; i < nstates; i++){ 00239 alpha[i](length1,length2) += _states[i]->transitions()->log_probability_of(_end_id); 00240 if(score < alpha[i](length1,length2)){ 00241 score = alpha[i](length1,length2); 00242 p = i; 00243 } 00244 } 00245 a = alpha; 00246 00247 //Trace back 00248 path.push_back(_end_id); 00249 al1.push_back(_gap_id); 00250 al2.push_back(_gap_id); 00251 while(length1 >= 0 || length2 >= 0){ 00252 path.push_back(p); 00253 if(_states[p]->eSeq1() == 0 && _states[p]->eSeq2() == 0){ 00254 al1.push_back(_gap_id); 00255 al2.push_back(_gap_id); 00256 if(length1 == 0 && length2 == 0) 00257 break; 00258 p = pathMatrix[p](length1,length2); 00259 } 00260 else if(_states[p]->eSeq1() > 0 && _states[p]->eSeq2() == 0){ 00261 al1.push_back(seq1[length1-1]); 00262 al2.push_back(_gap_id); 00263 p = pathMatrix[p](length1,length2); 00264 length1--; 00265 } 00266 else if(_states[p]->eSeq1() == 0 && _states[p]->eSeq2() > 0){ 00267 al1.push_back(_gap_id); 00268 al2.push_back(seq2[length2-1]); 00269 p = pathMatrix[p](length1,length2); 00270 length2--; 00271 } 00272 else{ 00273 al1.push_back(seq1[length1-1]); 00274 al2.push_back(seq2[length2-1]); 00275 p = pathMatrix[p](length1,length2); 00276 length1--; 00277 length2--; 00278 } 00279 } 00280 //path.push_back(p); 00281 alignment1.resize(al1.size()); 00282 alignment2.resize(al2.size()); 00283 statePath.resize(path.size()); 00284 for(unsigned int i = 0; i < al1.size(); i++){ 00285 alignment1[i] = al1[al1.size()-1-i]; 00286 alignment2[i] = al2[al2.size()-1-i]; 00287 } 00288 for(unsigned int i = 0; i < path.size(); i++) 00289 statePath[i] = path[path.size() - i - 1]; 00290 00291 return score; 00292 }*/ 00293 00294 void MatchState::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){ 00295 if(i == 1 && j == 1){ 00296 a[_id](i,j) = states[begin_id]->transitions()->log_probability_of(_id) + _emission->log_probability_of_pair(seq1[0],seq2[0]); 00297 return; 00298 } 00299 00300 if(i == 0 || j == 0){ 00301 a[_id](i,j) = -HUGE; 00302 return; 00303 } 00304 00305 vector<int>::iterator predStateNumIt = _incomingTransitions.begin(); 00306 a[_id](i,j) = a[*predStateNumIt](i-1,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id); 00307 for(predStateNumIt = _incomingTransitions.begin()+1; predStateNumIt != _incomingTransitions.end(); predStateNumIt++) 00308 a[_id](i,j) = log_sum(a[_id](i,j), a[*predStateNumIt](i-1,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id)); 00309 00310 a[_id](i,j) += _emission->log_probability_of_pair(seq1[i-1],seq2[j-1]); 00311 } 00312 00313 void Gap1State::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){ 00314 if(i == 0 && j == 1){ 00315 a[_id](i,j) = states[begin_id]->transitions()->log_probability_of(_id) + _emission->log_probability_of_pair(gap_id,seq2[0]); 00316 return; 00317 } 00318 00319 if(j == 0){ 00320 a[_id](i,j) = -HUGE; 00321 return; 00322 } 00323 00324 vector<int>::iterator predStateNumIt = _incomingTransitions.begin(); 00325 a[_id](i,j) = a[*predStateNumIt](i,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id); 00326 for(predStateNumIt = _incomingTransitions.begin()+1; predStateNumIt != _incomingTransitions.end(); predStateNumIt++) 00327 a[_id](i,j) = log_sum(a[_id](i,j), a[*predStateNumIt](i,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id)); 00328 00329 a[_id](i,j) += _emission->log_probability_of_pair(gap_id,seq2[j-1]); 00330 } 00331 00332 void Gap2State::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){ 00333 if(i == 1 && j == 0){ 00334 a[_id](i,j) = states[begin_id]->transitions()->log_probability_of(_id) + _emission->log_probability_of_pair(seq1[0],gap_id); 00335 return; 00336 } 00337 00338 if(i == 0){ 00339 a[_id](i,j) = -HUGE; 00340 return; 00341 } 00342 00343 vector<int>::iterator predStateNumIt = _incomingTransitions.begin(); 00344 a[_id](i,j) = a[*predStateNumIt](i-1,j) + states[*predStateNumIt]->transitions()->log_probability_of(_id); 00345 for(predStateNumIt = _incomingTransitions.begin()+1; predStateNumIt != _incomingTransitions.end(); predStateNumIt++) 00346 a[_id](i,j) = log_sum(a[_id](i,j), a[*predStateNumIt](i-1,j) + states[*predStateNumIt]->transitions()->log_probability_of(_id)); 00347 00348 a[_id](i,j) += _emission->log_probability_of_pair(seq1[i-1],gap_id); 00349 } 00350 00351 void SilentState::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){ 00352 return; 00353 } 00354 00355 double PairHiddenMarkovModel::forward(const Sequence & seq1, const Sequence & seq2, vector<Matrix> &a) 00356 { 00357 int nstates = _states.size(); 00358 int length1 = seq1.size(); 00359 int length2 = seq2.size(); 00360 00361 a.resize(nstates); 00362 for(int i = 0; i < nstates; i++){ 00363 if(i != _begin_id && i != _end_id) 00364 (a[i]).resize(length1+1,length2+1); 00365 else 00366 (a[i]).resize(0,0); 00367 } 00368 00369 //Recursion 00370 vector<PHMMStatePtr>::iterator currState; 00371 for (int i = 0; i <= length1; i++){ 00372 for(int j = 0; j <= length2; j++){ 00373 for(currState = _states.begin(); currState != _states.end(); currState++){ 00374 (*currState)->forwardSum(_states,seq1,seq2,a,i,j,_gap_id,_begin_id); 00375 } 00376 } 00377 } 00378 00379 //Termination 00380 double sum = -HUGE; 00381 for(currState = _states.begin(); currState != _states.end(); currState++){ 00382 if(!(*currState)->isSilent()) 00383 sum = log_sum(sum, a[(*currState)->getId()](length1,length2) + (*currState)->transitions()->log_probability_of(_end_id)); 00384 } 00385 return sum; 00386 } 00387 00388 void MatchState::backwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int currStateId, double *accumulator){ 00389 if(i == (int)seq1.size() || j == (int)seq2.size()) 00390 return; 00391 (*accumulator) = log_sum((*accumulator), 00392 a[_id](i+1,j+1) + 00393 states[currStateId]->transitions()->log_probability_of(_id) + 00394 _emission->log_probability_of_pair(seq1[i],seq2[j])); 00395 } 00396 00397 void Gap1State::backwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int currStateId, double *accumulator){ 00398 if(j == (int)seq2.size()) 00399 return; 00400 (*accumulator) = log_sum((*accumulator), 00401 a[_id](i,j+1) + 00402 states[currStateId]->transitions()->log_probability_of(_id) + 00403 _emission->log_probability_of_pair(gap_id,seq2[j])); 00404 } 00405 00406 void Gap2State::backwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int currStateId, double *accumulator){ 00407 if(i == (int)seq1.size()) 00408 return; 00409 (*accumulator) = log_sum((*accumulator), 00410 a[_id](i+1,j) + 00411 states[currStateId]->transitions()->log_probability_of(_id) + 00412 _emission->log_probability_of_pair(seq1[i],gap_id)); 00413 } 00414 00415 double PairHiddenMarkovModel::backward(const Sequence & seq1, const Sequence & seq2, vector<Matrix> &a) 00416 { 00417 int nstates = _states.size(); 00418 int length1 = seq1.size(); 00419 int length2 = seq2.size(); 00420 a.resize(nstates); 00421 for(int i = 0; i < nstates; i++){ 00422 if(i == _end_id || i == _begin_id){ 00423 (a[i]).resize(0,0); 00424 continue; 00425 } 00426 (a[i]).resize(length1+1,length2+1); 00427 } 00428 00429 //Initialization 00430 for(int k = 0; k < nstates; k++){ 00431 if(k != _end_id && k != _begin_id) 00432 a[k](length1,length2) = _states[k]->transitions()->log_probability_of(_end_id); 00433 } 00434 00435 //Recursion 00436 IntVector::iterator nextStateNumIt; 00437 double accumulator; 00438 for (int i = length1; i >= 0; i--){ 00439 for(int j = length2; j >= 0; j--){ 00440 if(i == length1 && j == length2) 00441 continue; 00442 for(int currStateId = 0; currStateId < nstates; currStateId++){ 00443 if(_states[currStateId]->isSilent()) 00444 continue; 00445 accumulator = -HUGE; 00446 for(nextStateNumIt = _states[currStateId]->outTransitions().begin(); nextStateNumIt != _states[currStateId]->outTransitions().end(); nextStateNumIt++) 00447 _states[*nextStateNumIt]->backwardSum(_states,seq1,seq2,a,i,j,_gap_id,currStateId,&accumulator); 00448 a[currStateId](i,j) = accumulator; 00449 } 00450 } 00451 } 00452 00453 //Termination 00454 accumulator = -HUGE; 00455 for(nextStateNumIt = _states[_begin_id]->outTransitions().begin(); nextStateNumIt != _states[_begin_id]->outTransitions().end(); nextStateNumIt++) 00456 _states[*nextStateNumIt]->backwardSum(_states,seq1,seq2,a,0,0,_gap_id,_begin_id,&accumulator); 00457 return accumulator; 00458 } 00459 00463 00464 /* void PairHiddenMarkovModel::trainBaumWelch(SequenceList & sample, int maxiterations, double diff_threshold) 00465 { 00466 int nstates = _states.size(); 00467 int alphabet_size = alphabet()->size(); 00468 00469 double diff = HUGE; 00470 double last = -HUGE; 00471 if(maxiterations < 0) 00472 maxiterations = 500; 00473 00474 int iterations; 00475 for(iterations = 0; iterations < maxiterations; iterations++){ 00476 DoubleVector sumA (nstates); 00477 DoubleVector sumE (nstates); 00478 double sumP = -HUGE; 00479 Matrix A (nstates, nstates); 00480 for(int a = 0; a < nstates; a++){ 00481 for(int b = 0; b < nstates; b++){ 00482 if(!close(exp(_states[a]->transitions()->log_probability_of(b)), 0.0, 1e-10)) 00483 A(a,b) = _states[a]->transitions()->log_probability_of(b) + log(3); 00484 else 00485 A(a,b) = -HUGE; 00486 } 00487 } 00488 vector<Matrix> E; 00489 E.resize(nstates); 00490 for(int i = 0; i < nstates; i++){ 00491 E[i].resize(alphabet_size,alphabet_size); 00492 sumA[i] = -HUGE; 00493 sumE[i] = -HUGE; 00494 for(int a = 0; a < alphabet_size; a++){ 00495 for(int b = 0; b < alphabet_size; b++){ 00496 if(!close(exp(_states[i]->emission()->log_probability_of_pair(a,b)), 0.0, 1e-10)){ 00497 if(_states[i]->eSeq1() > 0 && _states[i]->eSeq2() > 0) 00498 E[i](a,b) = _states[i]->emission()->log_probability_of_pair(a,b) + log(210); 00499 else 00500 E[i](a,b) = _states[i]->emission()->log_probability_of_pair(a,b) + log(19); 00501 } 00502 else 00503 E[i](a,b) = -HUGE; 00504 } 00505 } 00506 } 00507 for(int s1 = 0; s1 < (int)sample.size(); s1++){ 00508 for(int s2 = s1+1; s2 < (int)sample.size(); s2++){ 00509 vector<Matrix> alpha; 00510 vector<Matrix> beta; 00511 int length1 = sample[s1].size(); 00512 int length2 = sample[s2].size(); 00513 00514 double P = forward(sample[s1], sample[s2], alpha); 00515 sumP = log_sum(sumP,P); 00516 backward(sample[s1], sample[s2], beta); 00517 00518 00519 for(int k = 0; k < nstates; k++){ 00520 for(int l = 0; l < nstates; l++){ 00521 if(l == _begin_id || k == _end_id){ 00522 continue; 00523 } 00524 if(k == _begin_id && l != _end_id){ 00525 A(k,l) = log_sum(A(k,l), _states[k]->transitions()->log_probability_of(l) + beta[l](0,0) - P); 00526 continue; 00527 } 00528 if(l == _end_id && k != _begin_id){ 00529 A(k,l) = log_sum(A(k,l), alpha[k](length1,length2) + _states[k]->transitions()->log_probability_of(l)-P); 00530 continue; 00531 } 00532 if(k == _begin_id && l == _end_id){ 00533 continue; 00534 } 00535 for(int i = 0; i <= length1; i++){ 00536 for(int j = 0; j <= length2; j++){ 00537 int ne1 = getPHMMState(l)->eSeq1(); 00538 int ne2 = getPHMMState(l)->eSeq2(); 00539 if((i+ne1 > length1) || (j+ne2 > length2)) 00540 continue; 00541 if(k == 0) { 00542 if(ne1 == 1 && ne2 == 1) 00543 E[l](sample[s1][i],sample[s2][j]) = log_sum(E[l](sample[s1][i],sample[s2][j]), alpha[l](i+ne1,j+ne2) + beta[l](i+ne1,j+ne2) - P); 00544 if(ne1 == 0 && ne2 == 1) 00545 E[l](_gap_id,sample[s2][j]) = log_sum(E[l](_gap_id,sample[s2][j]), alpha[l](i+ne1,j+ne2) + beta[l](i+ne1,j+ne2) - P); 00546 if(ne1 == 1 && ne2 == 0) 00547 E[l](sample[s1][i],_gap_id) = log_sum(E[l](sample[s1][i],_gap_id), alpha[l](i+ne1,j+ne2) + beta[l](i+ne1,j+ne2) - P); 00548 } 00549 00550 if(ne1 == 1 && ne2 == 1) 00551 A(k,l) = log_sum(A(k,l), alpha[k](i,j) + 00552 _states[k]->transitions()->log_probability_of(l) + 00553 getPHMMState(l)->emission()->log_probability_of_pair(sample[s1][i],sample[s2][j]) + 00554 beta[l](i+ne1,j+ne2)-P); 00555 if(ne1 == 0 && ne2 == 1) 00556 A(k,l) = log_sum(A(k,l), alpha[k](i,j) + 00557 _states[k]->transitions()->log_probability_of(l) + 00558 getPHMMState(l)->emission()->log_probability_of_pair(_gap_id,sample[s2][j]) + 00559 beta[l](i+ne1,j+ne2)-P); 00560 if(ne1 == 1 && ne2 == 0) 00561 A(k,l) = log_sum(A(k,l), alpha[k](i,j) + 00562 _states[k]->transitions()->log_probability_of(l) + 00563 getPHMMState(l)->emission()->log_probability_of_pair(sample[s1][i],_gap_id) + 00564 beta[l](i+ne1,j+ne2)-P); 00565 if(ne1 == 0 && ne2 == 0) 00566 A(k,l) = log_sum(A(k,l), alpha[k](i,j) + 00567 _states[k]->transitions()->log_probability_of(l) + 00568 beta[l](i+ne1,j+ne2)-P); 00569 } 00570 } 00571 } 00572 } 00573 } 00574 } 00575 for(int k = 0; k < nstates; k++){ 00576 for(int l = 0; l < nstates; l++){ 00577 sumA[k] = log_sum(sumA[k], A(k,l)); 00578 } 00579 } 00580 for(int k = 0; k < nstates; k++){ 00581 for(int a = 0; a < alphabet_size; a++){ 00582 for(int b = 0; b < alphabet_size; b++){ 00583 double s = log_sum(E[k](a,b), E[k](b,a)); 00584 E[k](a,b) = E[k](b,a) = s-log(2); 00585 sumE[k] = log_sum(sumE[k], E[k](a,b)); 00586 } 00587 } 00588 } 00589 for(int k = 0; k < nstates; k++){ 00590 if(_states[k]->eSeq1() == 0 && _states[k]->eSeq2() == 0) 00591 continue; 00592 for(int a = 0; a < alphabet_size; a++){ 00593 for(int b = 0; b < alphabet_size; b++){ 00594 E[k](a,b) -= sumE[k]; 00595 _states[k]->emission()->log_probability_of_pair(a,b,E[k](a,b)); 00596 } 00597 } 00598 } 00599 for(int k = 0; k < nstates; k++){ 00600 if(_states[k]->oTransitions().size() == 0) 00601 continue; 00602 for(int l = 0; l < nstates; l++){ 00603 A(k,l) -= sumA[k]; 00604 _states[k]->transitions()->log_probability_of(l, A(k,l)); 00605 } 00606 } 00607 //cerr << "sumP = " << sumP << endl; 00608 diff = fabs(last - sumP); 00609 if(diff < diff_threshold) 00610 break; 00611 last = sumP; 00612 } 00613 //cerr << "Stoped at iteration: " << iterations << endl; 00614 //cerr << "Diff: " << diff << endl; 00615 }*/ 00616 00620 00621 void PairHiddenMarkovModel::initialize(const ProbabilisticModelParameters & parameters) { 00622 ProbabilisticModelParameterValuePtr state_names = parameters.getMandatoryParameterValue("state_names"); 00623 ProbabilisticModelParameterValuePtr observation_symbols = parameters.getMandatoryParameterValue("observation_symbols"); 00624 ProbabilisticModelParameterValuePtr number_of_emissions = parameters.getMandatoryParameterValue("number_of_emissions"); 00625 ProbabilisticModelParameterValuePtr transitions = parameters.getMandatoryParameterValue("transitions"); 00626 ProbabilisticModelParameterValuePtr emissions = parameters.getMandatoryParameterValue("emission_probabilities"); 00627 00628 int end = 0; 00629 int begin = 0; 00630 std::vector<PHMMStatePtr> state_list; 00631 std::vector<PHMMStatePtr> silent_state_list; 00632 00633 AlphabetPtr states = AlphabetPtr(new Alphabet()); 00634 states->initializeFromVector(state_names->getStringVector()); 00635 00636 AlphabetPtr observations = AlphabetPtr(new Alphabet()); 00637 observations->initializeFromVector(observation_symbols->getStringVector()); 00638 if(observations->has("-")) 00639 _gap_id = observations->getSymbol("-")->id(); 00640 else{ 00641 observations->createSymbol("-"); 00642 _gap_id = observations->getSymbol("-")->id(); 00643 } 00644 00645 //Initializing the emission probabilities 00646 std::map<std::string,double> emisspar = emissions->getDoubleMap(); 00647 std::map<std::string,double>::const_iterator it; 00648 std::map<std::string,Matrix> emiss; 00649 std::map<std::string,Matrix>::iterator eit; 00650 for(it = emisspar.begin(); it != emisspar.end(); it++){ 00651 std::vector<std::string> splited; 00652 boost::regex separator("\\|"); 00653 split_regex(it->first, splited, separator); 00654 if(splited.size() == 1) 00655 splited.push_back(""); 00656 std::string symbol(splited[0]); 00657 std::string state(splited[1]); 00658 if(symbol.length() < 2){ 00659 cerr << "Emission " << symbol << " is not a pair of observation symbols." << endl; 00660 exit(-1); 00661 } 00662 eit = emiss.find(state); 00663 if(eit == emiss.end()){ 00664 int id1 = observations->getSymbol(symbol.substr(0,1))->id(); 00665 int id2 = observations->getSymbol(symbol.substr(1,1))->id(); 00666 emiss[state] = Matrix (observations->size(),observations->size()); 00667 for(int i = 0; i < (int)observations->size(); i++){ 00668 for(int j = 0; j < (int)observations->size(); j++){ 00669 emiss[state](i,j) = 0.0; 00670 } 00671 } 00672 if((id1 >= 0) && (id1 < (int)(emiss[state]).size1()) && (id2 >= 0) && (id2 < (int)(emiss[state]).size2())) 00673 (emiss[state])(id1,id2) = it->second; 00674 } 00675 else{ 00676 int id1 = observations->getSymbol(symbol.substr(0,1))->id(); 00677 int id2 = observations->getSymbol(symbol.substr(1,1))->id(); 00678 if((id1 >= 0) && (id1 < (int)(eit->second).size1()) && (id2 >= 0) && (id2 < (int)(eit->second).size2())) 00679 emiss[state](id1,id2) = it->second; 00680 } 00681 } 00682 00683 //Initializing the transition probabilities 00684 std::map<std::string,double> transpar = transitions->getDoubleMap(); 00685 std::map<std::string,DoubleVector> trans; 00686 std::map<std::string,DoubleVector>::const_iterator it2; 00687 std::map<std::string,IntVector> inTrans; 00688 std::map<std::string,IntVector> outTrans; 00689 for(it = transpar.begin(); it != transpar.end(); it++){ 00690 std::vector<std::string> splited; 00691 boost::regex separator("\\|"); 00692 split_regex(it->first, splited, separator); 00693 if(splited.size() == 1) 00694 splited.push_back(""); 00695 00696 std::string to(splited[0]); 00697 std::string from(splited[1]); 00698 if(trans.find(from) == trans.end()) 00699 { 00700 int id = states->getSymbol(to)->id(); 00701 DoubleVector probs; 00702 IntVector outgoing; 00703 outgoing.push_back(id); 00704 outTrans[from] = outgoing; 00705 probs.resize(states->size()); 00706 trans[from]=probs; 00707 if(id < (int)trans[from].size()) 00708 trans[from][id] = it->second; 00709 } 00710 else 00711 { 00712 int id = states->getSymbol(to)->id(); 00713 outTrans[from].push_back(id); 00714 if(id < (int)trans[from].size()) 00715 trans[from][id] = it->second; 00716 } 00717 if(inTrans.find(to) == inTrans.end()){ 00718 int id = states->getSymbol(from)->id(); 00719 IntVector incoming; 00720 incoming.push_back(id); 00721 inTrans[to] = incoming; 00722 } 00723 else{ 00724 int id = states->getSymbol(from)->id(); 00725 inTrans[to].push_back(id); 00726 } 00727 } 00728 00729 //Configuring the PHMM states 00730 std::map<std::string,std::string> numEmiss = number_of_emissions->getStringMap(); 00731 std::map<std::string,std::string>::const_iterator numEmissIt; 00732 std::map<std::string,Matrix>::const_iterator it3; 00733 for(unsigned int i = 0; i < states->size(); i++){ 00734 SymbolPtr state_name = states->getSymbol(i); 00735 DiscreteIIDModelPtr e ; 00736 DiscreteIIDModelPtr t ; 00737 IntVector itr; 00738 IntVector otr; 00739 int numEmiss1, numEmiss2; 00740 std::map<std::string,IntVector>::const_iterator inTransIt; 00741 00742 numEmissIt = numEmiss.find(state_name->name()); 00743 if(numEmissIt != numEmiss.end()){ 00744 std::vector<std::string> splited; 00745 boost::regex separator(","); 00746 split_regex(numEmissIt->second, splited, separator); 00747 numEmiss1 = atoi(splited[0].c_str()); 00748 numEmiss2 = atoi(splited[1].c_str()); 00749 } 00750 00751 it3 = emiss.find(state_name->name()); 00752 if(it3 != emiss.end()) 00753 e = DiscreteIIDModelPtr(new DiscreteIIDModel(it3->second)); 00754 00755 if(it3 == emiss.end() && numEmiss1 == 0 && numEmiss2 == 0) 00756 e = DiscreteIIDModelPtr(new DiscreteIIDModel()); 00757 00758 it2 = trans.find(state_name->name()); 00759 if(it2 != trans.end()) 00760 t = DiscreteIIDModelPtr(new DiscreteIIDModel(it2->second)); 00761 00762 else{ 00763 t = DiscreteIIDModelPtr(new DiscreteIIDModel()); 00764 } 00765 00766 inTransIt = inTrans.find(state_name->name()); 00767 if(inTransIt != inTrans.end()) 00768 itr = inTransIt->second; 00769 else{ 00770 begin++; 00771 _begin_id = state_list.size(); 00772 IntVector incoming; 00773 itr = incoming; 00774 } 00775 00776 inTransIt = outTrans.find(state_name->name()); 00777 if(inTransIt != outTrans.end()) 00778 otr = inTransIt->second; 00779 else{ 00780 end++; 00781 _end_id = state_list.size(); 00782 IntVector outgoing; 00783 otr = outgoing; 00784 } 00785 00786 PHMMStatePtr statePtr; 00787 if(numEmiss1 == 0 && numEmiss2 == 0){ 00788 statePtr = PHMMStatePtr(new SilentState(state_list.size(), state_name, e, t, itr, otr)); 00789 silent_state_list.push_back(statePtr); 00790 } 00791 if(numEmiss1 == 1 && numEmiss2 == 1){ 00792 statePtr = PHMMStatePtr(new MatchState(state_list.size(), state_name, e, t, itr, otr)); 00793 } 00794 if(numEmiss1 == 0 && numEmiss2 == 1){ 00795 statePtr = PHMMStatePtr(new Gap1State(state_list.size(), state_name, e, t, itr, otr)); 00796 } 00797 if(numEmiss1 == 1 && numEmiss2 == 0){ 00798 statePtr = PHMMStatePtr(new Gap2State(state_list.size(), state_name, e, t, itr, otr)); 00799 } 00800 state_list.push_back(statePtr); 00801 } 00802 if(end != 1){ 00803 cerr << "An end state must be defined." << endl; 00804 exit(-1); 00805 } 00806 if(begin != 1){ 00807 cerr << "A begin state must be defined." << endl; 00808 exit(-1); 00809 } 00810 setStates(state_list, states); 00811 //if(silent_state_list.size() > 1) 00812 //silentStatesSort(silent_state_list); 00813 setSilentStates(silent_state_list); 00814 setObservationSymbols(observations); 00815 } 00816 00820 00821 /* void PairHiddenMarkovModel::silentStatesSort(vector<PHMMStatePtr> silStates){ 00822 int swap = 1; 00823 while(swap == 1){ 00824 swap = 0; 00825 for(int i = 0; i < (int)silStates.size() - 1; i++){ 00826 IntVector v = silStates[i]->iTransitions(); 00827 std::vector<int>::iterator it; 00828 it = find(v.begin(), v.end(), silStates[i+1]->getId()); 00829 if(it != v.end()){ 00830 PHMMStatePtr aux = silStates[i]; 00831 silStates[i] = silStates[i+1]; 00832 silStates[i+1] = aux; 00833 swap = 1; 00834 } 00835 } 00836 } 00837 }*/ 00838 00839 00840 /* std::string PairHiddenMarkovModel::str () const 00841 { 00842 int nstates = _states.size(); 00843 std::stringstream out; 00844 out << "model_name = \"PairHiddenMarkovModel\"" << std::endl; 00845 out << "state_names = (" ; 00846 out << "\"" << getStateName(0) << "\""; 00847 for(int i = 1; i < (int)getStateNames()->size(); i++) 00848 out << ",\"" << getStateName(i) << "\""; 00849 out << ")" << std::endl; 00850 00851 out << "observation_symbols = (" ; 00852 out << "\"" << alphabet()->getSymbol(0)->name() << "\""; 00853 for(int i = 1; i < (int)alphabet()->size(); i++) 00854 out << ",\"" << alphabet()->getSymbol(i)->name() << "\""; 00855 out << ")" << std::endl; 00856 00857 out << "transitions = (" ; 00858 int it = 0; 00859 for(int i = 0; i < nstates; i++){ 00860 for(int j = 0; j < nstates; j++){ 00861 if(close(exp(_states[i]->transitions()->log_probability_of(j)), 0.0, 1e-10)) 00862 continue; 00863 if(it == 0){ 00864 out << "\"" << getStateName(i) << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->transitions()->log_probability_of(j)) ; 00865 it++; 00866 } 00867 else 00868 out << ";\n \"" << getStateName(j) << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->transitions()->log_probability_of(j)); 00869 } 00870 } 00871 out << ";)" << std::endl; 00872 00873 00874 out << "emission_probabilities = (" ; 00875 it = 0; 00876 for(int i = 0; i < nstates; i++){ 00877 for(int k = 0; k < (int)alphabet()->size(); k++){ 00878 for(int j = 0; j < (int)alphabet()->size(); j++){ 00879 if(close(exp(_states[i]->emission()->log_probability_of_pair(k,j)), 0.0, 1e-10)) 00880 continue; 00881 if(it == 0){ 00882 out << "\"" << alphabet()->getSymbol(k)->name() << alphabet()->getSymbol(j)->name() << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->emission()->log_probability_of_pair(k,j)) ; 00883 it++; 00884 } 00885 else 00886 out << ";\n \"" << alphabet()->getSymbol(k)->name() << alphabet()->getSymbol(j)->name() << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->emission()->log_probability_of_pair(k,j)); 00887 } 00888 } 00889 } 00890 out << ";)" << std::endl; 00891 00892 out << "number_of_emissions = ("; 00893 out << "\"" << getStateName(0) << "\" : \"" << _states[0]->eSeq1() << "," << _states[0]->eSeq2() << "\""; 00894 for(int i = 1; i < nstates; i++) 00895 out << ";\n\"" << getStateName(i) << "\" : \"" << _states[i]->eSeq1() << "," << _states[i]->eSeq2() << "\""; 00896 out << ";)" << endl; 00897 return out.str(); 00898 } */ 00899 00900 /* void PairHiddenMarkovModel::generateSequence(Sequence &seq1, Sequence &seq2, Sequence &path){ 00901 Sequence s1, s2, p; 00902 int state_id = _begin_id; 00903 int obs1 = _gap_id; 00904 int obs2 = _gap_id; 00905 p.push_back(state_id); 00906 s1.push_back(obs1); 00907 s2.push_back(obs2); 00908 00909 while(state_id != _end_id){ 00910 state_id = getPHMMState(state_id)->transitions()->choose(); 00911 if(_states[state_id]->eSeq1() == 0 && _states[state_id]->eSeq2() == 0){ 00912 p.push_back(state_id); 00913 s1.push_back(_gap_id); 00914 s2.push_back(_gap_id); 00915 continue; 00916 } 00917 _states[state_id]->emission()->choosePair(&obs1,&obs2); 00918 p.push_back(state_id); 00919 s1.push_back(obs1); 00920 s2.push_back(obs2); 00921 } 00922 seq1 = s1; 00923 seq2 = s2; 00924 path = p; 00925 }*/ 00926 } 00927 00928