ToPS
PairHiddenMarkovModel.cpp
00001 /*
00002  *       PairHiddenMarkovModel.cpp
00003  *
00004  *      Copyright 2011  Vitor Onuchic <vitoronuchic@gmail.com>
00005  *                      Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00006  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "PairHiddenMarkovModel.hpp"
00026 
00027 namespace tops {
00028 
00032 
00033   float PairHiddenMarkovModel::posteriorProbabilities (const Sequence &seq1, const Sequence &seq2, SparseMatrixPtr &sparsePPMatch,SparseMatrixPtr &sparsePPGap1, SparseMatrixPtr &sparsePPGap2)
00034   {
00035     int nstates = _states.size();
00036     int length1 = seq1.size();
00037     int length2 = seq2.size();
00038     fMatrix ppMatch(length1+1,length2+1);
00039     fMatrix ppGap1(length1+1,length2+1);
00040     fMatrix ppGap2(length1+1,length2+1);
00041     vector<Matrix> alpha; // forward
00042     vector<Matrix> beta;  // backward
00043 
00044     double full = forward(seq1, seq2, alpha);
00045     backward(seq1, seq2, beta);
00046     for(int i = 0; i <= length1; i++){
00047       for(int j = 0; j <= length2; j++){
00048   ppMatch(i,j) = 0;
00049   ppGap1(i,j) = 0;
00050   ppGap2(i,j) = 0;
00051   for(int k = 0; k < nstates; k++)
00052     _states[k]->postProbSum(ppMatch,ppGap1,ppGap2,alpha[k],beta[k],full,i,j);
00053       }
00054     }
00055     float ea = expectedAccuracy(length1+1,length2+1,ppMatch);
00056     sparsePPMatch = SparseMatrixPtr(new SparseMatrix(length1+1,length2+1,ppMatch));
00057     sparsePPGap1 = SparseMatrixPtr(new SparseMatrix(length1+1,length2+1,ppGap1));
00058     sparsePPGap2 = SparseMatrixPtr(new SparseMatrix(length1+1,length2+1,ppGap2));
00059 
00060     return ea;
00061   }
00062 
00063   void MatchState::postProbSum(fMatrix &ppMatch, fMatrix &ppGap1, fMatrix &ppGap2, Matrix &alpha, Matrix &beta, double full, int i, int j){
00064     ppMatch(i,j) += exp(alpha(i,j) + beta(i,j) - full);
00065   }
00066   void Gap1State::postProbSum(fMatrix &ppMatch, fMatrix &ppGap1, fMatrix &ppGap2, Matrix &alpha, Matrix &beta, double full, int i, int j){
00067     ppGap1(i,j) += exp(alpha(i,j) + beta(i,j) - full);
00068   }
00069   void Gap2State::postProbSum(fMatrix &ppMatch, fMatrix &ppGap1, fMatrix &ppGap2, Matrix &alpha, Matrix &beta, double full, int i, int j){
00070     ppGap2(i,j) += exp(alpha(i,j) + beta(i,j) - full);
00071   }
00072 
00073   /*float PairHiddenMarkovModel::expectedAccuracyWithGaps(SparseMatrixPtr postProbs, SparseMatrixPtr gap1Probs, SparseMatrixPtr gap2Probs){
00074     int size1 = postProbs->nrows();
00075     int size2 = postProbs->ncols();
00076     SparseMatrixPtr ea = SparseMatrixPtr(new SparseMatrix(size1,size2));
00077     SparseMatrixPtr ptrs = SparseMatrixPtr(new SparseMatrix(size1,size2));
00078     for(int i = 0; i < size1; i++){
00079       for(int j = 0; j < size2; j++){
00080   if(i == 0 && j == 0){
00081     continue;
00082   }
00083   if(i == 0){
00084     ea->add(i,j,ea->get(i,j-1)+gap1Probs->get(i,j));
00085     ptrs->add(i,j,2);
00086     continue;
00087   }
00088   if(j == 0){
00089     ea->add(i,j,ea->get(i-1,j)+gap2Probs->get(i,j));
00090     ptrs->add(i,j,1);
00091     continue;
00092   }
00093   ea->add(i,j,ea->get(i-1,j)+gap2Probs->get(i,j));
00094   ptrs->add(i,j,1);
00095   if(ea->get(i,j) <= ea->get(i,j-1)+gap1Probs->get(i,j)){
00096     ea->add(i,j,ea->get(i,j-1)+gap1Probs->get(i,j));
00097     ptrs->add(i,j,2);
00098   }
00099   if(ea->get(i,j) <= ea->get(i-1,j-1)+postProbs->get(i,j)){
00100     ea->add(i,j,ea->get(i-1,j-1)+postProbs->get(i,j));
00101     ptrs->add(i,j,3);
00102   }
00103       }
00104     }
00105     int i = size1-1;
00106     int j = size2-1;
00107     int allen = 0;
00108     while(i > 0 || j > 0){
00109       allen++;
00110       float ptr = ptrs->get(i,j);
00111       if(ptr == 1)
00112   i--;
00113       if(ptr == 2)
00114   j--;
00115       if(ptr == 3){
00116   i--;
00117   j--;
00118       }
00119     }
00120     float mea = ea->get(size1-1,size2-1)/allen;
00121     return mea;
00122     }*/
00123 
00124   float PairHiddenMarkovModel::expectedAccuracy(int size1, int size2, fMatrix &postProbs){
00125     int allen = size1;
00126     if(size2 < allen)
00127       allen = size2;
00128     fMatrix ea(size1,size2);
00129 
00130     for(int i = 0; i < size1; i++)
00131       ea(i,0) = 0.0;
00132     for(int j = 1; j < size2; j++)
00133       ea(0,j) = 0.0;
00134 
00135     for(int i = 1; i < size1; i++){
00136       for(int j = 1; j < size2; j++){
00137   ea(i,j) = ea(i-1,j);
00138   if(ea(i,j) <= ea(i,j-1)){
00139     ea(i,j) = ea(i,j-1);
00140   }
00141   if(ea(i,j) <= ea(i-1,j-1)+postProbs(i,j)){
00142     ea(i,j) = ea(i-1,j-1)+postProbs(i,j);
00143   }
00144       }
00145     }
00146     return ea(size1-1,size2-1)/allen;;
00147   }
00148 
00149   /*  double PairHiddenMarkovModel::viterbi(const Sequence & seq1, const Sequence & seq2, Sequence & statePath, Sequence & alignment1, Sequence & alignment2, vector<Matrix> &a)
00150   {
00151     int nstates = _states.size();
00152     int nsilentstates = _silent_states.size();
00153     int length1 = seq1.size();
00154     int length2 = seq2.size();
00155     vector<Matrix> alpha;
00156     Sequence al1;
00157     Sequence al2;
00158     Sequence path;
00159     vector<IntMatrix> pathMatrix;
00160     alpha.resize(nstates);
00161     pathMatrix.resize(nstates);
00162     for(int i = 0; i < nstates; i++){
00163       (alpha[i]).resize(length1+1,length2+1);
00164       (pathMatrix[i]).resize(length1+1,length2+1);
00165     }
00166 
00167     //Initialization
00168     for(int k = 0; k < nstates; k++){
00169       if(k == (int)_begin_id){
00170         alpha[k](0,0) = 0.0;
00171         continue;
00172       }
00173       else
00174         alpha[k](0,0) = -HUGE;
00175     }
00176 
00177     //Recursion
00178     for (int i = 0; i <= length1; i++){
00179       for(int j = 0; j <= length2; j++){
00180         for(int k = 0; k < nstates; k++){
00181           if(i == 0 && j == 0)
00182             continue;
00183           if(k == (int)_begin_id){
00184             alpha[k](i,j) = -HUGE;
00185             if(i > 0 && j > 0)
00186               pathMatrix[k](i,j) = -1;
00187             continue;
00188           }
00189           int ne1 = _states[k]->eSeq1();
00190           int ne2 = _states[k]->eSeq2();
00191           if(ne1 == 0 && ne2 == 0)
00192             continue;
00193 
00194           alpha[k](i,j) = -HUGE;
00195           if(i - ne1 < 0 || j - ne2 < 0){
00196             pathMatrix[k](i,j) = -1;
00197             continue;
00198           }
00199 
00200           double aux = -HUGE;
00201           for(int l = 0; l < (int)(_states[k]->iTransitions()).size(); l++){
00202             int id = _states[k]->getITransId(l);
00203             aux = alpha[id](i-ne1,j-ne2) + _states[id]->transitions()->log_probability_of(k);
00204             if(aux > alpha[k](i,j)){
00205               alpha[k](i,j) = aux;
00206               pathMatrix[k](i,j) = id;
00207             }
00208           }
00209           if(ne1 > 0 && ne2 > 0)
00210             alpha[k](i,j) += _states[k]->emission()->log_probability_of_pair(seq1[i-1],seq2[j-1]);
00211           else if(ne1 > 0 && ne2 == 0)
00212             alpha[k](i,j) += _states[k]->emission()->log_probability_of_pair(seq1[i-1],_gap_id);
00213           else
00214             alpha[k](i,j) += _states[k]->emission()->log_probability_of_pair(_gap_id,seq2[j-1]);
00215         }
00216         for(int k = 0; k < nsilentstates; k++){
00217           if(i == 0 && j == 0)
00218             continue;
00219           int id1 = getSilentId(k);
00220           alpha[id1](i,j) = -HUGE;
00221           if(id1 == (int)_begin_id)
00222             continue;
00223           for(int l = 0; l < (int)(_states[id1]->iTransitions()).size(); l++){
00224             int id2 = _states[id1]->getITransId(l);
00225             double aux = alpha[id2](i,j) + _states[id2]->transitions()->log_probability_of(id1);
00226             if(aux > alpha[id1](i,j)){
00227               alpha[id1](i,j) = aux;
00228               pathMatrix[id1](i,j) = id2;
00229             }
00230           }
00231         }
00232       }
00233     }
00234 
00235     //Termination
00236     double score = -HUGE;
00237     int p;
00238     for(int i = 0; i < nstates; i++){
00239       alpha[i](length1,length2) += _states[i]->transitions()->log_probability_of(_end_id);
00240       if(score < alpha[i](length1,length2)){
00241         score = alpha[i](length1,length2);
00242         p = i;
00243       }
00244     }
00245     a = alpha;
00246 
00247     //Trace back
00248     path.push_back(_end_id);
00249     al1.push_back(_gap_id);
00250     al2.push_back(_gap_id);
00251     while(length1 >= 0 || length2 >= 0){
00252       path.push_back(p);
00253       if(_states[p]->eSeq1() == 0 && _states[p]->eSeq2() == 0){
00254         al1.push_back(_gap_id);
00255         al2.push_back(_gap_id);
00256         if(length1 == 0 && length2 == 0)
00257           break;
00258         p = pathMatrix[p](length1,length2);
00259       }
00260       else if(_states[p]->eSeq1() > 0 && _states[p]->eSeq2() == 0){
00261         al1.push_back(seq1[length1-1]);
00262         al2.push_back(_gap_id);
00263         p = pathMatrix[p](length1,length2);
00264         length1--;
00265       }
00266       else if(_states[p]->eSeq1() == 0 && _states[p]->eSeq2() > 0){
00267         al1.push_back(_gap_id);
00268         al2.push_back(seq2[length2-1]);
00269         p = pathMatrix[p](length1,length2);
00270         length2--;
00271       }
00272       else{
00273         al1.push_back(seq1[length1-1]);
00274         al2.push_back(seq2[length2-1]);
00275         p = pathMatrix[p](length1,length2);
00276         length1--;
00277         length2--;
00278       }
00279     }
00280     //path.push_back(p);
00281     alignment1.resize(al1.size());
00282     alignment2.resize(al2.size());
00283     statePath.resize(path.size());
00284     for(unsigned int i = 0; i < al1.size(); i++){
00285       alignment1[i] = al1[al1.size()-1-i];
00286       alignment2[i] = al2[al2.size()-1-i];
00287     }
00288     for(unsigned int i = 0; i < path.size(); i++)
00289       statePath[i] = path[path.size() - i - 1];
00290 
00291     return score;
00292     }*/
00293 
00294   void MatchState::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){
00295     if(i == 1 && j == 1){
00296       a[_id](i,j) = states[begin_id]->transitions()->log_probability_of(_id) + _emission->log_probability_of_pair(seq1[0],seq2[0]);
00297       return;
00298     }
00299 
00300     if(i == 0 || j == 0){
00301       a[_id](i,j) = -HUGE;
00302       return;
00303     }
00304 
00305     vector<int>::iterator predStateNumIt = _incomingTransitions.begin();
00306     a[_id](i,j) = a[*predStateNumIt](i-1,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id);
00307     for(predStateNumIt = _incomingTransitions.begin()+1; predStateNumIt != _incomingTransitions.end(); predStateNumIt++)
00308       a[_id](i,j) = log_sum(a[_id](i,j), a[*predStateNumIt](i-1,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id));
00309 
00310     a[_id](i,j) += _emission->log_probability_of_pair(seq1[i-1],seq2[j-1]);
00311   }
00312 
00313   void Gap1State::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){
00314     if(i == 0 && j == 1){
00315       a[_id](i,j) = states[begin_id]->transitions()->log_probability_of(_id) + _emission->log_probability_of_pair(gap_id,seq2[0]);
00316       return;
00317     }
00318 
00319     if(j == 0){
00320       a[_id](i,j) = -HUGE;
00321       return;
00322     }
00323 
00324     vector<int>::iterator predStateNumIt = _incomingTransitions.begin();
00325     a[_id](i,j) = a[*predStateNumIt](i,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id);
00326     for(predStateNumIt = _incomingTransitions.begin()+1; predStateNumIt != _incomingTransitions.end(); predStateNumIt++)
00327       a[_id](i,j) = log_sum(a[_id](i,j), a[*predStateNumIt](i,j-1) + states[*predStateNumIt]->transitions()->log_probability_of(_id));
00328 
00329     a[_id](i,j) += _emission->log_probability_of_pair(gap_id,seq2[j-1]);
00330   }
00331 
00332   void Gap2State::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){
00333     if(i == 1 && j == 0){
00334       a[_id](i,j) = states[begin_id]->transitions()->log_probability_of(_id) + _emission->log_probability_of_pair(seq1[0],gap_id);
00335       return;
00336     }
00337 
00338     if(i == 0){
00339       a[_id](i,j) = -HUGE;
00340       return;
00341     }
00342 
00343     vector<int>::iterator predStateNumIt = _incomingTransitions.begin();
00344     a[_id](i,j) = a[*predStateNumIt](i-1,j) + states[*predStateNumIt]->transitions()->log_probability_of(_id);
00345     for(predStateNumIt = _incomingTransitions.begin()+1; predStateNumIt != _incomingTransitions.end(); predStateNumIt++)
00346       a[_id](i,j) = log_sum(a[_id](i,j), a[*predStateNumIt](i-1,j) + states[*predStateNumIt]->transitions()->log_probability_of(_id));
00347 
00348     a[_id](i,j) += _emission->log_probability_of_pair(seq1[i-1],gap_id);
00349   }
00350 
00351   void SilentState::forwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int begin_id){
00352     return;
00353   }
00354 
00355   double PairHiddenMarkovModel::forward(const Sequence & seq1, const Sequence & seq2, vector<Matrix> &a)
00356   {
00357     int nstates = _states.size();
00358     int length1 = seq1.size();
00359     int length2 = seq2.size();
00360 
00361     a.resize(nstates);
00362     for(int i = 0; i < nstates; i++){
00363       if(i != _begin_id && i != _end_id)
00364   (a[i]).resize(length1+1,length2+1);
00365       else
00366   (a[i]).resize(0,0);
00367     }
00368 
00369     //Recursion
00370     vector<PHMMStatePtr>::iterator currState;
00371     for (int i = 0; i <= length1; i++){
00372       for(int j = 0; j <= length2; j++){
00373         for(currState = _states.begin(); currState != _states.end(); currState++){
00374     (*currState)->forwardSum(_states,seq1,seq2,a,i,j,_gap_id,_begin_id);
00375   }
00376       }
00377     }
00378 
00379     //Termination
00380     double sum = -HUGE;
00381     for(currState = _states.begin(); currState != _states.end(); currState++){
00382       if(!(*currState)->isSilent())
00383   sum = log_sum(sum, a[(*currState)->getId()](length1,length2) + (*currState)->transitions()->log_probability_of(_end_id));
00384     }
00385     return sum;
00386   }
00387 
00388   void MatchState::backwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int currStateId, double *accumulator){
00389     if(i == (int)seq1.size() || j == (int)seq2.size())
00390       return;
00391     (*accumulator) = log_sum((*accumulator),
00392            a[_id](i+1,j+1) +
00393            states[currStateId]->transitions()->log_probability_of(_id) +
00394            _emission->log_probability_of_pair(seq1[i],seq2[j]));
00395   }
00396 
00397   void Gap1State::backwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int currStateId, double *accumulator){
00398     if(j == (int)seq2.size())
00399       return;
00400     (*accumulator) = log_sum((*accumulator),
00401            a[_id](i,j+1) +
00402            states[currStateId]->transitions()->log_probability_of(_id) +
00403            _emission->log_probability_of_pair(gap_id,seq2[j]));
00404   }
00405 
00406   void Gap2State::backwardSum(vector<PHMMStatePtr> &states, const Sequence &seq1, const Sequence &seq2, vector<Matrix> &a, int i, int j, int gap_id, int currStateId, double *accumulator){
00407     if(i == (int)seq1.size())
00408       return;
00409     (*accumulator) = log_sum((*accumulator),
00410            a[_id](i+1,j) +
00411            states[currStateId]->transitions()->log_probability_of(_id) +
00412            _emission->log_probability_of_pair(seq1[i],gap_id));
00413   }
00414 
00415   double PairHiddenMarkovModel::backward(const Sequence & seq1, const Sequence & seq2, vector<Matrix> &a)
00416   {
00417     int nstates = _states.size();
00418     int length1 = seq1.size();
00419     int length2 = seq2.size();
00420     a.resize(nstates);
00421     for(int i = 0; i < nstates; i++){
00422       if(i == _end_id || i == _begin_id){
00423   (a[i]).resize(0,0);
00424   continue;
00425       }
00426       (a[i]).resize(length1+1,length2+1);
00427     }
00428 
00429     //Initialization
00430     for(int k = 0; k < nstates; k++){
00431       if(k != _end_id && k != _begin_id)
00432   a[k](length1,length2) = _states[k]->transitions()->log_probability_of(_end_id);
00433     }
00434 
00435     //Recursion
00436     IntVector::iterator nextStateNumIt;
00437     double accumulator;
00438     for (int i = length1; i >= 0; i--){
00439       for(int j = length2; j >= 0; j--){
00440   if(i == length1 && j == length2)
00441     continue;
00442         for(int currStateId = 0; currStateId < nstates; currStateId++){
00443     if(_states[currStateId]->isSilent())
00444       continue;
00445     accumulator = -HUGE;
00446     for(nextStateNumIt = _states[currStateId]->outTransitions().begin(); nextStateNumIt != _states[currStateId]->outTransitions().end(); nextStateNumIt++)
00447       _states[*nextStateNumIt]->backwardSum(_states,seq1,seq2,a,i,j,_gap_id,currStateId,&accumulator);
00448     a[currStateId](i,j) = accumulator;
00449         }
00450       }
00451     }
00452 
00453     //Termination
00454     accumulator = -HUGE;
00455     for(nextStateNumIt = _states[_begin_id]->outTransitions().begin(); nextStateNumIt != _states[_begin_id]->outTransitions().end(); nextStateNumIt++)
00456       _states[*nextStateNumIt]->backwardSum(_states,seq1,seq2,a,0,0,_gap_id,_begin_id,&accumulator);
00457     return accumulator;
00458   }
00459 
00463 
00464   /*  void PairHiddenMarkovModel::trainBaumWelch(SequenceList & sample, int maxiterations, double diff_threshold)
00465   {
00466     int nstates = _states.size();
00467     int alphabet_size = alphabet()->size();
00468 
00469     double diff = HUGE;
00470     double last = -HUGE;
00471     if(maxiterations < 0)
00472       maxiterations = 500;
00473 
00474     int iterations;
00475     for(iterations = 0; iterations < maxiterations; iterations++){
00476       DoubleVector sumA (nstates);
00477       DoubleVector sumE (nstates);
00478       double sumP = -HUGE;
00479       Matrix A (nstates, nstates);
00480       for(int a = 0; a < nstates; a++){
00481         for(int b = 0; b < nstates; b++){
00482     if(!close(exp(_states[a]->transitions()->log_probability_of(b)), 0.0, 1e-10))
00483       A(a,b) = _states[a]->transitions()->log_probability_of(b) + log(3);
00484     else
00485       A(a,b) = -HUGE;
00486         }
00487       }
00488       vector<Matrix> E;
00489       E.resize(nstates);
00490       for(int i = 0; i < nstates; i++){
00491         E[i].resize(alphabet_size,alphabet_size);
00492         sumA[i] = -HUGE;
00493         sumE[i] = -HUGE;
00494         for(int a = 0; a < alphabet_size; a++){
00495           for(int b = 0; b < alphabet_size; b++){
00496       if(!close(exp(_states[i]->emission()->log_probability_of_pair(a,b)), 0.0, 1e-10)){
00497         if(_states[i]->eSeq1() > 0 && _states[i]->eSeq2() > 0)
00498     E[i](a,b) = _states[i]->emission()->log_probability_of_pair(a,b) + log(210);
00499         else
00500     E[i](a,b) = _states[i]->emission()->log_probability_of_pair(a,b) + log(19);
00501       }
00502       else
00503         E[i](a,b) = -HUGE;
00504           }
00505   }
00506       }
00507       for(int s1 = 0; s1 < (int)sample.size(); s1++){
00508   for(int s2 = s1+1; s2 < (int)sample.size(); s2++){
00509     vector<Matrix> alpha;
00510     vector<Matrix> beta;
00511     int length1 = sample[s1].size();
00512     int length2 = sample[s2].size();
00513 
00514     double P = forward(sample[s1], sample[s2], alpha);
00515     sumP = log_sum(sumP,P);
00516     backward(sample[s1], sample[s2], beta);
00517 
00518 
00519     for(int k = 0; k < nstates; k++){
00520       for(int l = 0; l < nstates; l++){
00521         if(l == _begin_id || k == _end_id){
00522     continue;
00523         }
00524         if(k == _begin_id && l != _end_id){
00525     A(k,l) = log_sum(A(k,l), _states[k]->transitions()->log_probability_of(l) + beta[l](0,0) - P);
00526     continue;
00527         }
00528         if(l == _end_id && k != _begin_id){
00529     A(k,l) = log_sum(A(k,l), alpha[k](length1,length2) + _states[k]->transitions()->log_probability_of(l)-P);
00530     continue;
00531         }
00532         if(k == _begin_id && l == _end_id){
00533     continue;
00534         }
00535         for(int i = 0; i <= length1; i++){
00536     for(int j = 0; j <= length2; j++){
00537       int ne1 = getPHMMState(l)->eSeq1();
00538       int ne2 = getPHMMState(l)->eSeq2();
00539       if((i+ne1 > length1) || (j+ne2 > length2))
00540         continue;
00541       if(k == 0) {
00542         if(ne1 == 1 && ne2 == 1)
00543           E[l](sample[s1][i],sample[s2][j]) = log_sum(E[l](sample[s1][i],sample[s2][j]), alpha[l](i+ne1,j+ne2) + beta[l](i+ne1,j+ne2) - P);
00544         if(ne1 == 0 && ne2 == 1)
00545           E[l](_gap_id,sample[s2][j]) = log_sum(E[l](_gap_id,sample[s2][j]), alpha[l](i+ne1,j+ne2) + beta[l](i+ne1,j+ne2) - P);
00546         if(ne1 == 1 && ne2 == 0)
00547           E[l](sample[s1][i],_gap_id) = log_sum(E[l](sample[s1][i],_gap_id), alpha[l](i+ne1,j+ne2) + beta[l](i+ne1,j+ne2) - P);
00548       }
00549 
00550       if(ne1 == 1 && ne2 == 1)
00551         A(k,l) = log_sum(A(k,l), alpha[k](i,j) +
00552              _states[k]->transitions()->log_probability_of(l) +
00553              getPHMMState(l)->emission()->log_probability_of_pair(sample[s1][i],sample[s2][j]) +
00554              beta[l](i+ne1,j+ne2)-P);
00555       if(ne1 == 0 && ne2 == 1)
00556         A(k,l) = log_sum(A(k,l), alpha[k](i,j) +
00557              _states[k]->transitions()->log_probability_of(l) +
00558              getPHMMState(l)->emission()->log_probability_of_pair(_gap_id,sample[s2][j]) +
00559              beta[l](i+ne1,j+ne2)-P);
00560       if(ne1 == 1 && ne2 == 0)
00561         A(k,l) = log_sum(A(k,l), alpha[k](i,j) +
00562              _states[k]->transitions()->log_probability_of(l) +
00563              getPHMMState(l)->emission()->log_probability_of_pair(sample[s1][i],_gap_id) +
00564              beta[l](i+ne1,j+ne2)-P);
00565       if(ne1 == 0 && ne2 == 0)
00566         A(k,l) = log_sum(A(k,l), alpha[k](i,j) +
00567              _states[k]->transitions()->log_probability_of(l) +
00568              beta[l](i+ne1,j+ne2)-P);
00569     }
00570         }
00571       }
00572     }
00573   }
00574       }
00575       for(int k = 0; k < nstates; k++){
00576   for(int l = 0; l < nstates; l++){
00577     sumA[k] = log_sum(sumA[k], A(k,l));
00578   }
00579       }
00580       for(int k = 0; k < nstates; k++){
00581   for(int a = 0; a < alphabet_size; a++){
00582     for(int b = 0; b < alphabet_size; b++){
00583       double s = log_sum(E[k](a,b), E[k](b,a));
00584       E[k](a,b) = E[k](b,a) = s-log(2);
00585       sumE[k] = log_sum(sumE[k], E[k](a,b));
00586     }
00587   }
00588       }
00589       for(int k = 0; k < nstates; k++){
00590   if(_states[k]->eSeq1() == 0 && _states[k]->eSeq2() == 0)
00591     continue;
00592   for(int a = 0; a < alphabet_size; a++){
00593     for(int b = 0; b < alphabet_size; b++){
00594       E[k](a,b) -= sumE[k];
00595       _states[k]->emission()->log_probability_of_pair(a,b,E[k](a,b));
00596     }
00597   }
00598       }
00599       for(int k = 0; k < nstates; k++){
00600   if(_states[k]->oTransitions().size() == 0)
00601     continue;
00602   for(int l = 0; l < nstates; l++){
00603     A(k,l) -= sumA[k];
00604     _states[k]->transitions()->log_probability_of(l, A(k,l));
00605   }
00606       }
00607       //cerr << "sumP = " << sumP << endl;
00608       diff = fabs(last - sumP);
00609       if(diff < diff_threshold)
00610   break;
00611       last = sumP;
00612     }
00613     //cerr << "Stoped at iteration: " << iterations << endl;
00614     //cerr << "Diff: " << diff << endl;
00615     }*/
00616 
00620 
00621   void PairHiddenMarkovModel::initialize(const ProbabilisticModelParameters & parameters) {
00622     ProbabilisticModelParameterValuePtr state_names = parameters.getMandatoryParameterValue("state_names");
00623     ProbabilisticModelParameterValuePtr observation_symbols = parameters.getMandatoryParameterValue("observation_symbols");
00624     ProbabilisticModelParameterValuePtr number_of_emissions = parameters.getMandatoryParameterValue("number_of_emissions");
00625     ProbabilisticModelParameterValuePtr transitions = parameters.getMandatoryParameterValue("transitions");
00626     ProbabilisticModelParameterValuePtr emissions = parameters.getMandatoryParameterValue("emission_probabilities");
00627 
00628     int end = 0;
00629     int begin = 0;
00630     std::vector<PHMMStatePtr> state_list;
00631     std::vector<PHMMStatePtr> silent_state_list;
00632 
00633     AlphabetPtr states = AlphabetPtr(new Alphabet());
00634     states->initializeFromVector(state_names->getStringVector());
00635 
00636     AlphabetPtr observations = AlphabetPtr(new Alphabet());
00637     observations->initializeFromVector(observation_symbols->getStringVector());
00638     if(observations->has("-"))
00639       _gap_id = observations->getSymbol("-")->id();
00640     else{
00641       observations->createSymbol("-");
00642       _gap_id = observations->getSymbol("-")->id();
00643     }
00644 
00645     //Initializing the emission probabilities
00646     std::map<std::string,double> emisspar = emissions->getDoubleMap();
00647     std::map<std::string,double>::const_iterator it;
00648     std::map<std::string,Matrix> emiss;
00649     std::map<std::string,Matrix>::iterator eit;
00650     for(it = emisspar.begin(); it != emisspar.end(); it++){
00651       std::vector<std::string> splited;
00652       boost::regex separator("\\|");
00653       split_regex(it->first, splited, separator);
00654       if(splited.size() == 1)
00655         splited.push_back("");
00656       std::string symbol(splited[0]);
00657       std::string state(splited[1]);
00658       if(symbol.length() < 2){
00659         cerr << "Emission " << symbol << " is not a pair of observation symbols." << endl;
00660         exit(-1);
00661       }
00662       eit = emiss.find(state);
00663        if(eit == emiss.end()){
00664         int id1 = observations->getSymbol(symbol.substr(0,1))->id();
00665         int id2 = observations->getSymbol(symbol.substr(1,1))->id();
00666         emiss[state] = Matrix (observations->size(),observations->size());
00667         for(int i = 0; i < (int)observations->size(); i++){
00668             for(int j = 0; j < (int)observations->size(); j++){
00669             emiss[state](i,j) = 0.0;
00670           }
00671         }
00672         if((id1 >= 0) && (id1 < (int)(emiss[state]).size1()) && (id2 >= 0) && (id2 < (int)(emiss[state]).size2()))
00673           (emiss[state])(id1,id2) = it->second;
00674       }
00675       else{
00676         int id1 = observations->getSymbol(symbol.substr(0,1))->id();
00677         int id2 = observations->getSymbol(symbol.substr(1,1))->id();
00678         if((id1 >= 0) && (id1 < (int)(eit->second).size1()) && (id2 >= 0) && (id2 < (int)(eit->second).size2()))
00679           emiss[state](id1,id2) = it->second;
00680       }
00681     }
00682 
00683     //Initializing the transition probabilities
00684     std::map<std::string,double> transpar = transitions->getDoubleMap();
00685     std::map<std::string,DoubleVector> trans;
00686     std::map<std::string,DoubleVector>::const_iterator it2;
00687     std::map<std::string,IntVector> inTrans;
00688     std::map<std::string,IntVector> outTrans;
00689     for(it = transpar.begin(); it != transpar.end(); it++){
00690       std::vector<std::string> splited;
00691       boost::regex separator("\\|");
00692       split_regex(it->first, splited, separator);
00693       if(splited.size() == 1)
00694         splited.push_back("");
00695 
00696       std::string to(splited[0]);
00697       std::string from(splited[1]);
00698       if(trans.find(from) == trans.end())
00699         {
00700           int id = states->getSymbol(to)->id();
00701           DoubleVector probs;
00702           IntVector outgoing;
00703           outgoing.push_back(id);
00704           outTrans[from] = outgoing;
00705           probs.resize(states->size());
00706           trans[from]=probs;
00707           if(id < (int)trans[from].size())
00708             trans[from][id] = it->second;
00709         }
00710       else
00711         {
00712           int id = states->getSymbol(to)->id();
00713           outTrans[from].push_back(id);
00714           if(id < (int)trans[from].size())
00715             trans[from][id] = it->second;
00716         }
00717       if(inTrans.find(to) == inTrans.end()){
00718         int id = states->getSymbol(from)->id();
00719         IntVector incoming;
00720         incoming.push_back(id);
00721         inTrans[to] = incoming;
00722       }
00723       else{
00724         int id = states->getSymbol(from)->id();
00725         inTrans[to].push_back(id);
00726       }
00727     }
00728 
00729     //Configuring the PHMM states
00730     std::map<std::string,std::string> numEmiss = number_of_emissions->getStringMap();
00731     std::map<std::string,std::string>::const_iterator numEmissIt;
00732     std::map<std::string,Matrix>::const_iterator it3;
00733     for(unsigned int i = 0; i < states->size(); i++){
00734       SymbolPtr state_name = states->getSymbol(i);
00735       DiscreteIIDModelPtr e ;
00736       DiscreteIIDModelPtr t ;
00737       IntVector itr;
00738       IntVector otr;
00739       int numEmiss1, numEmiss2;
00740       std::map<std::string,IntVector>::const_iterator inTransIt;
00741 
00742       numEmissIt = numEmiss.find(state_name->name());
00743       if(numEmissIt != numEmiss.end()){
00744         std::vector<std::string> splited;
00745         boost::regex separator(",");
00746         split_regex(numEmissIt->second, splited, separator);
00747         numEmiss1 = atoi(splited[0].c_str());
00748         numEmiss2 = atoi(splited[1].c_str());
00749       }
00750 
00751       it3 = emiss.find(state_name->name());
00752       if(it3 != emiss.end())
00753         e = DiscreteIIDModelPtr(new DiscreteIIDModel(it3->second));
00754 
00755       if(it3 == emiss.end() && numEmiss1 == 0 && numEmiss2 == 0)
00756         e = DiscreteIIDModelPtr(new DiscreteIIDModel());
00757 
00758       it2 = trans.find(state_name->name());
00759       if(it2 != trans.end())
00760         t = DiscreteIIDModelPtr(new DiscreteIIDModel(it2->second));
00761 
00762       else{
00763         t = DiscreteIIDModelPtr(new DiscreteIIDModel());
00764       }
00765 
00766       inTransIt = inTrans.find(state_name->name());
00767       if(inTransIt != inTrans.end())
00768         itr = inTransIt->second;
00769       else{
00770         begin++;
00771         _begin_id = state_list.size();
00772         IntVector incoming;
00773         itr = incoming;
00774       }
00775 
00776       inTransIt = outTrans.find(state_name->name());
00777       if(inTransIt != outTrans.end())
00778         otr = inTransIt->second;
00779       else{
00780         end++;
00781         _end_id = state_list.size();
00782         IntVector outgoing;
00783         otr = outgoing;
00784       }
00785 
00786       PHMMStatePtr statePtr;
00787       if(numEmiss1 == 0 && numEmiss2 == 0){
00788   statePtr = PHMMStatePtr(new SilentState(state_list.size(), state_name, e, t, itr, otr));
00789         silent_state_list.push_back(statePtr);
00790       }
00791       if(numEmiss1 == 1 && numEmiss2 == 1){
00792   statePtr = PHMMStatePtr(new MatchState(state_list.size(), state_name, e, t, itr, otr));
00793       }
00794       if(numEmiss1 == 0 && numEmiss2 == 1){
00795   statePtr = PHMMStatePtr(new Gap1State(state_list.size(), state_name, e, t, itr, otr));
00796       }
00797       if(numEmiss1 == 1 && numEmiss2 == 0){
00798   statePtr = PHMMStatePtr(new Gap2State(state_list.size(), state_name, e, t, itr, otr));
00799       }
00800       state_list.push_back(statePtr);
00801     }
00802     if(end != 1){
00803       cerr << "An end state must be defined." << endl;
00804       exit(-1);
00805     }
00806     if(begin != 1){
00807       cerr << "A begin state must be defined." << endl;
00808       exit(-1);
00809     }
00810     setStates(state_list, states);
00811     //if(silent_state_list.size() > 1)
00812     //silentStatesSort(silent_state_list);
00813     setSilentStates(silent_state_list);
00814     setObservationSymbols(observations);
00815   }
00816 
00820 
00821   /*  void PairHiddenMarkovModel::silentStatesSort(vector<PHMMStatePtr> silStates){
00822     int swap = 1;
00823     while(swap == 1){
00824       swap = 0;
00825       for(int i = 0; i < (int)silStates.size() - 1; i++){
00826         IntVector v = silStates[i]->iTransitions();
00827         std::vector<int>::iterator it;
00828         it = find(v.begin(), v.end(), silStates[i+1]->getId());
00829         if(it != v.end()){
00830           PHMMStatePtr aux = silStates[i];
00831           silStates[i] = silStates[i+1];
00832           silStates[i+1] = aux;
00833           swap = 1;
00834         }
00835       }
00836     }
00837     }*/
00838 
00839 
00840   /*  std::string PairHiddenMarkovModel::str () const
00841   {
00842     int nstates = _states.size();
00843     std::stringstream out;
00844     out << "model_name = \"PairHiddenMarkovModel\"" << std::endl;
00845     out << "state_names = (" ;
00846     out << "\"" << getStateName(0) << "\"";
00847     for(int i = 1; i < (int)getStateNames()->size(); i++)
00848       out << ",\"" << getStateName(i) << "\"";
00849     out << ")" << std::endl;
00850 
00851     out << "observation_symbols = (" ;
00852     out << "\"" << alphabet()->getSymbol(0)->name() << "\"";
00853     for(int i = 1; i < (int)alphabet()->size(); i++)
00854       out << ",\"" << alphabet()->getSymbol(i)->name() << "\"";
00855     out << ")" << std::endl;
00856 
00857     out << "transitions = (" ;
00858     int it = 0;
00859     for(int i = 0; i < nstates; i++){
00860       for(int j = 0; j < nstates; j++){
00861           if(close(exp(_states[i]->transitions()->log_probability_of(j)), 0.0, 1e-10))
00862             continue;
00863           if(it == 0){
00864             out << "\"" << getStateName(i) << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->transitions()->log_probability_of(j)) ;
00865             it++;
00866           }
00867           else
00868             out << ";\n \"" << getStateName(j) << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->transitions()->log_probability_of(j));
00869       }
00870     }
00871     out << ";)" << std::endl;
00872 
00873 
00874     out << "emission_probabilities = (" ;
00875     it = 0;
00876     for(int i = 0; i < nstates; i++){
00877       for(int k = 0; k < (int)alphabet()->size(); k++){
00878         for(int j = 0; j < (int)alphabet()->size(); j++){
00879           if(close(exp(_states[i]->emission()->log_probability_of_pair(k,j)), 0.0, 1e-10))
00880             continue;
00881           if(it == 0){
00882             out << "\"" <<  alphabet()->getSymbol(k)->name() << alphabet()->getSymbol(j)->name() << "\" | \"" << getStateName(i) << "\": " << exp(_states[i]->emission()->log_probability_of_pair(k,j)) ;
00883             it++;
00884           }
00885           else
00886             out << ";\n \"" << alphabet()->getSymbol(k)->name() << alphabet()->getSymbol(j)->name() << "\" | \"" <<  getStateName(i) << "\": " << exp(_states[i]->emission()->log_probability_of_pair(k,j));
00887         }
00888       }
00889     }
00890     out << ";)" << std::endl;
00891 
00892     out << "number_of_emissions = (";
00893     out << "\"" << getStateName(0) << "\" : \"" << _states[0]->eSeq1() << "," << _states[0]->eSeq2() << "\"";
00894     for(int i = 1; i < nstates; i++)
00895       out << ";\n\"" << getStateName(i) << "\" : \"" << _states[i]->eSeq1() << "," << _states[i]->eSeq2() << "\"";
00896     out << ";)" << endl;
00897     return out.str();
00898     }  */
00899 
00900   /*  void PairHiddenMarkovModel::generateSequence(Sequence &seq1, Sequence &seq2, Sequence &path){
00901     Sequence s1, s2, p;
00902     int state_id = _begin_id;
00903     int obs1 = _gap_id;
00904     int obs2 = _gap_id;
00905     p.push_back(state_id);
00906     s1.push_back(obs1);
00907     s2.push_back(obs2);
00908 
00909     while(state_id != _end_id){
00910       state_id = getPHMMState(state_id)->transitions()->choose();
00911       if(_states[state_id]->eSeq1() == 0 && _states[state_id]->eSeq2() == 0){
00912         p.push_back(state_id);
00913         s1.push_back(_gap_id);
00914         s2.push_back(_gap_id);
00915         continue;
00916       }
00917       _states[state_id]->emission()->choosePair(&obs1,&obs2);
00918       p.push_back(state_id);
00919       s1.push_back(obs1);
00920       s2.push_back(obs2);
00921     }
00922     seq1 = s1;
00923     seq2 = s2;
00924     path = p;
00925     }*/
00926 }
00927 
00928