ToPS
TrainPhasedMarkovChainContextAlgorithm.cpp
00001 /*
00002  *       TrainPhasedMarkovChainContextAlgorithm.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ContextTree.hpp"
00026 #include "TrainPhasedMarkovChainContextAlgorithm.hpp"
00027 #include "InhomogeneousMarkovChain.hpp"
00028 #include "ProbabilisticModelParameter.hpp"
00029 namespace tops {
00030 
00031 ProbabilisticModelPtr TrainPhasedMarkovChainContextAlgorithm::create(
00032                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00033                 int & sample_size) const {
00034         ProbabilisticModelParameterValuePtr alphabetpar =
00035                         parameters.getMandatoryParameterValue("alphabet");
00036         ProbabilisticModelParameterValuePtr trainingsetpar =
00037                         parameters.getMandatoryParameterValue("training_set");
00038         ProbabilisticModelParameterValuePtr cutpar =
00039                         parameters.getMandatoryParameterValue("cut");
00040 
00041         ProbabilisticModelParameterValuePtr number_of_phasespar =
00042           parameters.getMandatoryParameterValue("number_of_phases");
00043 
00044         if ((alphabetpar == NULL) || (trainingsetpar == NULL)
00045             || (number_of_phasespar == NULL)
00046             || (cutpar == NULL)) {
00047           std::cerr << help() << std::endl;
00048           exit(-1);
00049         }
00050         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00051         alphabet->initializeFromVector(alphabetpar ->getStringVector());
00052         SequenceEntryList sample_set;
00053         readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString());
00054         int length = number_of_phasespar->getInt() ;
00055         std::vector<ContextTreePtr> positional_distribution;
00056         positional_distribution.resize(length);
00057         sample_size = 0;
00058         std::vector <std::string> cut = cutpar->getStringVector();
00059         if((int) cut.size() < length) {
00060           std::cerr << "ERROR: The number of cut values  is smaller than the sequence length " << std::endl;
00061           exit(-1);
00062         }
00063 
00064         for (int i = 0; i < length; i++) {
00065                 SequenceEntryList positionalSample;
00066                 for(int j = 0; j < (int)sample_set.size(); j++)
00067                 {
00068                   int nseq = 0;
00069                   int order = j;
00070                   if(order > 10)
00071                     order = 10;
00072                   while(true)
00073                     {
00074                       int start = (length) * nseq - order + i;
00075                       if(start < 0) {
00076                         nseq++;
00077                         continue;
00078                       }
00079                       int end = (length) * nseq + i;
00080                       if(end >= (int)(sample_set[j]->getSequence()).size())
00081                         break;
00082                       Sequence s;
00083                       for(int k = start; k <= end; k++)
00084                         s.push_back((sample_set[j]->getSequence())[k]);
00085                       SequenceEntryPtr entry = SequenceEntryPtr(new SequenceEntry(alphabet));
00086                       entry->setSequence(s);
00087                       positionalSample.push_back(entry);
00088                       nseq++;
00089                     }
00090                 }
00091                 double delta = (parameters.getOptionalParameterValue(cut[i])->getDouble());
00092                 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00093                 tree->initializeContextTreeRissanen(positionalSample);
00094                 tree->pruneTree(delta);
00095                 tree->removeContextNotUsed();
00096                 tree->normalize();
00097                 positional_distribution[i] = tree;
00098         }
00099         InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(
00100                         new InhomogeneousMarkovChain());
00101         model->setPositionSpecificDistribution(positional_distribution);
00102         model->phased(1);
00103         model->setAlphabet(alphabet);
00104         loglikelihood = 0.0;
00105 
00106         for (int j = 0; j < (int) sample_set.size(); j++) {
00107           sample_size += (sample_set[j]->getSequence()).size();
00108                 loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size()
00109                                 - 1);
00110         }
00111         return model;
00112 
00113 }
00114 ProbabilisticModelPtr TrainPhasedMarkovChainContextAlgorithm::create(
00115                 ProbabilisticModelParameters & parameters) const {
00116         int size;
00117         double loglikelihood;
00118         return create(parameters, loglikelihood, size);
00119 }
00120 }
00121