ToPS
TrainVariableLengthMarkovChain.cpp
00001 /*
00002  *       TrainVariableLengthMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "util.hpp"
00026 
00027 #include "ContextTree.hpp"
00028 #include "ProbabilisticModelParameter.hpp"
00029 #include "VariableLengthMarkovChain.hpp"
00030 #include "TrainVariableLengthMarkovChain.hpp"
00031 
00032 #include <iostream>
00033 #include <fstream>
00034 #include <set>
00035 
00036 namespace tops {
00037 
00038 ProbabilisticModelPtr TrainVariableLengthMarkovChain::create(
00039                 ProbabilisticModelParameters & parameters) const {
00040         double loglike;
00041         int samplesize;
00042         return create(parameters, loglike, samplesize);
00043 }
00044 
00045   ProbabilisticModelPtr TrainVariableLengthMarkovChain::create( ProbabilisticModelParameters & parameters, const std::vector<std::string> & training_set, double & loglikelihood, int & sample_size) const {
00046         ProbabilisticModelParameterValuePtr alphabet_parameter =
00047                         parameters.getMandatoryParameterValue("alphabet");
00048         ProbabilisticModelParameterValuePtr delta_parameter =
00049                         parameters.getMandatoryParameterValue("cut");
00050         if ((alphabet_parameter == NULL)|| (delta_parameter == NULL)) {
00051                 std::cerr << help() << std::endl;
00052                 exit(-1);
00053         }
00054         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00055         alphabet->initializeFromVector(alphabet_parameter->getStringVector());
00056         SequenceFactory factory(alphabet);
00057         SequenceEntryList sample_set;
00058         for(int i = 0; i < (int)sample_set.size(); i++) {
00059           Sequence s = factory.createSequence(training_set[i]);
00060           SequenceEntryPtr e =  SequenceEntryPtr  (new SequenceEntry(alphabet));
00061           e->setSequence(s);
00062           sample_set.push_back(e);
00063         }
00064 
00065         ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00066         tree->initializeContextTreeRissanen(sample_set);
00067         tree->pruneTree(delta_parameter->getDouble());
00068         tree->removeContextNotUsed();
00069         tree->normalize();
00070         VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr(
00071                         new VariableLengthMarkovChain(tree));
00072         m->setAlphabet(alphabet);
00073         loglikelihood = 0.0;
00074         sample_size = 0;
00075         for (int i = 0; i < (int) sample_set.size(); i++) {
00076                 loglikelihood
00077                   += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1);
00078                 sample_size += (sample_set[i]->getSequence()).size();
00079         }
00080         return m;
00081 
00082   }
00083 
00084 ProbabilisticModelPtr TrainVariableLengthMarkovChain::create(
00085                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00086                 int & sample_size) const {
00087         ProbabilisticModelParameterValuePtr training_set_parameter =
00088                         parameters.getMandatoryParameterValue("training_set");
00089         ProbabilisticModelParameterValuePtr alphabet_parameter =
00090                         parameters.getMandatoryParameterValue("alphabet");
00091         ProbabilisticModelParameterValuePtr delta_parameter =
00092                         parameters.getMandatoryParameterValue("cut");
00093 
00094         if ((training_set_parameter == NULL) || (alphabet_parameter == NULL)
00095                         || (delta_parameter == NULL)) {
00096                 std::cerr << help() << std::endl;
00097                 exit(-1);
00098         }
00099         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00100         alphabet->initializeFromVector(alphabet_parameter->getStringVector());
00101 
00102         SequenceEntryList sample_set;
00103         readSequencesFromFile(sample_set, alphabet, training_set_parameter->getString());
00104 
00105         ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00106         tree->initializeContextTreeRissanen(sample_set);
00107         tree->pruneTree(delta_parameter->getDouble());
00108         tree->removeContextNotUsed();
00109         tree->normalize();
00110         VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr(
00111                         new VariableLengthMarkovChain(tree));
00112         m->setAlphabet(alphabet);
00113         loglikelihood = 0.0;
00114         sample_size = 0;
00115         for (int i = 0; i < (int) sample_set.size(); i++) {
00116                 loglikelihood
00117                   += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1);
00118                 sample_size += (sample_set[i]->getSequence()).size();
00119         }
00120         return m;
00121 }
00122 
00124 std::string TrainVariableLengthMarkovChain::help() const {
00125       std::stringstream out;
00126       out << "\nUSAGE: " << std::endl;
00127       out << "Mandatory parameters: " << std::endl;
00128       out << "\ntraining_set" << std::endl;
00129       out << "\talphabet" << std::endl;
00130       out << "\tcut" << std::endl;
00131       out << "Example: " << std::endl;
00132       out << "\ttraining_algorithm=\"ContextAlgorithm\"" << std::endl;
00133       out << "\talphabet=(\"0\", \"1\")" << std::endl;
00134       out << "\ttraining_set= \"input.seq" << std::endl;
00135       out << "\tcut=1" << std::endl;
00136       return out.str();
00137 }
00138 
00139 }
00140 ;
00141