ToPS
TrainDiscreteIIDModel.cpp
00001 /*
00002  *       TrainDiscreteIIDModel.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ProbabilisticModel.hpp"
00026 #include "TrainDiscreteIIDModel.hpp"
00027 #include "ConfigurationReader.hpp"
00028 #include "ContextTree.hpp"
00029 #include "VariableLengthMarkovChain.hpp"
00030 #include "util.hpp"
00031 namespace tops {
00032 
00033   ProbabilisticModelPtr TrainDiscreteIIDModel::create( ProbabilisticModelParameters & parameters, const std::vector<std::string> & sample_set, double & loglikelihood, int & sample_size) const {
00034 
00035         ProbabilisticModelParameterValuePtr alphapar =
00036                         parameters.getMandatoryParameterValue("alphabet");
00037 
00038 
00039         if (alphapar == NULL) {
00040           std::cerr << help() << std::endl;
00041           exit(-1);
00042         }
00043         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00044         alphabet ->initializeFromVector(alphapar->getStringVector());
00045         SequenceEntryList samples;
00046         SequenceFactory factory(alphabet);
00047         for(int i = 0; i < (int)sample_set.size();i++){
00048           SequenceEntryPtr  inseq = SequenceEntryPtr(new SequenceEntry(alphabet));
00049           inseq->setSequence(factory.createSequenceRemovedSpaces(sample_set[i]));
00050           samples.push_back(inseq);
00051         }
00052 
00053         ProbabilisticModelPtr m = train (samples, alphabet);
00054         loglikelihood = 0.0;
00055         sample_size = 0;
00056         for (int i = 0; i < (int) samples.size(); i++) {
00057           loglikelihood
00058             += m->evaluate((samples[i]->getSequence()), 0, (samples[i]->getSequence()).size() - 1);
00059           sample_size += (samples[i]->getSequence()).size();
00060         }
00061         return m;
00062   }
00063 
00064 ProbabilisticModelPtr TrainDiscreteIIDModel::train(const SequenceEntryList & sample_set, AlphabetPtr alphabet) const{
00065         ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00066         std::map <std::string, double> w;
00067         tree->initializeCounter(sample_set, 0, w);
00068         tree->normalize();
00069         DiscreteIIDModelPtr m = tree->getRoot()->getDistribution();
00070         m->setAlphabet(alphabet);
00071         return m;
00072   }
00073 
00074   ProbabilisticModelPtr TrainDiscreteIIDModel::create(
00075                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00076                 int & sample_size) const {
00077         ProbabilisticModelParameterValuePtr trainpar =
00078                         parameters.getMandatoryParameterValue("training_set");
00079         ProbabilisticModelParameterValuePtr alphapar =
00080                         parameters.getMandatoryParameterValue("alphabet");
00081 
00082         if ((trainpar == NULL) || (alphapar == NULL) ) {
00083           std::cerr << help() << std::endl;
00084           exit(-1);
00085         }
00086         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00087         alphabet ->initializeFromVector(alphapar->getStringVector());
00088         SequenceEntryList sample_set;
00089         readSequencesFromFile(sample_set, alphabet, trainpar->getString());
00090         ProbabilisticModelPtr m =  train(sample_set, alphabet);
00091         loglikelihood = 0.0;
00092         sample_size = 0;
00093         for (int i = 0; i < (int) sample_set.size(); i++) {
00094           loglikelihood
00095             += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1);
00096           sample_size += (sample_set[i]->getSequence()).size();
00097         }
00098         return m;
00099 
00100 }
00101   ProbabilisticModelPtr TrainDiscreteIIDModel::create(
00102                                                                 ProbabilisticModelParameters & parameters) const {
00103     double loglike;
00104     int samplesize;
00105     return create(parameters, loglike, samplesize);
00106 
00107   }
00108 
00109    std::string TrainDiscreteIIDModel::help() const   {
00110       std::stringstream out;
00111       out << "\nUSAGE: " << std::endl;
00112       out << "Mandatory parameters: " << std::endl;
00113       out << "\ttraining_set" << std::endl;
00114       out << "\talphabet" << std::endl;
00115       out << "Example: " << std::endl;
00116       out << "\ttraining_algorithm=\"DiscreteIIDModel\"" << std::endl;
00117       out << "\talphabet=(\"0\", \"1\")" << std::endl;
00118       out << "\ttraining_set= \"input.seq" << std::endl;
00119       return out.str();
00120   }
00121 }
00122