ToPS
TrainInterpolatedMarkovChain.cpp
00001 /*
00002  *       TrainInterpolatedMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ProbabilisticModel.hpp"
00026 #include "TrainInterpolatedMarkovChain.hpp"
00027 #include "ConfigurationReader.hpp"
00028 #include "ContextTree.hpp"
00029 #include "VariableLengthMarkovChain.hpp"
00030 #include "ProbabilisticModelCreatorClient.hpp"
00031 #include "util.hpp"
00032 namespace tops {
00033 
00034 ProbabilisticModelPtr TrainInterpolatedMarkovChain::create(
00035                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00036                 int & sample_size) const {
00037         ProbabilisticModelParameterValuePtr orderpar =
00038                         parameters.getMandatoryParameterValue("order");
00039         ProbabilisticModelParameterValuePtr trainpar =
00040                         parameters.getMandatoryParameterValue("training_set");
00041         ProbabilisticModelParameterValuePtr alphapar =
00042                         parameters.getMandatoryParameterValue("alphabet");
00043         ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts");
00044         ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori");
00045 
00046         ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights");
00047         std::map <std::string, double> weights;
00048         if(weightspar != NULL) {
00049           readMapFromFile(weights, weightspar->getString());
00050         }
00051 
00052 
00053 
00054 
00055         double pseudocounts = 0;
00056 
00057         if(pseudocountspar != NULL)
00058           pseudocounts = pseudocountspar->getDouble();
00059         ProbabilisticModelPtr apriori;
00060         if(aprioripar != NULL)
00061             {
00062                 ProbabilisticModelCreatorClient c;
00063                 apriori = c.create(aprioripar->getString());
00064             }
00065 
00066 
00067         if ((trainpar == NULL) || (alphapar == NULL) || (orderpar == NULL)) {
00068           std::cerr << help() << std::endl;
00069           exit(-1);
00070         }
00071         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00072         alphabet ->initializeFromVector(alphapar->getStringVector());
00073         SequenceEntryList sample_set;
00074         readSequencesFromFile(sample_set, alphabet, trainpar->getString());
00075 
00076 
00077 
00078         ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00079 
00080         if(apriori != NULL ){
00081           tree->initializeCounter(sample_set, orderpar->getInt(), 0, weights);
00082             tree->pruneTreeSmallSampleSize(400);
00083             tree->normalize(apriori, pseudocounts, 0);
00084         } else {
00085           tree->initializeCounter(sample_set, orderpar->getInt(), pseudocounts, weights);
00086             tree->pruneTreeSmallSampleSize(400);
00087             tree->normalize();
00088         }
00089 
00090 
00091         VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr(
00092                                                                       new VariableLengthMarkovChain(tree));
00093         m->setAlphabet(alphabet);
00094         loglikelihood = 0.0;
00095         sample_size = 0;
00096         for (int i = 0; i < (int) sample_set.size(); i++) {
00097           loglikelihood
00098             += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1);
00099           sample_size += (sample_set[i]->getSequence()).size();
00100         }
00101 
00102         return m;
00103 
00104 }
00105 ProbabilisticModelPtr TrainInterpolatedMarkovChain::create(
00106                 ProbabilisticModelParameters & parameters) const {
00107         double loglike;
00108         int samplesize;
00109         return create(parameters, loglike, samplesize);
00110 
00111 }
00112 
00113 }
00114 ;