ToPS
TrainFixedLengthMarkovChain.cpp
00001 /*
00002  *       TrainFixedLengthMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ProbabilisticModel.hpp"
00026 #include "TrainFixedLengthMarkovChain.hpp"
00027 #include "ConfigurationReader.hpp"
00028 #include "ContextTree.hpp"
00029 #include "VariableLengthMarkovChain.hpp"
00030 #include "ProbabilisticModelCreatorClient.hpp"
00031 #include <map>
00032 #include "util.hpp"
00033 namespace tops {
00034 
00035 ProbabilisticModelPtr TrainFixedLengthMarkovChain::create(
00036                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00037                 int & sample_size) const {
00038         ProbabilisticModelParameterValuePtr orderpar =
00039                         parameters.getMandatoryParameterValue("order");
00040         ProbabilisticModelParameterValuePtr trainpar =
00041                         parameters.getMandatoryParameterValue("training_set");
00042         ProbabilisticModelParameterValuePtr alphapar =
00043                         parameters.getMandatoryParameterValue("alphabet");
00044         ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts");
00045         ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori");
00046         ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights");
00047         std::map <std::string, double> weights;
00048         if(weightspar != NULL) {
00049           readMapFromFile(weights, weightspar->getString());
00050         }
00051 
00052 
00053 
00054         double pseudocounts = 0;
00055         ProbabilisticModelPtr apriori;
00056         if(aprioripar != NULL)
00057             {
00058                 ProbabilisticModelCreatorClient c;
00059                 apriori = c.create(aprioripar->getString());
00060             }
00061         if(pseudocountspar != NULL)
00062           pseudocounts = pseudocountspar->getDouble();
00063 
00064 
00065         if ((trainpar == NULL) || (alphapar == NULL) || (orderpar == NULL)) {
00066           std::cerr << help() << std::endl;
00067           exit(-1);
00068         }
00069         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00070         alphabet ->initializeFromVector(alphapar->getStringVector());
00071         SequenceEntryList sample_set;
00072         readSequencesFromFile(sample_set, alphabet, trainpar->getString());
00073         ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00074 
00075         if(apriori != NULL ){
00076           tree->initializeCounter(sample_set, orderpar->getInt(), 0, weights);
00077           tree->normalize(apriori, pseudocounts);
00078         } else {
00079           tree->initializeCounter(sample_set, orderpar->getInt(), pseudocounts, weights);
00080           tree->normalize();
00081         }
00082 
00083         VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr(
00084                                                                       new VariableLengthMarkovChain(tree));
00085         m->setAlphabet(alphabet);
00086         loglikelihood = 0.0;
00087         sample_size = 0;
00088         for (int i = 0; i < (int) sample_set.size(); i++) {
00089           loglikelihood
00090             += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1);
00091           sample_size += (sample_set[i]->getSequence()).size();
00092         }
00093 
00094         return m;
00095 
00096 }
00097 ProbabilisticModelPtr TrainFixedLengthMarkovChain::create(
00098                 ProbabilisticModelParameters & parameters) const {
00099         double loglike;
00100         int samplesize;
00101         return create(parameters, loglike, samplesize);
00102 
00103 }
00104 
00105 }
00106 ;