ToPS
|
00001 /* 00002 * TrainFixedLengthMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ProbabilisticModel.hpp" 00026 #include "TrainFixedLengthMarkovChain.hpp" 00027 #include "ConfigurationReader.hpp" 00028 #include "ContextTree.hpp" 00029 #include "VariableLengthMarkovChain.hpp" 00030 #include "ProbabilisticModelCreatorClient.hpp" 00031 #include <map> 00032 #include "util.hpp" 00033 namespace tops { 00034 00035 ProbabilisticModelPtr TrainFixedLengthMarkovChain::create( 00036 ProbabilisticModelParameters & parameters, double & loglikelihood, 00037 int & sample_size) const { 00038 ProbabilisticModelParameterValuePtr orderpar = 00039 parameters.getMandatoryParameterValue("order"); 00040 ProbabilisticModelParameterValuePtr trainpar = 00041 parameters.getMandatoryParameterValue("training_set"); 00042 ProbabilisticModelParameterValuePtr alphapar = 00043 parameters.getMandatoryParameterValue("alphabet"); 00044 ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts"); 00045 ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori"); 00046 ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights"); 00047 std::map <std::string, double> weights; 00048 if(weightspar != NULL) { 00049 readMapFromFile(weights, weightspar->getString()); 00050 } 00051 00052 00053 00054 double pseudocounts = 0; 00055 ProbabilisticModelPtr apriori; 00056 if(aprioripar != NULL) 00057 { 00058 ProbabilisticModelCreatorClient c; 00059 apriori = c.create(aprioripar->getString()); 00060 } 00061 if(pseudocountspar != NULL) 00062 pseudocounts = pseudocountspar->getDouble(); 00063 00064 00065 if ((trainpar == NULL) || (alphapar == NULL) || (orderpar == NULL)) { 00066 std::cerr << help() << std::endl; 00067 exit(-1); 00068 } 00069 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00070 alphabet ->initializeFromVector(alphapar->getStringVector()); 00071 SequenceEntryList sample_set; 00072 readSequencesFromFile(sample_set, alphabet, trainpar->getString()); 00073 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00074 00075 if(apriori != NULL ){ 00076 tree->initializeCounter(sample_set, orderpar->getInt(), 0, weights); 00077 tree->normalize(apriori, pseudocounts); 00078 } else { 00079 tree->initializeCounter(sample_set, orderpar->getInt(), pseudocounts, weights); 00080 tree->normalize(); 00081 } 00082 00083 VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr( 00084 new VariableLengthMarkovChain(tree)); 00085 m->setAlphabet(alphabet); 00086 loglikelihood = 0.0; 00087 sample_size = 0; 00088 for (int i = 0; i < (int) sample_set.size(); i++) { 00089 loglikelihood 00090 += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1); 00091 sample_size += (sample_set[i]->getSequence()).size(); 00092 } 00093 00094 return m; 00095 00096 } 00097 ProbabilisticModelPtr TrainFixedLengthMarkovChain::create( 00098 ProbabilisticModelParameters & parameters) const { 00099 double loglike; 00100 int samplesize; 00101 return create(parameters, loglike, samplesize); 00102 00103 } 00104 00105 } 00106 ;