ToPS
|
00001 /* 00002 * TrainInterpolatedMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ProbabilisticModel.hpp" 00026 #include "TrainInterpolatedMarkovChain.hpp" 00027 #include "ConfigurationReader.hpp" 00028 #include "ContextTree.hpp" 00029 #include "VariableLengthMarkovChain.hpp" 00030 #include "ProbabilisticModelCreatorClient.hpp" 00031 #include "util.hpp" 00032 namespace tops { 00033 00034 ProbabilisticModelPtr TrainInterpolatedMarkovChain::create( 00035 ProbabilisticModelParameters & parameters, double & loglikelihood, 00036 int & sample_size) const { 00037 ProbabilisticModelParameterValuePtr orderpar = 00038 parameters.getMandatoryParameterValue("order"); 00039 ProbabilisticModelParameterValuePtr trainpar = 00040 parameters.getMandatoryParameterValue("training_set"); 00041 ProbabilisticModelParameterValuePtr alphapar = 00042 parameters.getMandatoryParameterValue("alphabet"); 00043 ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts"); 00044 ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori"); 00045 00046 ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights"); 00047 std::map <std::string, double> weights; 00048 if(weightspar != NULL) { 00049 readMapFromFile(weights, weightspar->getString()); 00050 } 00051 00052 00053 00054 00055 double pseudocounts = 0; 00056 00057 if(pseudocountspar != NULL) 00058 pseudocounts = pseudocountspar->getDouble(); 00059 ProbabilisticModelPtr apriori; 00060 if(aprioripar != NULL) 00061 { 00062 ProbabilisticModelCreatorClient c; 00063 apriori = c.create(aprioripar->getString()); 00064 } 00065 00066 00067 if ((trainpar == NULL) || (alphapar == NULL) || (orderpar == NULL)) { 00068 std::cerr << help() << std::endl; 00069 exit(-1); 00070 } 00071 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00072 alphabet ->initializeFromVector(alphapar->getStringVector()); 00073 SequenceEntryList sample_set; 00074 readSequencesFromFile(sample_set, alphabet, trainpar->getString()); 00075 00076 00077 00078 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00079 00080 if(apriori != NULL ){ 00081 tree->initializeCounter(sample_set, orderpar->getInt(), 0, weights); 00082 tree->pruneTreeSmallSampleSize(400); 00083 tree->normalize(apriori, pseudocounts, 0); 00084 } else { 00085 tree->initializeCounter(sample_set, orderpar->getInt(), pseudocounts, weights); 00086 tree->pruneTreeSmallSampleSize(400); 00087 tree->normalize(); 00088 } 00089 00090 00091 VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr( 00092 new VariableLengthMarkovChain(tree)); 00093 m->setAlphabet(alphabet); 00094 loglikelihood = 0.0; 00095 sample_size = 0; 00096 for (int i = 0; i < (int) sample_set.size(); i++) { 00097 loglikelihood 00098 += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1); 00099 sample_size += (sample_set[i]->getSequence()).size(); 00100 } 00101 00102 return m; 00103 00104 } 00105 ProbabilisticModelPtr TrainInterpolatedMarkovChain::create( 00106 ProbabilisticModelParameters & parameters) const { 00107 double loglike; 00108 int samplesize; 00109 return create(parameters, loglike, samplesize); 00110 00111 } 00112 00113 } 00114 ;