ToPS
|
00001 /* 00002 * TrainVariableLengthMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "util.hpp" 00026 00027 #include "ContextTree.hpp" 00028 #include "ProbabilisticModelParameter.hpp" 00029 #include "VariableLengthMarkovChain.hpp" 00030 #include "TrainVariableLengthMarkovChain.hpp" 00031 00032 #include <iostream> 00033 #include <fstream> 00034 #include <set> 00035 00036 namespace tops { 00037 00038 ProbabilisticModelPtr TrainVariableLengthMarkovChain::create( 00039 ProbabilisticModelParameters & parameters) const { 00040 double loglike; 00041 int samplesize; 00042 return create(parameters, loglike, samplesize); 00043 } 00044 00045 ProbabilisticModelPtr TrainVariableLengthMarkovChain::create( ProbabilisticModelParameters & parameters, const std::vector<std::string> & training_set, double & loglikelihood, int & sample_size) const { 00046 ProbabilisticModelParameterValuePtr alphabet_parameter = 00047 parameters.getMandatoryParameterValue("alphabet"); 00048 ProbabilisticModelParameterValuePtr delta_parameter = 00049 parameters.getMandatoryParameterValue("cut"); 00050 if ((alphabet_parameter == NULL)|| (delta_parameter == NULL)) { 00051 std::cerr << help() << std::endl; 00052 exit(-1); 00053 } 00054 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00055 alphabet->initializeFromVector(alphabet_parameter->getStringVector()); 00056 SequenceFactory factory(alphabet); 00057 SequenceEntryList sample_set; 00058 for(int i = 0; i < (int)sample_set.size(); i++) { 00059 Sequence s = factory.createSequence(training_set[i]); 00060 SequenceEntryPtr e = SequenceEntryPtr (new SequenceEntry(alphabet)); 00061 e->setSequence(s); 00062 sample_set.push_back(e); 00063 } 00064 00065 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00066 tree->initializeContextTreeRissanen(sample_set); 00067 tree->pruneTree(delta_parameter->getDouble()); 00068 tree->removeContextNotUsed(); 00069 tree->normalize(); 00070 VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr( 00071 new VariableLengthMarkovChain(tree)); 00072 m->setAlphabet(alphabet); 00073 loglikelihood = 0.0; 00074 sample_size = 0; 00075 for (int i = 0; i < (int) sample_set.size(); i++) { 00076 loglikelihood 00077 += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1); 00078 sample_size += (sample_set[i]->getSequence()).size(); 00079 } 00080 return m; 00081 00082 } 00083 00084 ProbabilisticModelPtr TrainVariableLengthMarkovChain::create( 00085 ProbabilisticModelParameters & parameters, double & loglikelihood, 00086 int & sample_size) const { 00087 ProbabilisticModelParameterValuePtr training_set_parameter = 00088 parameters.getMandatoryParameterValue("training_set"); 00089 ProbabilisticModelParameterValuePtr alphabet_parameter = 00090 parameters.getMandatoryParameterValue("alphabet"); 00091 ProbabilisticModelParameterValuePtr delta_parameter = 00092 parameters.getMandatoryParameterValue("cut"); 00093 00094 if ((training_set_parameter == NULL) || (alphabet_parameter == NULL) 00095 || (delta_parameter == NULL)) { 00096 std::cerr << help() << std::endl; 00097 exit(-1); 00098 } 00099 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00100 alphabet->initializeFromVector(alphabet_parameter->getStringVector()); 00101 00102 SequenceEntryList sample_set; 00103 readSequencesFromFile(sample_set, alphabet, training_set_parameter->getString()); 00104 00105 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00106 tree->initializeContextTreeRissanen(sample_set); 00107 tree->pruneTree(delta_parameter->getDouble()); 00108 tree->removeContextNotUsed(); 00109 tree->normalize(); 00110 VariableLengthMarkovChainPtr m = VariableLengthMarkovChainPtr( 00111 new VariableLengthMarkovChain(tree)); 00112 m->setAlphabet(alphabet); 00113 loglikelihood = 0.0; 00114 sample_size = 0; 00115 for (int i = 0; i < (int) sample_set.size(); i++) { 00116 loglikelihood 00117 += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1); 00118 sample_size += (sample_set[i]->getSequence()).size(); 00119 } 00120 return m; 00121 } 00122 00124 std::string TrainVariableLengthMarkovChain::help() const { 00125 std::stringstream out; 00126 out << "\nUSAGE: " << std::endl; 00127 out << "Mandatory parameters: " << std::endl; 00128 out << "\ntraining_set" << std::endl; 00129 out << "\talphabet" << std::endl; 00130 out << "\tcut" << std::endl; 00131 out << "Example: " << std::endl; 00132 out << "\ttraining_algorithm=\"ContextAlgorithm\"" << std::endl; 00133 out << "\talphabet=(\"0\", \"1\")" << std::endl; 00134 out << "\ttraining_set= \"input.seq" << std::endl; 00135 out << "\tcut=1" << std::endl; 00136 return out.str(); 00137 } 00138 00139 } 00140 ; 00141