ToPS
|
00001 /* 00002 * TrainDiscreteIIDModel.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ProbabilisticModel.hpp" 00026 #include "TrainDiscreteIIDModel.hpp" 00027 #include "ConfigurationReader.hpp" 00028 #include "ContextTree.hpp" 00029 #include "VariableLengthMarkovChain.hpp" 00030 #include "util.hpp" 00031 namespace tops { 00032 00033 ProbabilisticModelPtr TrainDiscreteIIDModel::create( ProbabilisticModelParameters & parameters, const std::vector<std::string> & sample_set, double & loglikelihood, int & sample_size) const { 00034 00035 ProbabilisticModelParameterValuePtr alphapar = 00036 parameters.getMandatoryParameterValue("alphabet"); 00037 00038 00039 if (alphapar == NULL) { 00040 std::cerr << help() << std::endl; 00041 exit(-1); 00042 } 00043 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00044 alphabet ->initializeFromVector(alphapar->getStringVector()); 00045 SequenceEntryList samples; 00046 SequenceFactory factory(alphabet); 00047 for(int i = 0; i < (int)sample_set.size();i++){ 00048 SequenceEntryPtr inseq = SequenceEntryPtr(new SequenceEntry(alphabet)); 00049 inseq->setSequence(factory.createSequenceRemovedSpaces(sample_set[i])); 00050 samples.push_back(inseq); 00051 } 00052 00053 ProbabilisticModelPtr m = train (samples, alphabet); 00054 loglikelihood = 0.0; 00055 sample_size = 0; 00056 for (int i = 0; i < (int) samples.size(); i++) { 00057 loglikelihood 00058 += m->evaluate((samples[i]->getSequence()), 0, (samples[i]->getSequence()).size() - 1); 00059 sample_size += (samples[i]->getSequence()).size(); 00060 } 00061 return m; 00062 } 00063 00064 ProbabilisticModelPtr TrainDiscreteIIDModel::train(const SequenceEntryList & sample_set, AlphabetPtr alphabet) const{ 00065 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00066 std::map <std::string, double> w; 00067 tree->initializeCounter(sample_set, 0, w); 00068 tree->normalize(); 00069 DiscreteIIDModelPtr m = tree->getRoot()->getDistribution(); 00070 m->setAlphabet(alphabet); 00071 return m; 00072 } 00073 00074 ProbabilisticModelPtr TrainDiscreteIIDModel::create( 00075 ProbabilisticModelParameters & parameters, double & loglikelihood, 00076 int & sample_size) const { 00077 ProbabilisticModelParameterValuePtr trainpar = 00078 parameters.getMandatoryParameterValue("training_set"); 00079 ProbabilisticModelParameterValuePtr alphapar = 00080 parameters.getMandatoryParameterValue("alphabet"); 00081 00082 if ((trainpar == NULL) || (alphapar == NULL) ) { 00083 std::cerr << help() << std::endl; 00084 exit(-1); 00085 } 00086 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00087 alphabet ->initializeFromVector(alphapar->getStringVector()); 00088 SequenceEntryList sample_set; 00089 readSequencesFromFile(sample_set, alphabet, trainpar->getString()); 00090 ProbabilisticModelPtr m = train(sample_set, alphabet); 00091 loglikelihood = 0.0; 00092 sample_size = 0; 00093 for (int i = 0; i < (int) sample_set.size(); i++) { 00094 loglikelihood 00095 += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1); 00096 sample_size += (sample_set[i]->getSequence()).size(); 00097 } 00098 return m; 00099 00100 } 00101 ProbabilisticModelPtr TrainDiscreteIIDModel::create( 00102 ProbabilisticModelParameters & parameters) const { 00103 double loglike; 00104 int samplesize; 00105 return create(parameters, loglike, samplesize); 00106 00107 } 00108 00109 std::string TrainDiscreteIIDModel::help() const { 00110 std::stringstream out; 00111 out << "\nUSAGE: " << std::endl; 00112 out << "Mandatory parameters: " << std::endl; 00113 out << "\ttraining_set" << std::endl; 00114 out << "\talphabet" << std::endl; 00115 out << "Example: " << std::endl; 00116 out << "\ttraining_algorithm=\"DiscreteIIDModel\"" << std::endl; 00117 out << "\talphabet=(\"0\", \"1\")" << std::endl; 00118 out << "\ttraining_set= \"input.seq" << std::endl; 00119 return out.str(); 00120 } 00121 } 00122