ToPS
|
00001 /* 00002 * TrainVariableLengthInhomogeneousMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ContextTree.hpp" 00026 #include "TrainVariableLengthInhomogeneousMarkovChain.hpp" 00027 #include "InhomogeneousMarkovChain.hpp" 00028 #include "ProbabilisticModelParameter.hpp" 00029 namespace tops { 00030 00031 ProbabilisticModelPtr TrainVariableLengthInhomogeneousMarkovChain::create( 00032 ProbabilisticModelParameters & parameters, double & loglikelihood, 00033 int & sample_size) const { 00034 ProbabilisticModelParameterValuePtr alphabetpar = 00035 parameters.getMandatoryParameterValue("alphabet"); 00036 ProbabilisticModelParameterValuePtr trainingsetpar = 00037 parameters.getMandatoryParameterValue("training_set"); 00038 ProbabilisticModelParameterValuePtr cutpar = 00039 parameters.getMandatoryParameterValue("cut"); 00040 ProbabilisticModelParameterValuePtr lengthpar = 00041 parameters.getMandatoryParameterValue("length"); 00042 ProbabilisticModelParameterValuePtr phasedpar = 00043 parameters.getMandatoryParameterValue("phased"); 00044 00045 if ((alphabetpar == NULL) || (trainingsetpar == NULL) 00046 || (lengthpar == NULL) || (cutpar == NULL) || (phasedpar == NULL)) { 00047 std::cerr << help() << std::endl; 00048 exit(-1); 00049 } 00050 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00051 alphabet->initializeFromVector(alphabetpar ->getStringVector()); 00052 SequenceEntryList sample_set; 00053 readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString()); 00054 std::vector<std::string> cuts = cutpar->getStringVector(); 00055 int length = lengthpar->getInt(); 00056 std::vector<ContextTreePtr> positional_distribution; 00057 int phased = phasedpar->getInt(); 00058 positional_distribution.resize(length); 00059 sample_size = 0; 00060 if ((int) cuts.size() < length) { 00061 std::cerr 00062 << "ERROR: The number of cuts is smaller than the sequence length " 00063 << std::endl; 00064 exit(-1); 00065 } 00066 00067 00068 for (int i = 0; i < length; i++) { 00069 SequenceEntryList positionalSample; 00070 int o = i; 00071 for (int j = 0; j < (int) sample_set.size(); j++) { 00072 if(!phased) { 00073 Sequence s; 00074 s.resize(o + 1); 00075 int l = 0; 00076 int m = i - o; 00077 if (m < 0) 00078 m = 0; 00079 00080 while ((m < (int) (sample_set[j]->getSequence()).size()) && (l <= o)) { 00081 s[l] = (sample_set[j]->getSequence())[m]; 00082 l++; 00083 m++; 00084 } 00085 SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet)); 00086 entry->setSequence(s); 00087 positionalSample.push_back(entry); 00088 } else { 00089 o = int(log( (double) (sample_set[j]->getSequence()).size())); 00090 int m = i - o; 00091 if(m < 0) { m = 0; o = o -i;} 00092 int last_m = m; 00093 while((m < (int) (sample_set[j]->getSequence()).size())) { 00094 Sequence s; 00095 s.resize(o+1); 00096 int l = 0; 00097 while ((m < (int) (sample_set[j]->getSequence()).size()) && (l <= o)) { 00098 s[l] = (sample_set[j]->getSequence())[m]; 00099 l++; 00100 m++; 00101 } 00102 m = last_m + length; 00103 last_m = m; 00104 SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet)); 00105 entry->setSequence(s); 00106 positionalSample.push_back(entry); 00107 } 00108 } 00109 } 00110 ProbabilisticModelParameterValuePtr c = parameters.getOptionalParameterValue(cuts[i]); 00111 if(c == NULL){ 00112 std::cerr << "Undefined cut value for " << cuts[i] << std::endl; 00113 exit(-1); 00114 } 00115 double delta = 00116 (parameters.getOptionalParameterValue(cuts[i]))->getDouble(); 00117 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00118 tree->initializeContextTreeRissanen(positionalSample); 00119 tree->pruneTree(delta); 00120 tree->removeContextNotUsed(); 00121 tree->normalize(); 00122 positional_distribution[i] = tree; 00123 } 00124 InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); 00125 model->setPositionSpecificDistribution(positional_distribution); 00126 model->phased(0); 00127 model->setAlphabet(alphabet); 00128 model->phased(phased); 00129 loglikelihood = 0.0; 00130 00131 for (int j = 0; j < (int) sample_set.size(); j++) { 00132 sample_size += (sample_set[j]->getSequence()).size(); 00133 loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size() 00134 - 1); 00135 } 00136 return model; 00137 00138 } 00139 ProbabilisticModelPtr TrainVariableLengthInhomogeneousMarkovChain::create( 00140 ProbabilisticModelParameters & parameters) const { 00141 int size; 00142 double loglikelihood; 00143 return create(parameters, loglikelihood, size); 00144 } 00145 }