ToPS
|
00001 /* 00002 * TrainPhasedMarkovChainContextAlgorithm.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ContextTree.hpp" 00026 #include "TrainPhasedMarkovChainContextAlgorithm.hpp" 00027 #include "InhomogeneousMarkovChain.hpp" 00028 #include "ProbabilisticModelParameter.hpp" 00029 namespace tops { 00030 00031 ProbabilisticModelPtr TrainPhasedMarkovChainContextAlgorithm::create( 00032 ProbabilisticModelParameters & parameters, double & loglikelihood, 00033 int & sample_size) const { 00034 ProbabilisticModelParameterValuePtr alphabetpar = 00035 parameters.getMandatoryParameterValue("alphabet"); 00036 ProbabilisticModelParameterValuePtr trainingsetpar = 00037 parameters.getMandatoryParameterValue("training_set"); 00038 ProbabilisticModelParameterValuePtr cutpar = 00039 parameters.getMandatoryParameterValue("cut"); 00040 00041 ProbabilisticModelParameterValuePtr number_of_phasespar = 00042 parameters.getMandatoryParameterValue("number_of_phases"); 00043 00044 if ((alphabetpar == NULL) || (trainingsetpar == NULL) 00045 || (number_of_phasespar == NULL) 00046 || (cutpar == NULL)) { 00047 std::cerr << help() << std::endl; 00048 exit(-1); 00049 } 00050 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00051 alphabet->initializeFromVector(alphabetpar ->getStringVector()); 00052 SequenceEntryList sample_set; 00053 readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString()); 00054 int length = number_of_phasespar->getInt() ; 00055 std::vector<ContextTreePtr> positional_distribution; 00056 positional_distribution.resize(length); 00057 sample_size = 0; 00058 std::vector <std::string> cut = cutpar->getStringVector(); 00059 if((int) cut.size() < length) { 00060 std::cerr << "ERROR: The number of cut values is smaller than the sequence length " << std::endl; 00061 exit(-1); 00062 } 00063 00064 for (int i = 0; i < length; i++) { 00065 SequenceEntryList positionalSample; 00066 for(int j = 0; j < (int)sample_set.size(); j++) 00067 { 00068 int nseq = 0; 00069 int order = j; 00070 if(order > 10) 00071 order = 10; 00072 while(true) 00073 { 00074 int start = (length) * nseq - order + i; 00075 if(start < 0) { 00076 nseq++; 00077 continue; 00078 } 00079 int end = (length) * nseq + i; 00080 if(end >= (int)(sample_set[j]->getSequence()).size()) 00081 break; 00082 Sequence s; 00083 for(int k = start; k <= end; k++) 00084 s.push_back((sample_set[j]->getSequence())[k]); 00085 SequenceEntryPtr entry = SequenceEntryPtr(new SequenceEntry(alphabet)); 00086 entry->setSequence(s); 00087 positionalSample.push_back(entry); 00088 nseq++; 00089 } 00090 } 00091 double delta = (parameters.getOptionalParameterValue(cut[i])->getDouble()); 00092 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00093 tree->initializeContextTreeRissanen(positionalSample); 00094 tree->pruneTree(delta); 00095 tree->removeContextNotUsed(); 00096 tree->normalize(); 00097 positional_distribution[i] = tree; 00098 } 00099 InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr( 00100 new InhomogeneousMarkovChain()); 00101 model->setPositionSpecificDistribution(positional_distribution); 00102 model->phased(1); 00103 model->setAlphabet(alphabet); 00104 loglikelihood = 0.0; 00105 00106 for (int j = 0; j < (int) sample_set.size(); j++) { 00107 sample_size += (sample_set[j]->getSequence()).size(); 00108 loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size() 00109 - 1); 00110 } 00111 return model; 00112 00113 } 00114 ProbabilisticModelPtr TrainPhasedMarkovChainContextAlgorithm::create( 00115 ProbabilisticModelParameters & parameters) const { 00116 int size; 00117 double loglikelihood; 00118 return create(parameters, loglikelihood, size); 00119 } 00120 } 00121