ToPS
|
00001 /* 00002 * TrainInterpolatedPhasedMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ContextTree.hpp" 00026 #include "TrainInterpolatedPhasedMarkovChain.hpp" 00027 #include "InhomogeneousMarkovChain.hpp" 00028 #include "ProbabilisticModelParameter.hpp" 00029 #include "ProbabilisticModelCreatorClient.hpp" 00030 namespace tops { 00031 00032 ProbabilisticModelPtr TrainInterpolatedPhasedMarkovChain::create( 00033 ProbabilisticModelParameters & parameters, double & loglikelihood, 00034 int & sample_size) const { 00035 ProbabilisticModelParameterValuePtr alphabetpar = 00036 parameters.getMandatoryParameterValue("alphabet"); 00037 ProbabilisticModelParameterValuePtr trainingsetpar = 00038 parameters.getMandatoryParameterValue("training_set"); 00039 ProbabilisticModelParameterValuePtr trainingsetpar1 = 00040 parameters.getOptionalParameterValue("training_set_1"); 00041 ProbabilisticModelParameterValuePtr trainingsetpar2 = 00042 parameters.getOptionalParameterValue("training_set_2"); 00043 00044 00045 ProbabilisticModelParameterValuePtr orderpar = 00046 parameters.getMandatoryParameterValue("order"); 00047 ProbabilisticModelParameterValuePtr number_of_phasespar = 00048 parameters.getMandatoryParameterValue("number_of_phases"); 00049 ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts"); 00050 ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori"); 00051 ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights"); 00052 std::map <std::string, double> weights; 00053 if(weightspar != NULL) { 00054 readMapFromFile(weights, weightspar->getString()); 00055 } 00056 00057 double pseudocounts = 0; 00058 if(pseudocountspar != NULL) 00059 pseudocounts = pseudocountspar->getDouble(); 00060 ProbabilisticModelPtr apriori; 00061 if(aprioripar != NULL) 00062 { 00063 ProbabilisticModelCreatorClient c; 00064 apriori = c.create(aprioripar->getString()); 00065 } 00066 00067 00068 if ((alphabetpar == NULL) || (trainingsetpar == NULL) 00069 || (orderpar == NULL) || (number_of_phasespar == NULL)) { 00070 std::cerr << help() << std::endl; 00071 exit(-1); 00072 } 00073 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00074 alphabet->initializeFromVector(alphabetpar ->getStringVector()); 00075 SequenceEntryList sample_set; 00076 readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString()); 00077 SequenceEntryList sample_set_1; 00078 if(trainingsetpar1 != NULL) 00079 { 00080 readSequencesFromFile(sample_set_1, alphabet, trainingsetpar1->getString()); 00081 } 00082 SequenceEntryList sample_set_2; 00083 if(trainingsetpar2 != NULL) 00084 { 00085 readSequencesFromFile(sample_set_2, alphabet, trainingsetpar1->getString()); 00086 } 00087 00088 int order = orderpar->getInt(); 00089 int length = number_of_phasespar->getInt() ; 00090 std::vector<ContextTreePtr> positional_distribution; 00091 positional_distribution.resize(length); 00092 sample_size = 0; 00093 00094 for (int i = 0; i < length; i++) { 00095 SequenceEntryList positionalSample; 00096 for(int j = 0; j < (int)sample_set.size(); j++) 00097 { 00098 int nseq = 0; 00099 std::string name = sample_set[j]->getName(); 00100 while(true) 00101 { 00102 int start = (length) * nseq - order + i; 00103 if(start < 0) { 00104 nseq++; 00105 continue; 00106 } 00107 int end = (length) * nseq + i; 00108 if(end >= (int)(sample_set[j]->getSequence()).size()) 00109 break; 00110 Sequence s; 00111 for(int k = start; k <= end; k++) 00112 s.push_back((sample_set[j]->getSequence())[k]); 00113 SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet)); 00114 entry->setSequence(s); 00115 entry->setName(name); 00116 positionalSample.push_back(entry); 00117 nseq++; 00118 } 00119 } 00120 00121 00122 00123 00124 00125 for(int j = 0; j < (int)sample_set_1.size(); j++) 00126 { 00127 int nseq = 0; 00128 std::string name = sample_set_1[j]->getName(); 00129 while(true) 00130 { 00131 int start = (length) * nseq - order + i -1; 00132 if(start < 0) { 00133 nseq++; 00134 continue; 00135 } 00136 int end = (length) * nseq + i -1; 00137 if(end >= (int)(sample_set_1[j]->getSequence()).size()) 00138 break; 00139 Sequence s; 00140 for(int k = start; k <= end; k++) 00141 s.push_back((sample_set_1[j]->getSequence())[k]); 00142 SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet)); 00143 entry->setSequence(s); 00144 entry->setName(name); 00145 positionalSample.push_back(entry); 00146 nseq++; 00147 } 00148 } 00149 00150 for(int j = 0; j < (int)sample_set_2.size(); j++) 00151 { 00152 int nseq = 0; 00153 std::string name = sample_set_2[j]->getName(); 00154 while(true) 00155 { 00156 int start = (length) * nseq - order + i -2; 00157 if(start < 0) { 00158 nseq++; 00159 continue; 00160 } 00161 int end = (length) * nseq + i -2; 00162 if(end >= (int)(sample_set_2[j]->getSequence()).size()) 00163 break; 00164 Sequence s; 00165 for(int k = start; k <= end; k++) 00166 s.push_back((sample_set_2[j]->getSequence())[k]); 00167 SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet)); 00168 entry->setSequence(s); 00169 entry->setName(name); 00170 positionalSample.push_back(entry); 00171 nseq++; 00172 } 00173 } 00174 00175 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00176 00177 if(apriori != NULL){ 00178 tree->initializeCounter(positionalSample, order, 0, weights); 00179 tree->pruneTreeSmallSampleSize(400); 00180 tree->normalize(apriori, pseudocounts, i); 00181 00182 } else { 00183 tree->initializeCounter(positionalSample, order, pseudocounts, weights); 00184 tree->pruneTreeSmallSampleSize(400); 00185 tree->normalize(); 00186 } 00187 00188 positional_distribution[i] = tree; 00189 } 00190 InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr( 00191 new InhomogeneousMarkovChain()); 00192 model->setPositionSpecificDistribution(positional_distribution); 00193 model->phased(1); 00194 model->setAlphabet(alphabet); 00195 loglikelihood = 0.0; 00196 00197 for (int j = 0; j < (int) sample_set.size(); j++) { 00198 sample_size += (sample_set[j]->getSequence()).size(); 00199 00200 loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size() 00201 - 1); 00202 } 00203 return model; 00204 00205 } 00206 ProbabilisticModelPtr TrainInterpolatedPhasedMarkovChain::create( 00207 ProbabilisticModelParameters & parameters) const { 00208 int size; 00209 double loglikelihood; 00210 return create(parameters, loglikelihood, size); 00211 } 00212 } 00213