ToPS
|
00001 /* 00002 * TrainPhasedMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 /* 00026 * TrainPhasedMarkovChain.cpp 00027 * 00028 * Created on: 4/Mai/2009 00029 * Author: yoshiaki 00030 */ 00031 #include "ContextTree.hpp" 00032 #include "TrainPhasedMarkovChain.hpp" 00033 #include "InhomogeneousMarkovChain.hpp" 00034 #include "ProbabilisticModelParameter.hpp" 00035 #include "ProbabilisticModelCreatorClient.hpp" 00036 namespace tops { 00037 00038 ProbabilisticModelPtr TrainPhasedMarkovChain::create( 00039 ProbabilisticModelParameters & parameters, double & loglikelihood, 00040 int & sample_size) const { 00041 ProbabilisticModelParameterValuePtr alphabetpar = 00042 parameters.getMandatoryParameterValue("alphabet"); 00043 ProbabilisticModelParameterValuePtr trainingsetpar = 00044 parameters.getMandatoryParameterValue("training_set"); 00045 ProbabilisticModelParameterValuePtr orderpar = 00046 parameters.getMandatoryParameterValue("order"); 00047 ProbabilisticModelParameterValuePtr number_of_phasespar = 00048 parameters.getMandatoryParameterValue("number_of_phases"); 00049 ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts"); 00050 ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori"); 00051 ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights"); 00052 std::map <std::string, double> weights; 00053 if(weightspar != NULL) { 00054 readMapFromFile(weights, weightspar->getString()); 00055 } 00056 00057 double pseudocounts = 0; 00058 if(pseudocountspar != NULL) 00059 pseudocounts = pseudocountspar->getDouble(); 00060 ProbabilisticModelPtr apriori; 00061 if(aprioripar != NULL) 00062 { 00063 ProbabilisticModelCreatorClient c; 00064 apriori = c.create(aprioripar->getString()); 00065 } 00066 00067 00068 00069 00070 if ((alphabetpar == NULL) || (trainingsetpar == NULL) 00071 || (orderpar == NULL) || (number_of_phasespar == NULL)) { 00072 std::cerr << help() << std::endl; 00073 exit(-1); 00074 } 00075 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00076 alphabet->initializeFromVector(alphabetpar ->getStringVector()); 00077 SequenceEntryList sample_set; 00078 readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString()); 00079 int order = orderpar->getInt(); 00080 int length = number_of_phasespar->getInt() ; 00081 std::vector<ContextTreePtr> positional_distribution; 00082 positional_distribution.resize(length); 00083 sample_size = 0; 00084 00085 for (int i = 0; i < length; i++) { 00086 SequenceEntryList positionalSample; 00087 for(int j = 0; j < (int)sample_set.size(); j++) 00088 { 00089 int nseq = 0; 00090 while(true) 00091 { 00092 int start = (length) * nseq - order + i; 00093 if(start < 0) { 00094 nseq++; 00095 continue; 00096 } 00097 int end = (length) * nseq + i; 00098 if(end >= (int)(sample_set[j]->getSequence()).size()) 00099 break; 00100 Sequence s; 00101 for(int k = start; k <= end; k++) 00102 s.push_back((sample_set[j]->getSequence())[k]); 00103 SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet)); 00104 entry->setSequence(s); 00105 positionalSample.push_back(entry); 00106 nseq++; 00107 } 00108 } 00109 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00110 00111 if(apriori != NULL){ 00112 tree->initializeCounter(sample_set, orderpar->getInt(), 0, weights); 00113 tree->normalize(apriori, pseudocounts, i); 00114 } else { 00115 00116 tree->initializeCounter(sample_set, orderpar->getInt(), pseudocounts, weights); 00117 tree->normalize(); 00118 } 00119 00120 00121 00122 positional_distribution[i] = tree; 00123 00124 } 00125 InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr( 00126 new InhomogeneousMarkovChain()); 00127 model->setPositionSpecificDistribution(positional_distribution); 00128 model->phased(1); 00129 model->setAlphabet(alphabet); 00130 loglikelihood = 0.0; 00131 00132 for (int j = 0; j < (int) sample_set.size(); j++) { 00133 sample_size += (sample_set[j]->getSequence()).size(); 00134 00135 loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size() 00136 - 1); 00137 } 00138 return model; 00139 00140 } 00141 ProbabilisticModelPtr TrainPhasedMarkovChain::create( 00142 ProbabilisticModelParameters & parameters) const { 00143 int size; 00144 double loglikelihood; 00145 return create(parameters, loglikelihood, size); 00146 } 00147 } 00148