ToPS
|
00001 /* 00002 * TrainWeightArrayModel.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "ContextTree.hpp" 00026 #include "TrainWeightArrayModel.hpp" 00027 #include "InhomogeneousMarkovChain.hpp" 00028 #include "ProbabilisticModelParameter.hpp" 00029 #include "ProbabilisticModelCreatorClient.hpp" 00030 #include "SequenceFactory.hpp" 00031 namespace tops{ 00032 00033 ProbabilisticModelPtr TrainWeightArrayModel::create( ProbabilisticModelParameters & parameters, double & loglikelihood, int & sample_size) const 00034 { 00035 ProbabilisticModelParameterValuePtr alphabetpar = parameters.getMandatoryParameterValue("alphabet"); 00036 ProbabilisticModelParameterValuePtr trainingsetpar = parameters.getMandatoryParameterValue("training_set"); 00037 ProbabilisticModelParameterValuePtr orderpar = parameters.getMandatoryParameterValue("order"); 00038 ProbabilisticModelParameterValuePtr lengthpar = parameters.getMandatoryParameterValue("length"); 00039 ProbabilisticModelParameterValuePtr offsetpar = parameters.getOptionalParameterValue("offset"); 00040 ProbabilisticModelParameterValuePtr vicinitypar = parameters.getOptionalParameterValue("vicinity_length"); 00041 ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts"); 00042 ProbabilisticModelParameterValuePtr fixseqpar = parameters.getOptionalParameterValue("fixed_sequence"); 00043 ProbabilisticModelParameterValuePtr fixseqpospar = parameters.getOptionalParameterValue("fixed_sequence_position"); 00044 ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori"); 00045 00046 ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights"); 00047 std::map <std::string, double> weights; 00048 if(weightspar != NULL) { 00049 readMapFromFile(weights, weightspar->getString()); 00050 } 00051 00052 00053 if((alphabetpar == NULL) || 00054 (trainingsetpar == NULL) || 00055 (lengthpar == NULL) || 00056 (orderpar == NULL) ) 00057 { 00058 std::cerr << help() << std::endl; 00059 exit(-1); 00060 } 00061 int offset = 0; 00062 double pseudocounts = 0; 00063 if(pseudocountspar != NULL) 00064 pseudocounts = pseudocountspar->getDouble(); 00065 ProbabilisticModelPtr apriori; 00066 if(aprioripar != NULL) 00067 { 00068 ProbabilisticModelCreatorClient c; 00069 apriori = c.create(aprioripar->getString()); 00070 } 00071 00072 if(offsetpar != NULL) 00073 offset = offsetpar ->getInt(); 00074 if(pseudocountspar != NULL) 00075 pseudocounts = pseudocountspar->getDouble(); 00076 00077 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00078 alphabet->initializeFromVector(alphabetpar ->getStringVector()); 00079 SequenceEntryList sample_set; 00080 readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString()); 00081 int vicinity = 0; 00082 if( vicinitypar != NULL) 00083 vicinity = vicinitypar->getInt(); 00084 int order = orderpar->getInt(); 00085 int length = lengthpar->getInt(); 00086 std::vector <ContextTreePtr> positional_distribution; 00087 positional_distribution.resize(length); 00088 sample_size = 0; 00089 bool fixseq = false; 00090 Sequence fixed; 00091 int fixed_pos = 0; 00092 if((fixseqpar != NULL) && (fixseqpospar != NULL)) { 00093 std::string seqstr = fixseqpar->getString(); 00094 SequenceFactory factory(alphabet); 00095 fixed = factory.createSequence(seqstr); 00096 fixed_pos = fixseqpospar->getInt(); 00097 fixseq = true; 00098 } 00099 00100 for(int i = 0; i < length; i++) 00101 { 00102 SequenceEntryList positionalSample; 00103 int o = i; 00104 if(o > order) o = order; 00105 for(int j = 0; j < (int)sample_set.size(); j++) 00106 { 00107 for(int k = -vicinity; k <= vicinity; k++) 00108 { 00109 Sequence s; 00110 s.resize(o+1); 00111 int l = 0; 00112 int m = i - o + k+ offset; 00113 00114 if(m < 0) 00115 continue; 00116 if((m + o) >= (int)(sample_set[j]->getSequence()).size()) 00117 continue; 00118 00119 00120 if(fixseq) { 00121 while( (m < (int) (sample_set[j]->getSequence()).size()) && (l <= o)) 00122 { 00123 s[l] = (sample_set[j]->getSequence())[m]; 00124 if(fixseq && (fixed_pos <= (m-k)) && ( (m-k) <= fixed_pos + fixed.size()-1)){ 00125 int p = m-fixed_pos-k; 00126 if((p >= 0) && (p < fixed.size())) 00127 s[l] = fixed[p]; 00128 } 00129 l++; m++; 00130 } 00131 } else { 00132 while( (m < (int) (sample_set[j]->getSequence()).size()) && (l <= o)) 00133 { 00134 s[l] = (sample_set[j]->getSequence())[m]; 00135 l++; m++; 00136 } 00137 } 00138 00139 SequenceEntryPtr entry = SequenceEntryPtr(new SequenceEntry(alphabet)); 00140 entry->setSequence(s); 00141 entry->setName(sample_set[j]->getName()); 00142 positionalSample.push_back(entry); 00143 } 00144 } 00145 if(fixseq && (fixed_pos <= i) && ((int)i <= ((int)fixed_pos + (int)fixed.size() - 1))){ 00146 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00147 tree->initializeCounter(positionalSample, o, 0, weights); 00148 tree->normalize(); 00149 positional_distribution[i] = tree; 00150 } else { 00151 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); 00152 00153 if(apriori != NULL && apriori->factorable() != NULL){ 00154 tree->initializeCounter(positionalSample, o, pseudocounts, weights); 00155 tree->normalize(apriori, pseudocounts); 00156 } else { 00157 tree->initializeCounter(positionalSample, o, pseudocounts, weights); 00158 tree->normalize(); 00159 00160 } 00161 positional_distribution[i] = tree; 00162 } 00163 00164 } 00165 InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); 00166 model->setPositionSpecificDistribution(positional_distribution); 00167 model->phased(0); 00168 model->setAlphabet(alphabet); 00169 loglikelihood = 0.0; 00170 00171 for(int j = 0; j < (int)sample_set.size(); j++) 00172 { 00173 sample_size += (sample_set[j]->getSequence()).size(); 00174 double v = model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size()-1); 00175 if (v <= -HUGE) 00176 continue; 00177 00178 loglikelihood += v; 00179 } 00180 return model; 00181 00182 } 00183 ProbabilisticModelPtr TrainWeightArrayModel::create( ProbabilisticModelParameters & parameters) const 00184 { 00185 int size; 00186 double loglikelihood; 00187 return create(parameters, loglikelihood, size); 00188 } 00189 }