ToPS
|
00001 /* 00002 * InhomogeneousMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any loater version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "InhomogeneousMarkovChain.hpp" 00026 #include "Alphabet.hpp" 00027 #include "ProbabilisticModelParameter.hpp" 00028 #include "Symbol.hpp" 00029 #include "VariableLengthMarkovChainCreator.hpp" 00030 00031 namespace tops{ 00033 double InhomogeneousMarkovChain::evaluatePosition(const Sequence & s, int i, int t) const{ 00034 00035 assert((t >= 0) && (t < (int)_context_trees.size())); 00036 ContextTreeNodePtr context = _context_trees[t]->getContext(s,i); 00037 if(context == NULL) { 00038 return -HUGE; 00039 } 00040 DiscreteIIDModelPtr distr = context->getDistribution(); 00041 if(distr == NULL) 00042 return -HUGE; 00043 return distr->evaluatePosition(s,i); 00044 } 00045 00047 int InhomogeneousMarkovChain::choosePosition(const Sequence & s, int i, int t) const{ 00048 assert((t >= 0) && (t < (int)_context_trees.size())); 00049 return _context_trees[t]->getContext(s,i)->getDistribution()->choosePosition(s,i); 00050 } 00051 00052 std::string InhomogeneousMarkovChain::str() const { 00053 std::stringstream out; 00054 out << "model_name = \"" << model_name() << "\"" << std::endl; 00055 00056 for(int i = 0; i < (int)_context_trees.size(); i++) 00057 { 00058 out << "p" << i << " = ("; 00059 printDistribution( _context_trees[i], _context_trees[i]->getRoot(), out, alphabet()) ; 00060 out << ")" << std::endl; 00061 } 00062 out << "position_specific_distribution = ("; 00063 if (_context_trees.size() >= 1){ 00064 out << "\"p0\""; 00065 for(int i = 1; i < (int)_context_trees.size(); i++) 00066 { 00067 out <<",\"p" << i << "\""; 00068 } 00069 } 00070 out << ")\n"; 00071 out << "phased =" << _phased << std::endl; 00072 out << alphabet()->str(); 00073 return out.str(); 00074 } 00075 00076 int InhomogeneousMarkovChain::size() const 00077 { 00078 int size= 0; 00079 for(int i = 0; i < (int)_context_trees.size(); i++) 00080 size += _context_trees[i]->getNumberOfNodes(); 00081 return size; 00082 } 00083 00084 void InhomogeneousMarkovChain::printDistribution(ContextTreePtr tree, ContextTreeNodePtr node, std::stringstream & out, AlphabetPtr alphabet) const 00085 { 00086 ContextTreeNodePtr current = node; 00087 std::vector <std::string> aux; 00088 bool root=true; 00089 if(current != tree->getRoot() ) 00090 root = false; 00091 while(current != tree->getRoot()) 00092 { 00093 aux.push_back(alphabet->getSymbol(current->symbol())->name() ); 00094 current = tree->getContext(current->getParent()); 00095 } 00096 for(int k = 0; k < (int)alphabet->size(); k++) 00097 { 00098 // if(close(exp(node->getDistribution()->log_probability_of(k)), 0.0, 1e-100)) 00099 // continue; 00100 out << "\"" << alphabet->getSymbol(k)->name() << "\" | " ; 00101 if(root) 00102 out << "\"\" " ; 00103 else 00104 { 00105 out << "\""; 00106 out << aux[aux.size() -1]; 00107 for(int i = aux.size()-2; i >=0; i--) 00108 out << " " << aux[i] ; 00109 out << "\""; 00110 } 00111 out << ": " ; 00112 out << exp(node->getDistribution()->log_probability_of(k)) << ";"; 00113 if(node->isLeaf()) 00114 out << " # leaf"; 00115 out << std::endl; 00116 00117 } 00118 if(!node->isLeaf()) 00119 for(int l = 0; l < node->alphabet_size(); l++) 00120 if(node->getChild(l) != NULL) 00121 printDistribution(tree, node->getChild(l), out, alphabet); 00122 } 00123 void InhomogeneousMarkovChain::removeSequenceFromModel(const Sequence & s, int phase){ 00124 for(int i = 0; i < (int)s.size()-1; i++) 00125 phase = mod(phase + 1, maximumTimeValue()+1); 00126 ContextTreeNodePtr c = _context_trees[phase]->getRoot(); 00127 ContextTreeNodePtr p; 00128 int j; 00129 for(j = s.size()-2; j >=0; j--){ 00130 if(c->isLeaf()) 00131 break; 00132 p = c; 00133 c = c->getChild(s[j]); 00134 if(c == NULL) 00135 { 00136 c = p; 00137 break; 00138 } 00139 } 00140 for(; j >= 0; j--) { 00141 for(int l = 0; l < (int)alphabet()->size(); l++) { 00142 ContextTreeNodePtr n = _context_trees[phase]->createContext(); 00143 DoubleVector prob; 00144 for(int i = 0; i < n->alphabet_size(); i++) 00145 prob.push_back(exp(c ->getDistribution()->log_probability_of(i))); 00146 DiscreteIIDModelPtr distr = DiscreteIIDModelPtr(new DiscreteIIDModel(prob)); 00147 n->setDistribution(distr); 00148 c->setChild(n, l); 00149 } 00150 c = c->getChild(s[j]); 00151 } 00152 00153 ContextTreeNodePtr current = _context_trees[phase]->getContext(s,s.size()-1); 00154 00155 00156 current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE); 00157 double sum = 0; 00158 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00159 { 00160 sum += exp(current->getDistribution()->log_probability_of(symbol)); 00161 } 00162 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00163 { 00164 if(symbol != s[s.size()-1]) { 00165 double x = exp(current->getDistribution()->log_probability_of(symbol)); 00166 current->getDistribution()->log_probability_of(symbol, log(x/sum)); 00167 if(close(x, 0, 1e-10)) 00168 current->getDistribution()->log_probability_of(symbol, -HUGE); 00169 00170 } 00171 } 00172 00173 00174 std::vector <ContextTreeNodePtr> children = current->getChildren(); 00175 if(current->isLeaf()) 00176 return; 00177 std::vector <ContextTreeNodePtr> stack; 00178 for(int i = 0 ; i < (int)children.size(); i++) 00179 stack.push_back(children[i]); 00180 #if 1 00181 while(stack.size() > 0) { 00182 current = stack.back(); 00183 stack.pop_back(); 00184 if(current == NULL) 00185 continue; 00186 assert(current->getDistribution() != NULL); 00187 00188 00189 00190 current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE); 00191 double sum = 0; 00192 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00193 { 00194 sum += exp(current->getDistribution()->log_probability_of(symbol)); 00195 } 00196 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00197 { 00198 if(symbol != s[s.size()-1]) { 00199 double x = exp(current->getDistribution()->log_probability_of(symbol)); 00200 current->getDistribution()->log_probability_of(symbol, log(x/sum)); 00201 if(close(x, 0, 1e-10)) 00202 current->getDistribution()->log_probability_of(symbol, -HUGE); 00203 00204 } 00205 } 00206 if(current->isLeaf()) 00207 continue; 00208 children = current->getChildren(); 00209 for(int i = 0 ; i < (int)children.size(); i++) 00210 stack.push_back(children[i]); 00211 } 00212 #endif 00213 00214 00215 00216 00217 } 00218 ProbabilisticModelParameters InhomogeneousMarkovChain::parameters() const { 00219 ProbabilisticModelParameters par; 00220 par.add("model_name", StringParameterValuePtr(new StringParameterValue(model_name()))); 00221 StringVector parnames; 00222 for(int i = 0; i <(int)_context_trees.size();i++) 00223 { 00224 std::stringstream out; 00225 out << "p" << i; 00226 parnames.push_back(out.str()); 00227 } 00228 par.add("position_specific_distribution" , 00229 StringVectorParameterValuePtr(new StringVectorParameterValue(parnames))); 00230 for(int i = 0; i <(int)_context_trees.size();i++) 00231 { 00232 std::stringstream out; 00233 out << "p" << i; 00234 par.add(out.str(), _context_trees[i]->getParameterValue()); 00235 } 00236 par.add("phased", IntParameterValuePtr(new IntParameterValue(_phased))); 00237 par.add("alphabet", alphabet()->getParameterValue()); 00238 return par; 00239 } 00240 void InhomogeneousMarkovChain::initialize(const ProbabilisticModelParameters & parameters) { 00241 ProbabilisticModelParameterValuePtr symbols = parameters.getMandatoryParameterValue("alphabet"); 00242 ProbabilisticModelParameterValuePtr position_specific_model_names = parameters.getMandatoryParameterValue("position_specific_distribution"); 00243 ProbabilisticModelParameterValuePtr p = parameters.getMandatoryParameterValue("phased"); 00244 00245 bool is_phased = (p->getInt() == 1); 00246 00247 00248 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00249 alphabet->initializeFromVector(symbols->getStringVector()); 00250 std::vector <ContextTreePtr> position_specific_trees; 00251 StringVector model_names = position_specific_model_names->getStringVector(); 00252 position_specific_trees.resize(model_names.size()); 00253 for(int i = 0; i < (int)model_names.size(); i++) 00254 { 00255 ProbabilisticModelParameters vlmc; 00256 ProbabilisticModelParameterValuePtr probs = parameters.getMandatoryParameterValue(model_names[i]); 00257 if (probs == NULL) 00258 { 00259 std::cerr << "ERROR: Could not find " << model_names[i] << std::endl; 00260 } 00261 00262 vlmc.add("alphabet", symbols); 00263 vlmc.add("probabilities", probs); 00264 00265 VariableLengthMarkovChainCreator vlmcCreator; 00266 VariableLengthMarkovChainPtr mc = vlmcCreator.createVLMC(vlmc); 00267 position_specific_trees[i] = mc->getTree(); 00268 } 00269 phased(is_phased); 00270 setAlphabet(alphabet); 00271 setPositionSpecificDistribution(position_specific_trees); 00272 } 00273 } 00274