ToPS
|
00001 /* 00002 * VariableLengthMarkovChain.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "VariableLengthMarkovChain.hpp" 00026 #include "VariableLengthMarkovChainCreator.hpp" 00027 #include "Symbol.hpp" 00028 #include <boost/algorithm/string.hpp> 00029 namespace tops { 00030 00031 std::string VariableLengthMarkovChain::print_graph () const { 00032 std::stringstream out; 00033 AlphabetPtr alphabet = ProbabilisticModel::alphabet(); 00034 int nnodes = _tree->getNumberOfNodes(); 00035 for (int i = 0; i < nnodes; i++) { 00036 ContextTreeNodePtr current = _tree->getContext(i); 00037 ContextTreeNodePtr root = _tree->getRoot(); 00038 std::vector <std::string> aux; 00039 while(current != root) { 00040 aux.push_back(alphabet->getSymbol(current->symbol())->name() ); 00041 current = _tree->getContext(current->getParent()); 00042 } 00043 std::stringstream node_id; 00044 for (int j = aux.size() - 1; j >= 0; j--) 00045 node_id << aux[j] ; 00046 out << _tree->getContext(i)->id() << " " << node_id.str() << std::endl; 00047 } 00048 out << "#" << std::endl; 00049 for (int i = 0; i < nnodes; i++) { 00050 out << _tree->getContext(i)->id() << " " << _tree->getContext(_tree->getContext(i)->getParent())->id() << std::endl; 00051 } 00052 return out.str(); 00053 } 00054 00055 00056 void VariableLengthMarkovChain::printDistribution(ContextTreePtr tree, ContextTreeNodePtr node, std::stringstream & out, AlphabetPtr alphabet) const { 00057 ContextTreeNodePtr current = node; 00058 std::vector <std::string> aux; 00059 bool root=true; 00060 if(current != tree->getRoot() ) 00061 root = false; 00062 while(current != tree->getRoot()) 00063 { 00064 aux.push_back(alphabet->getSymbol(current->symbol())->name() ); 00065 current = tree->getContext(current->getParent()); 00066 } 00067 for(int k = 0; k < (int)alphabet->size(); k++) 00068 { 00069 out << "\"" << alphabet->getSymbol(k)->name() << "\" | " ; 00070 if(root) 00071 out << "\"\" " ; 00072 else 00073 { 00074 out << "\""; 00075 out << aux[aux.size() -1]; 00076 for(int i = aux.size()-2; i >=0; i--) 00077 out << " " << aux[i] ; 00078 out << "\""; 00079 } 00080 out << ": " ; 00081 out << exp(node->getDistribution()->log_probability_of(k)) << ";"; 00082 if(node->isLeaf()) 00083 out << " # leaf"; 00084 out << std::endl; 00085 } 00086 if(!node->isLeaf()) 00087 for(int l = 0; l < node->alphabet_size(); l++) 00088 if(node->getChild(l) != NULL) 00089 printDistribution(tree, node->getChild(l), out, alphabet); 00090 } 00091 00092 00093 double VariableLengthMarkovChain::evaluatePosition(const Sequence & s, unsigned int i)const{ 00094 ContextTreeNodePtr c = _tree->getContext(s,i); 00095 if (c == NULL) 00096 return -HUGE; 00097 else 00098 return c->getDistribution()->evaluatePosition(s,i); 00099 } 00100 00101 double VariableLengthMarkovChain::choosePosition(const Sequence & s, int i)const{ 00102 ContextTreeNodePtr c = _tree->getContext(s,i); 00103 if (c == NULL) 00104 return -HUGE; 00105 else 00106 return c->getDistribution()->choosePosition(s,i); 00107 } 00108 00109 ProbabilisticModelCreatorPtr VariableLengthMarkovChain::getFactory() const { 00110 return VariableLengthMarkovChainCreatorPtr (new VariableLengthMarkovChainCreator()); 00111 } 00112 00113 int VariableLengthMarkovChain::size() const { 00114 return (alphabet()->size() -1)* _tree->getNumberOfNodes(); 00115 } 00116 00117 std::string VariableLengthMarkovChain::str() const { 00118 std::stringstream out; 00119 if(_tree == NULL){ 00120 std::cerr << "This Markov chain model was not defined !" << std::endl; 00121 return out.str(); 00122 } 00123 00124 out << "model_name = \"" << model_name() << "\"" << std::endl; 00125 out << "probabilities = ("; 00126 printDistribution( _tree, _tree->getRoot(), out, alphabet()) ; 00127 out << ")" << std::endl; 00128 out << alphabet()->str() ; 00129 return out.str(); 00130 } 00131 00132 ProbabilisticModelParameters VariableLengthMarkovChain::parameters() const 00133 { 00134 ProbabilisticModelParameters p; 00135 p.add("model_name", StringParameterValuePtr(new StringParameterValue("VariableLengthMarkovChain"))); 00136 p.add("probabilities", _tree->getParameterValue()); 00137 p.add("alphabet", alphabet()->getParameterValue()); 00138 return p; 00139 } 00140 00141 void VariableLengthMarkovChain::initialize(const ProbabilisticModelParameters & p) { 00142 ProbabilisticModelParameterValuePtr probs = p.getMandatoryParameterValue("probabilities"); 00143 ProbabilisticModelParameterValuePtr symbols = p.getMandatoryParameterValue("alphabet"); 00144 if((probs == NULL) || (symbols == NULL)) 00145 { 00146 return; 00147 } 00148 AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); 00149 alphabet->initializeFromVector(symbols->getStringVector()); 00150 ContextTreePtr tree = 00151 ContextTreePtr(new ContextTree(alphabet)); 00152 ContextTreeNodePtr root = tree->createContext(); 00153 00154 std::string root_str(""); 00155 setTree(tree); 00156 setAlphabet(alphabet); 00157 00158 map<std::string,DoubleVector> probmap; 00159 std::map <std::string,double>::const_iterator it; 00160 std::map <std::string,std::vector<double> >::const_iterator it2; 00161 for(it = (probs->getDoubleMap()).begin(); 00162 it !=(probs->getDoubleMap()).end(); 00163 it++) 00164 { 00165 std::vector <std::string> splited; 00166 boost::regex separator("\\|"); 00167 tops::split_regex(it->first, splited, separator); 00168 if(splited.size() == 1) { 00169 splited.push_back(""); 00170 } 00171 00172 std::string context (splited[1]); 00173 std::string symbol ( splited[0]); 00174 00175 if(probmap.find(context) == probmap.end()) 00176 { 00177 int id = alphabet->getSymbol(symbol)->id(); 00178 DoubleVector probs; 00179 probs.resize(alphabet->size()); 00180 probmap[context]=probs; 00181 if(id < (int)probmap[context].size()) 00182 (probmap[context])[id] = it->second; 00183 } 00184 else 00185 { 00186 int id = alphabet->getSymbol(symbol)->id(); 00187 if(id < (int)probmap[context].size()) 00188 (probmap[context])[id] = it->second; 00189 } 00190 } 00191 00192 for(it2 = probmap.begin(); it2 != probmap.end(); it2++) { 00193 std::vector <std::string> history; 00194 std::string context = it2->first; 00195 std::vector<double> p = it2->second; 00196 boost::split(history, context, boost::is_any_of(" ")); 00197 if(context.size() <= 0) 00198 { 00199 ProbabilisticModelParameters fddpar ; 00200 fddpar.add("probabilities", DoubleVectorParameterValuePtr(new DoubleVectorParameterValue(p))); 00201 fddpar.add("alphabet", alphabet->getParameterValue()); 00202 DiscreteIIDModelPtr distr = 00203 DiscreteIIDModelPtr(new DiscreteIIDModel()); 00204 distr->initialize(fddpar); 00205 root->setDistribution(distr); 00206 } 00207 else 00208 { 00209 ContextTreeNodePtr current = root; 00210 // history with indices -1, -2, -3,... 00211 for(int i = 0; i < (int)history.size() ;i++) 00212 { 00213 int id = alphabet->getSymbol(history[i])->id(); 00214 if(id < 0) 00215 { 00216 std::cerr << "ERROR: Symbol, '" << history[i] << "', not in the alphabet !" << std::endl; 00217 exit(-1); 00218 } 00219 00220 if(current->getChild(id) == NULL) 00221 { 00222 ContextTreeNodePtr node2 = tree->createContext(); 00223 current->setChild(node2, id); 00224 if(i == ((int)history.size() - 1)) 00225 { 00226 ProbabilisticModelParameters fddpar ; 00227 fddpar.add("probabilities", DoubleVectorParameterValuePtr(new DoubleVectorParameterValue(p))); 00228 fddpar.add("alphabet", alphabet->getParameterValue()); 00229 DiscreteIIDModelPtr distr = 00230 DiscreteIIDModelPtr(new DiscreteIIDModel()); 00231 distr->initialize(fddpar); 00232 node2->setDistribution(distr); 00233 } 00234 else 00235 { 00236 DoubleVector probs2; 00237 for(int i = 0; i < (int)alphabet->size()-1; i++) 00238 probs2.push_back(1.0/(double)alphabet->size()); 00239 ProbabilisticModelParameters fddpar ; 00240 fddpar.add("probabilities", DoubleVectorParameterValuePtr(new DoubleVectorParameterValue(probs2))); 00241 fddpar.add("alphabet", alphabet->getParameterValue()); 00242 DiscreteIIDModelPtr distr = 00243 DiscreteIIDModelPtr(new DiscreteIIDModel()); 00244 distr->initialize(fddpar); 00245 node2->setDistribution(distr); 00246 } 00247 } 00248 current = current->getChild(id); 00249 } 00250 } 00251 } 00252 00253 } 00254 00255 void VariableLengthMarkovChain::removeSequenceFromModel(const Sequence & s, int phase){ 00256 ContextTreeNodePtr c = _tree->getRoot(); 00257 ContextTreeNodePtr p; 00258 int j; 00259 for(j = s.size()-2; j >=0; j--){ 00260 if(c->isLeaf()) 00261 break; 00262 p = c; 00263 c = c->getChild(s[j]); 00264 if(c == NULL) 00265 { 00266 c = p; 00267 break; 00268 } 00269 } 00270 for(; j >= 0; j--) { 00271 for(int l = 0; l < (int)alphabet()->size(); l++) { 00272 ContextTreeNodePtr n = _tree->createContext(); 00273 DoubleVector prob; 00274 for(int i = 0; i < n->alphabet_size(); i++) 00275 prob.push_back(exp(c ->getDistribution()->log_probability_of(i))); 00276 DiscreteIIDModelPtr distr = DiscreteIIDModelPtr(new DiscreteIIDModel(prob)); 00277 n->setDistribution(distr); 00278 c->setChild(n, l); 00279 } 00280 c = c->getChild(s[j]); 00281 } 00282 00283 ContextTreeNodePtr current = _tree->getContext(s,s.size()-1); 00284 00285 00286 current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE); 00287 double sum = 0; 00288 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00289 { 00290 sum += exp(current->getDistribution()->log_probability_of(symbol)); 00291 } 00292 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00293 { 00294 if(symbol != s[s.size()-1]) { 00295 double x = exp(current->getDistribution()->log_probability_of(symbol)); 00296 current->getDistribution()->log_probability_of(symbol, log(x/sum)); 00297 } 00298 } 00299 00300 00301 std::vector <ContextTreeNodePtr> children = current->getChildren(); 00302 if(current->isLeaf()) 00303 return; 00304 std::vector <ContextTreeNodePtr> stack; 00305 for(int i = 0 ; i < (int)children.size(); i++) 00306 stack.push_back(children[i]); 00307 #if 1 00308 while(stack.size() > 0) { 00309 current = stack.back(); 00310 stack.pop_back(); 00311 if(current == NULL) 00312 continue; 00313 assert(current->getDistribution() != NULL); 00314 00315 00316 00317 current->getDistribution()->log_probability_of(s[s.size()-1], -HUGE); 00318 double sum = 0; 00319 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00320 { 00321 sum += exp(current->getDistribution()->log_probability_of(symbol)); 00322 } 00323 for(int symbol = 0; symbol < alphabet()->size(); symbol++) 00324 { 00325 if(symbol != s[s.size()-1]) { 00326 double x = exp(current->getDistribution()->log_probability_of(symbol)); 00327 current->getDistribution()->log_probability_of(symbol, log(x/sum)); 00328 } 00329 } 00330 if(current->isLeaf()) 00331 continue; 00332 children = current->getChildren(); 00333 for(int i = 0 ; i < (int)children.size(); i++) 00334 stack.push_back(children[i]); 00335 } 00336 #endif 00337 00338 00339 00340 00341 } 00342 00343 }