ToPS
BayesianInformationCriteria.cpp
00001 /*
00002  *       BayesianInformationCriteria.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "BayesianInformationCriteria.hpp"
00026 #include <map>
00027 #include <vector>
00028 
00029 namespace tops
00030 {
00031   std::string BayesianInformationCriteria::help() const {
00032     std::stringstream out;
00033     out << "\nUSAGE: " << std::endl;
00034     out << "Mandatory parameters: " << std::endl;
00035     out << "\nbegin" << std::endl;
00036     out << "\tend" << std::endl;
00037     out << "\tstep" << std::endl;
00038     out << "Example: " << std::endl;
00039     out << "The configuration file below specify the BIC to select the WAM with the best order" << std::endl;
00040     out << "training_algorithm=\"WeightArrayModel\"" << std::endl;
00041     out << "training_set=\"dataset/sequences.txt\"" << std::endl;
00042     out << "alphabet=(\"A\", \"C\", \"G\", \"T\")" << std::endl;
00043     out << "length=31" << std::endl;
00044     out << "vicinity_length = 0" << std::endl;
00045     out << "pseudo_counts=0" << std::endl;
00046     out << "model_selection_criteria = \"BIC\"" << std::endl;
00047     out << "begin = (\"order\": 0)" << std::endl;
00048     out << "end = (\"order\": 3)" << std::endl;
00049     out << "step = (\"order\": 1)" << std::endl;
00050     return out.str();
00051   }
00052 
00053   ProbabilisticModelPtr BayesianInformationCriteria::create( ProbabilisticModelParameters & parameters) const
00054   {
00055     ProbabilisticModelParameterValuePtr beginpar = parameters.getMandatoryParameterValue("begin");
00056     ProbabilisticModelParameterValuePtr endpar = parameters.getMandatoryParameterValue("end");
00057     ProbabilisticModelParameterValuePtr steppar = parameters.getMandatoryParameterValue("step");
00058     ProbabilisticModelParameterValuePtr trainpar = parameters.getMandatoryParameterValue("training_algorithm");
00059 
00060     if((beginpar == NULL) ||
00061        (endpar == NULL) ||
00062        (steppar == NULL))
00063       {
00064         std::cerr << help() << std::endl;
00065         exit(-1);
00066       }
00067     std::map<std::string, double> beginmap = beginpar->getDoubleMap();
00068     std::map<std::string, double> endmap = endpar->getDoubleMap();
00069     std::map<std::string, double> stepmap = steppar->getDoubleMap();
00070     std::vector <std::string> parnames;
00071     std::map <std::string, double>::const_iterator it;
00072     for(it = beginmap.begin(); it!=beginmap.end(); it++)
00073       parnames.push_back(it->first);
00074 
00075     // generates all possible combinations of parameters
00076     std::vector <std::string>::const_iterator p;
00077     std::vector <std::vector <double> > combinations;
00078     combinations.resize(1);
00079     for(p = parnames.begin(); p != parnames.end(); p++)
00080       {
00081         std::vector <std::vector <double> > new_combinations;
00082         for (double i = beginmap[*p] ; i <= endmap[*p]; i += stepmap[*p])
00083           {
00084             for(int j = 0; j < (int) combinations.size() ; j++)
00085               {
00086                 std::vector <double> comb;
00087                 for(int k = 0; k < (int)combinations[j].size(); k++)
00088                   comb.push_back(combinations[j][k]);
00089                 comb.push_back(i);
00090                 new_combinations.push_back(comb);
00091               }
00092           }
00093         combinations = new_combinations;
00094       }
00095     double bic = -HUGE;
00096     ProbabilisticModelPtr result;
00097     for(int i = 0; i < (int) combinations.size(); i++)
00098       {
00099         for(int j = 0; j < (int) combinations[i].size(); j++)
00100           {
00101             DoubleParameterValuePtr value = DoubleParameterValuePtr(new DoubleParameterValue(combinations[i][j]));
00102 
00103             parameters.set(parnames[j], value);
00104           }
00105         double loglikelihood;
00106         int sample_size;
00107         ProbabilisticModelPtr m = _creator->create(parameters, loglikelihood, sample_size);
00108         if(m == NULL)
00109             {
00110                 std::cerr << "ERROR (BIC): could not estimate the model size using BIC " << std::endl;
00111                 exit(-1);
00112             }
00113         double model_bic =  loglikelihood  - 0.5* m->size() * log((double) sample_size);
00114         if((bic < model_bic) || (result == NULL))
00115           {
00116             bic = model_bic;
00117             result = m;
00118           }
00119 
00120         int k = 0;
00121         if( k < (int) combinations[i].size())
00122           {
00123             std::cerr << combinations[i][k];
00124             for(k = 1; k < (int) combinations[i].size(); k++)
00125               std::cerr << "\t" << combinations[i][k];
00126           }
00127         std::cerr << "\t" << model_bic  << std::endl;
00128 
00129       }
00130     std::cout << "# Model BIC: " << bic  << std::endl;
00131     return result;
00132   }
00133 }