ToPS
SmoothedHistogramKernelDensity.cpp
00001 /*
00002  *       SmoothedHistogramKernelDensity.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "SmoothedHistogramKernelDensity.hpp"
00026 #include "DiscreteIIDModel.hpp"
00027 #include "util.hpp"
00028 
00029 namespace tops {
00030 
00031 
00032 
00033   ProbabilisticModelPtr SmoothedHistogramKernelDensity::create( ProbabilisticModelParameters & parameters) const
00034   {
00035     ProbabilisticModelParameterValuePtr training_set_parameter =
00036       parameters.getMandatoryParameterValue("training_set");
00037     ProbabilisticModelParameterValuePtr maxlengthp =
00038         parameters.getOptionalParameterValue("max_length");
00039     long max = 15000;
00040     if(maxlengthp != NULL)
00041         max = maxlengthp->getInt();
00042 
00043 
00044     if(training_set_parameter == NULL) {
00045       std::cerr << help () << std::endl;
00046       ProbabilisticModelPtr nullmodel;
00047       exit(-1);
00048       return nullmodel;
00049     }
00050 
00051     DoubleVector data;
00052     AlphabetPtr alpha = AlphabetPtr(new Alphabet());;
00053     SequenceEntryList sample_set;
00054     readSequencesFromFile(sample_set, alpha, training_set_parameter->getString());
00055     for(int i = 0; i < (int)sample_set.size();i++)
00056       for(int j = 0; j < (int) (sample_set[i]->getSequence()).size(); j++)
00057         data.push_back((sample_set[i]->getSequence())[j]);
00058     std::map<long,double> sum;
00059     double total = 0.0;
00060 
00061     if(data.size() > 0)
00062       {
00063         double bandwidth = sj_bandwidth(data);
00064 
00065         for (int pos = 0; pos <= max; pos++) {
00066           sum[pos] = 0.0;
00067           double integral = 0.0;
00068           double min = kernel_density_estimation(pos-0.5, bandwidth, data);
00069           double max2 = kernel_density_estimation(pos+0.5, bandwidth, data);
00070           if(max2 < min) {
00071             double aux = min;
00072             min = max2;
00073             max2 = aux;
00074           }
00075           integral += min + (max2 - min)/2;
00076           sum[pos] = integral;
00077           total += integral;
00078         }
00079       }
00080 
00081     DoubleVector prob;
00082     prob.resize(max+2);
00083     for (int k = 0; k <= max; k++){
00084       prob[k] =  sum[k]/total;
00085     }
00086 
00087 
00088     ProbabilisticModelParameters pars;
00089     pars.add("probabilities", ProbabilisticModelParameterValuePtr (new DoubleVectorParameterValue(prob)));
00090     pars.add("alphabet", alpha->getParameterValue());
00091     DiscreteIIDModelPtr result =
00092       DiscreteIIDModelPtr(new DiscreteIIDModel());
00093     result->initialize(pars);
00094 
00095     return result;
00096 
00097   }
00098 
00099 }