ToPS
SmoothedHistogramBurge.cpp
00001 /*
00002  *       SmoothedHistogramBurge.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "SmoothedHistogramBurge.hpp"
00026 #include "DiscreteIIDModel.hpp"
00027 #include "util.hpp"
00028 namespace tops {
00029 
00030 
00031   ProbabilisticModelPtr SmoothedHistogramBurge::create( ProbabilisticModelParameters & parameters) const
00032   {
00033 
00034 
00035     ProbabilisticModelParameterValuePtr training_set_parameter =
00036       parameters.getMandatoryParameterValue("training_set");
00037     ProbabilisticModelParameterValuePtr cpar =
00038       parameters.getMandatoryParameterValue("C");
00039     ProbabilisticModelParameterValuePtr maxlengthp =
00040       parameters.getOptionalParameterValue("max_length");
00041     long max = 15000;
00042     if(maxlengthp != NULL)
00043         max = maxlengthp->getInt();
00044 
00045     if((training_set_parameter == NULL)||(cpar == NULL)) {
00046       std::cerr << help () << std::endl;
00047       ProbabilisticModelPtr nullmodel;
00048       exit(-1);
00049       return nullmodel;
00050     }
00051     double C = cpar->getDouble();
00052     DoubleVector data;
00053     AlphabetPtr alpha = AlphabetPtr(new Alphabet());;
00054     SequenceList sample_set;
00055     readSequencesFromFile(sample_set, alpha, training_set_parameter->getString());
00056 
00057     for(int i = 0; i < (int)sample_set.size();i++)
00058       for(int j = 0; j < (int) sample_set[i].size(); j++)
00059         data.push_back(sample_set[i][j]);
00060 
00061     typedef std::map<long,double> Lengths;
00062     int N = data.size();
00063     Lengths counter;
00064     Lengths sum;
00065     Lengths::const_iterator iter;
00066     for(int i = 0; i < N; i++){
00067       iter = counter.find((long)data[i]);
00068       if(iter == counter.end())
00069         counter[(long)data[i]] = 1.0;
00070       else
00071         counter[(long)data[i]] += 1.0;
00072     }
00073     double total = 0.0;
00074     for (int k = 1; k <= max; k++){
00075       int start = k - 10;
00076       int end = k + 10;
00077       if(start < 0)  start = 0;
00078       sum[k] = 0.0;
00079       for(int x = start; x < end; x++){
00080         iter = counter.find((long)x);
00081         if(iter != counter.end() && iter->second > 0.0){
00082           double nx = iter->second;
00083           double mean = x+1.0;
00084           double sd = sqrt(2*((double)(x+1.0))*C/nx);
00085           double px2 = 0.5*(1 + _erf((((double)k+1.5) - mean))/ (sd*sqrt(2.0)));
00086           double px1 = 0.5*(1 + _erf((((double)k+0.5) - mean))/ (sd*sqrt(2.0)));
00087           assert(nx > 0.0);
00088           assert(mean > 0.0);
00089           assert(sd > 0.0);
00090           sum[k] += nx*(px2 - px1);
00091         }
00092       }
00093       sum[k] = sum[k]/N;
00094       total = total+ sum[k];
00095     }
00096     double epsilon = 1e-5;
00097     total = total/(1 - max*epsilon);
00098 
00099     DoubleVector prob;
00100     prob.resize(max+1);
00101     for (int k = 1; k <= max; k++){
00102       prob[k] = epsilon + sum[k]/total;
00103     }
00104 
00105     ProbabilisticModelParameters pars;
00106     pars.add("probabilities", ProbabilisticModelParameterValuePtr (new DoubleVectorParameterValue(prob)));
00107     pars.add("alphabet", alpha->getParameterValue());
00108     DiscreteIIDModelPtr result =
00109       DiscreteIIDModelPtr(new DiscreteIIDModel());
00110     result->initialize(pars);
00111     return result;
00112   }
00113 
00114 }