ToPS
SmoothedHistogramStanke.cpp
00001 /*
00002  *       SmoothedHistogramStanke.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "SmoothedHistogramStanke.hpp"
00026 #include "DiscreteIIDModel.hpp"
00027 #include "util.hpp"
00028 namespace tops {
00029 
00030 
00031   ProbabilisticModelPtr SmoothedHistogramStanke::create( ProbabilisticModelParameters & parameters) const
00032   {
00033     ProbabilisticModelParameterValuePtr training_set_parameter =
00034       parameters.getMandatoryParameterValue("training_set");
00035     ProbabilisticModelParameterValuePtr maxlengthp =
00036       parameters.getOptionalParameterValue("max_length");
00037     ProbabilisticModelParameterValuePtr mp =
00038       parameters.getOptionalParameterValue("m");
00039     ProbabilisticModelParameterValuePtr slopep =
00040       parameters.getOptionalParameterValue("slope");
00041 
00042     ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights");
00043     std::map <std::string, double> weights;
00044     if(weightspar != NULL) {
00045       readMapFromFile(weights, weightspar->getString());
00046     }
00047 
00048     double a = 0.5;
00049     int m = 8;
00050 
00051     if(mp != NULL)
00052         m = mp->getInt();
00053 
00054     if(slopep != NULL)
00055         a = slopep->getDouble();
00056 
00057     long max = 15000;
00058     if(maxlengthp != NULL)
00059         max = maxlengthp->getInt();
00060     int L = max;
00061     max = max + 4 * a * max;
00062 
00063     if(training_set_parameter == NULL) {
00064       ProbabilisticModelPtr nullmodel;
00065       exit(-1);
00066       return nullmodel;
00067     }
00068 
00069     DoubleVector data;
00070     AlphabetPtr alpha = AlphabetPtr(new Alphabet());;
00071     SequenceEntryList sample_set;
00072     readSequencesFromFile(sample_set, alpha, training_set_parameter->getString());
00073     for(int i = 0; i < (int)sample_set.size();i++) {
00074       int rep = 1;
00075       if(weights.find(sample_set[i]->getName()) != weights.end())
00076         rep = (weights.find(sample_set[i]->getName()))->second;
00077 
00078       for(int j = 0; j < (int) (sample_set[i]->getSequence()).size(); j++) {
00079         for(int k = 0;k< rep; k++)
00080             data.push_back((sample_set[i]->getSequence())[j]);
00081       }
00082     }
00083     std::map<long,double> sum;
00084     double total = 0.0;
00085     std::map<long,double> d;
00086     std::map<long,int> counter;
00087     DoubleVector prob;
00088 
00089 
00090     vector<double> pi;
00091     pi.resize(L);
00092 
00093     if(data.size() > 0)
00094       {
00095           for(int i = 0; i < (int)data.size(); i++){
00096               if(counter.find((long)data[i]) == counter.end())
00097                   counter[(long)data[i]] = 1.0;
00098               else
00099                   counter[(long)data[i]] += 1.0;
00100           }
00101 
00102 
00103 
00104         double count_left = 0;
00105         double count_right = 0;
00106 
00107         for(int pos = 0; (pos < L) && (pos < max) ; pos +=1)
00108             {
00109               int bwd = (int) (.01+ (a / pow(L, 1.0/5.0) ) * (double)pos);
00110               if(bwd <= 0)
00111                 bwd = 1;
00112               for(int j = pos - bwd + 1;  (j <= pos + bwd -1)  ; j++)
00113                 {
00114                   if (! (j >= 0 && j < L))
00115                     continue;
00116                   if(j <= pos)
00117                     count_left += (counter[j]) ? 1: 0;
00118                   if(j >= pos)
00119                     count_right += (counter[j])? 1: 0;
00120                 }
00121 
00122               while (count_left < m && count_right < m && bwd < L)
00123                 {
00124                   bwd ++;
00125                   if(pos + bwd -1 < L)
00126                     count_left += counter[pos + bwd - 1] ? 1:0;
00127                   if(pos - bwd + 1 >= 0)
00128                     count_right += counter[pos + bwd - 1] ? 1:0;
00129                 }
00130               if(pos < L)
00131                 pi[pos] += kernel_normal((double)0, (double)bwd) * counter[pos];
00132               bool negligible = false;
00133               int j=1;
00134               while (!negligible && (pos-j>=0 || pos+j<L)){
00135                 double  wj = kernel_normal(j, bwd) * (counter[pos] );
00136                 if (pos-j>=0 && pos-j<(int)pi.size() ) {
00137                   pi[pos-j] += wj;
00138                 }
00139                 if (pos+j<(int)pi.size() && pos+j>=0) {
00140                   pi[pos+j] += wj;
00141                 }
00142                 negligible = (wj < 1e-20);
00143                 j++;
00144               }
00145             }
00146 #if 1
00147         double total = 0;
00148         for (long k = 0; k < (int)pi.size(); k++){
00149             total += pi[k];
00150         }
00151         prob.resize(L);
00152         for (long k = 0; k < (int)pi.size(); k++){
00153             prob[k] =  pi[k]/(total) ;
00154         }
00155 #endif
00156       }
00157     ProbabilisticModelParameters pars;
00158     pars.add("probabilities", ProbabilisticModelParameterValuePtr (new DoubleVectorParameterValue(prob)));
00159     pars.add("alphabet", alpha->getParameterValue());
00160     DiscreteIIDModelPtr result =
00161       DiscreteIIDModelPtr(new DiscreteIIDModel());
00162     result->initialize(pars);
00163     return result;
00164   }
00165 
00166 }