ToPS: util.cpp Source File

00001 /*
00002  *       util.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "util.hpp"
00026 namespace tops {
00027 
00028 
00029   /* Epanechnikov kernel */
00030   double epanechnikov(double x, double h){
00031     double a = h * sqrt(5.0);
00032     double absx = fabs(x);
00033     if(absx < a ) {
00034       return (3.0/4.0) * (1.0 - (absx/a)*(absx/a))/a;
00035     } else {
00036       return 0.0;
00037     }
00038   }
00039 
00040   /* normal kernel */
00041   double kernel_normal(double x, double h){
00042       double y = (x/h) * (x/h) ;
00043       double f = 1.0/(sqrt(2*M_PI));
00044       double v =  (f/h) * exp (- y/2);
00045       return v;
00046   }
00047 
00048 
00049 // code from R-1.7.0/src/appl/bandwidths.c
00050 #define abs9(a) (a > 0 ? a:-a)
00051   void band_den_bin(int n, int nb, double *d, const DoubleVector &x,  DoubleVector &cnt)
00052   {
00053     int   i, j,  nn = n;
00054     int ii, jj, iij;
00055     double xmin, xmax, rang, dd;
00056     for (i = 0; i < nb; i++)
00057       cnt.push_back(0);
00058     xmin = xmax = x[0];
00059     for (i = 1; i < nn; i++) {
00060       xmin = _min(xmin, x[i]);
00061       xmax = _min(xmax, x[i]);
00062     }
00063     rang = (xmax - xmin) * 1.01;
00064     *d = dd = rang / (nb);
00065     for (i = 1; i < nn; i++) {
00066       ii = (int)(x[i] / dd);
00067       for (j = 0; j < i; j++) {
00068         jj = (int)(x[j] / dd);
00069         iij = abs9((ii - jj));
00070         cnt[iij]++;
00071       }
00072     }
00073   }
00074   void band_phi6_bin(int n, int nb, double d, DoubleVector &x, double h, double *u)
00075   {
00076     int   i, nn = n, nbin = nb;
00077     double delta, sum, term;
00078     sum = 0.0;
00079     for (i = 0; i < nbin; i++) {
00080       delta = i * (d) / (h);
00081       delta *= delta;
00082       if (delta >= 1000) break;
00083       term = exp(-delta / 2) *
00084         (delta * delta * delta - 15 * delta * delta + 45 * delta - 15);
00085       sum += term * x[i];
00086     }
00087     sum = 2 * sum - 15 * nn;    /* add in diagonal */
00088     *u = sum / (nn * (nn - 1) * pow(h, 7.0) * sqrt(2 * M_PI));
00089   }
00090   void
00091   band_phi4_bin(int n, int nb, double d, DoubleVector x, double h, double *u)
00092   {
00093     int   i, nn = n, nbin = nb;
00094     double delta, sum, term;
00095 
00096     sum = 0.0;
00097     for (i = 0; i < nbin; i++) {
00098       delta = i * (d) / (h);
00099       delta *= delta;
00100       if (delta >= 1000) break;
00101       term = exp(-delta / 2) * (delta * delta - 6 * delta + 3);
00102       sum += term * x[i];
00103     }
00104     sum = 2 * sum + nn * 3;     /* add in diagonal */
00105     *u = sum / (nn * (nn - 1) * pow(h, 5.0) * sqrt(2 * M_PI));
00106   }
00107 
00108 
00109   double mean(const DoubleVector &data){
00110     double sum = 0.0;
00111     for(unsigned int i = 0; i < data.size(); i++){
00112       sum += data[i];
00113     }
00114     return sum/(double)data.size();
00115   }
00116 
00117   double var(const DoubleVector &data){
00118     double data_mean = mean(data);
00119     double sum = 0.0;
00120     for(unsigned int i = 0; i < data.size(); i++){
00121       sum += (data[i] - data_mean)*(data[i] - data_mean);
00122     }
00123     return sum/( (double) data.size() -1.0);
00124   }
00125 
00126   /* quantile */
00127   double quantile (DoubleVector data, double q){
00128     int low_index = (int)floor(q * ((double)data.size()-1));
00129     int high_index = (int)ceil(q * ((double)data.size()-1));
00130     double h =  q * ((double)data.size() -1) - (double)low_index;
00131     sort(data.begin(), data.end());
00132     return (1-h)*data[low_index] + h * data[high_index];
00133   }
00134 
00135   /* interquantile */
00136   double iqr (const DoubleVector &data){
00137     double q1=  quantile(data, 0.25);
00138     double q2 = quantile(data, 0.75);
00139     return q2 - q1;
00140   }
00141 
00142 
00143 
00144   void readSequencesFromFile(SequenceEntryList & s,
00145                              AlphabetPtr alphabet,
00146                              std::string  file_name)
00147   {
00148     std::ifstream input(file_name.c_str());
00149     if(!input.good())
00150       {
00151         std::cerr << "Can not open file " << file_name << std::endl;
00152         exit(-1);
00153       }
00154     while(!input.eof())
00155       {
00156         SequenceEntryPtr  inseq = SequenceEntryPtr(new SequenceEntry(alphabet));
00157         input >> *inseq;
00158         if((inseq->getSequence()).size() > 0)
00159           s.push_back(inseq);
00160       }
00161     input.close();
00162   }
00163 
00164   void readSequencesFromFile(SequenceList & s,
00165                              AlphabetPtr alphabet,
00166                              std::string  file_name)
00167   {
00168     std::ifstream input(file_name.c_str());
00169     if(!input.good())
00170       {
00171         std::cerr << "Can not open file " << file_name << std::endl;
00172         exit(-1);
00173       }
00174     while(!input.eof())
00175       {
00176         SequenceEntry  inseq(alphabet);
00177         input >> inseq;
00178         if((inseq.getSequence()).size() > 0)
00179           s.push_back(inseq.getSequence());
00180       }
00181     input.close();
00182   }
00183 
00184   void readMapFromFile(std::map<std::string, double> & s,
00185                        std::string  file_name)
00186   {
00187     std::ifstream input(file_name.c_str());
00188     if(!input.good())
00189       {
00190         std::cerr << "Can not open file " << file_name << std::endl;
00191         exit(-1);
00192       }
00193     std::string line;
00194     while(!input.eof())
00195       {
00196         std::getline(input, line, '\n');
00197         std::vector <std::string> x;
00198         boost::regex separator("\t");
00199         split_regex(line, x, separator);
00200         if(x.size() >= 2) {
00201           std::string key = x[0];
00202           int value = atof((x[1]).c_str());
00203           s[key] = value;
00204         }
00205       }
00206     input.close();
00207   }
00208 
00209 
00210 
00211   int mod(int D, int d) {
00212     int r = D%d;
00213     if (r < 0) {
00214       if (d > 0) r = r + d;
00215       else r = r - d;
00216     }
00217     return r;
00218   }
00219 
00220   void trim_spaces (std::string & s) {
00221     int k;
00222     for(k = s.size() - 1; k >= 0; k--)
00223       {
00224         if(!isspace(s[k]))
00225           break;
00226       }
00227     s = s.substr(0, k+1);
00228     for(k = 0; k < (int)(s.size()); k++)
00229       {
00230         if(!isspace(s[k]))
00231           break;
00232       }
00233     s = s.substr(k, (s.size()) - k);
00234   }
00235   void split_regex (const std::string & s, std::vector <std::string> & result, const boost::regex & re)
00236   {
00237     boost::sregex_token_iterator i(s.begin(), s.end(), re, -1);
00238     boost::sregex_token_iterator j;
00239     while(i != j)
00240       result.push_back(*i++);
00241   }
00242 
00243   double lookup (double x){
00244     assert (x >= 0.00f);
00245     assert (x <= 7.5);
00246     //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f;
00247     if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f;
00248     if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f;
00249     if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f;
00250     assert (x <= 7.5);
00251     return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f;
00252     
00253     //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f;
00254   }
00255 
00256   double log_sum (double x, double y){
00257     if (x < y) return (x <= -2e20 || y - x >= 7.5) ? y : lookup(y-x) + x;
00258     return (y <= -2e20 || x - y >= 7.5) ? x : lookup(x-y) + y;
00259   }
00260 
00261 
00262   /*  double log_sum( double log_a, double log_b)
00263   {
00264     double min = log_a;
00265     double diff;
00266     if(min > log_b)
00267       {
00268         diff = log_b - log_a;
00269         if(diff != diff)
00270           return log_a;
00271         return log_a + lookup(diff);
00272         //return log_a + log(1 + exp(diff));
00273       }
00274     else
00275       {
00276         diff = log_a - log_b;
00277         if(diff != diff)
00278           return log_b;
00279         return log_b + lookup(diff);
00280           //return log_b + log(1 + exp(diff));
00281       }
00282       }*/
00283 
00284 
00285   double safe_division(double a, double b)
00286   {
00287     if((b < 1) && (a > b * (std::numeric_limits<double>::max)()))
00288       {
00289         return (std::numeric_limits<double>::max)();
00290       }
00291     else if (((b > 1) && (a < b*(std::numeric_limits<double>::min)()) )|| (a == 0))
00292       {
00293         return 0;
00294       }
00295     else
00296       {
00297         return a/b;
00298       }
00299 
00300   }
00301 
00302 
00303   bool close(double a, double b, double tolerance)
00304   {
00305     double diff = fabs(a-b);
00306 
00307     double div1 = safe_division(diff, fabs(a));
00308     double div2 = safe_division(diff, fabs(b));
00309     if( (div1 <= tolerance) && (div2 <= tolerance))
00310       {
00311         return true;
00312       }
00313     return false;
00314   }
00315 
00316 
00317 
00318 
00319   double kernel_density_estimation(double x, double bw, const DoubleVector &data){
00320     double result = 0.0;
00321     for(unsigned int i = 0; i < data.size(); i++) {
00322       double x_xi = (double)(x) - (double)data[i];
00323       result += epanechnikov(x_xi, bw);
00324     }
00325     result /= data.size();
00326     return result;
00327   }
00328 
00329 
00330   double kernel_density_estimation_gaussian(double x, double bw, const DoubleVector &data){
00331     double result = 0.0;
00332     for(unsigned int i = 0; i < data.size(); i++) {
00333       double x_xi = (double)(x) - (double)data[i];
00334       double y = (x_xi / bw);
00335       result += 1/sqrt(2*M_PI) * exp( -y*y/2) ;
00336     }
00337     result /= data.size()*bw;
00338     return result;
00339   }
00340 
00341   double sj_bandwidth(const DoubleVector &data){
00342     double n = (double) data.size();
00343     int nb = 1000;
00344     double d = 1.0;
00345     double variance =  var(data);
00346   DoubleVector count;
00347   band_den_bin ((int)n, nb, &d, data, count);
00348   double scale = _min(std::sqrt(variance), iqr(data)/1.349);
00349   double b = 1.23 * scale * pow(n, (-1.0/9.0));
00350   double c1 = 1.0 / (2.0*sqrt(M_PI) * n);
00351   double td;
00352   band_phi6_bin((int)n, (int)count.size(), d, count, b, &td);
00353   td = -td;
00354   if(td < 0 || td != td){
00355     //    cerr << "sj_bandwidth (WARNING). Dataset very sparse !!!\n" << endl;
00356     return 0.001;
00357   }
00358   double sdh ;
00359   band_phi4_bin((int)n, (int)count.size(), d, count, pow(2.394/(n*td), (1.0/7.0)), &sdh);
00360   return pow((c1/sdh), 1.0/5.0);
00361   }
00362 
00363 
00364   /*========*/
00365 
00366 
00367 }