ToPS
|
00001 /* 00002 * util.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "util.hpp" 00026 namespace tops { 00027 00028 00029 /* Epanechnikov kernel */ 00030 double epanechnikov(double x, double h){ 00031 double a = h * sqrt(5.0); 00032 double absx = fabs(x); 00033 if(absx < a ) { 00034 return (3.0/4.0) * (1.0 - (absx/a)*(absx/a))/a; 00035 } else { 00036 return 0.0; 00037 } 00038 } 00039 00040 /* normal kernel */ 00041 double kernel_normal(double x, double h){ 00042 double y = (x/h) * (x/h) ; 00043 double f = 1.0/(sqrt(2*M_PI)); 00044 double v = (f/h) * exp (- y/2); 00045 return v; 00046 } 00047 00048 00049 // code from R-1.7.0/src/appl/bandwidths.c 00050 #define abs9(a) (a > 0 ? a:-a) 00051 void band_den_bin(int n, int nb, double *d, const DoubleVector &x, DoubleVector &cnt) 00052 { 00053 int i, j, nn = n; 00054 int ii, jj, iij; 00055 double xmin, xmax, rang, dd; 00056 for (i = 0; i < nb; i++) 00057 cnt.push_back(0); 00058 xmin = xmax = x[0]; 00059 for (i = 1; i < nn; i++) { 00060 xmin = _min(xmin, x[i]); 00061 xmax = _min(xmax, x[i]); 00062 } 00063 rang = (xmax - xmin) * 1.01; 00064 *d = dd = rang / (nb); 00065 for (i = 1; i < nn; i++) { 00066 ii = (int)(x[i] / dd); 00067 for (j = 0; j < i; j++) { 00068 jj = (int)(x[j] / dd); 00069 iij = abs9((ii - jj)); 00070 cnt[iij]++; 00071 } 00072 } 00073 } 00074 void band_phi6_bin(int n, int nb, double d, DoubleVector &x, double h, double *u) 00075 { 00076 int i, nn = n, nbin = nb; 00077 double delta, sum, term; 00078 sum = 0.0; 00079 for (i = 0; i < nbin; i++) { 00080 delta = i * (d) / (h); 00081 delta *= delta; 00082 if (delta >= 1000) break; 00083 term = exp(-delta / 2) * 00084 (delta * delta * delta - 15 * delta * delta + 45 * delta - 15); 00085 sum += term * x[i]; 00086 } 00087 sum = 2 * sum - 15 * nn; /* add in diagonal */ 00088 *u = sum / (nn * (nn - 1) * pow(h, 7.0) * sqrt(2 * M_PI)); 00089 } 00090 void 00091 band_phi4_bin(int n, int nb, double d, DoubleVector x, double h, double *u) 00092 { 00093 int i, nn = n, nbin = nb; 00094 double delta, sum, term; 00095 00096 sum = 0.0; 00097 for (i = 0; i < nbin; i++) { 00098 delta = i * (d) / (h); 00099 delta *= delta; 00100 if (delta >= 1000) break; 00101 term = exp(-delta / 2) * (delta * delta - 6 * delta + 3); 00102 sum += term * x[i]; 00103 } 00104 sum = 2 * sum + nn * 3; /* add in diagonal */ 00105 *u = sum / (nn * (nn - 1) * pow(h, 5.0) * sqrt(2 * M_PI)); 00106 } 00107 00108 00109 double mean(const DoubleVector &data){ 00110 double sum = 0.0; 00111 for(unsigned int i = 0; i < data.size(); i++){ 00112 sum += data[i]; 00113 } 00114 return sum/(double)data.size(); 00115 } 00116 00117 double var(const DoubleVector &data){ 00118 double data_mean = mean(data); 00119 double sum = 0.0; 00120 for(unsigned int i = 0; i < data.size(); i++){ 00121 sum += (data[i] - data_mean)*(data[i] - data_mean); 00122 } 00123 return sum/( (double) data.size() -1.0); 00124 } 00125 00126 /* quantile */ 00127 double quantile (DoubleVector data, double q){ 00128 int low_index = (int)floor(q * ((double)data.size()-1)); 00129 int high_index = (int)ceil(q * ((double)data.size()-1)); 00130 double h = q * ((double)data.size() -1) - (double)low_index; 00131 sort(data.begin(), data.end()); 00132 return (1-h)*data[low_index] + h * data[high_index]; 00133 } 00134 00135 /* interquantile */ 00136 double iqr (const DoubleVector &data){ 00137 double q1= quantile(data, 0.25); 00138 double q2 = quantile(data, 0.75); 00139 return q2 - q1; 00140 } 00141 00142 00143 00144 void readSequencesFromFile(SequenceEntryList & s, 00145 AlphabetPtr alphabet, 00146 std::string file_name) 00147 { 00148 std::ifstream input(file_name.c_str()); 00149 if(!input.good()) 00150 { 00151 std::cerr << "Can not open file " << file_name << std::endl; 00152 exit(-1); 00153 } 00154 while(!input.eof()) 00155 { 00156 SequenceEntryPtr inseq = SequenceEntryPtr(new SequenceEntry(alphabet)); 00157 input >> *inseq; 00158 if((inseq->getSequence()).size() > 0) 00159 s.push_back(inseq); 00160 } 00161 input.close(); 00162 } 00163 00164 void readSequencesFromFile(SequenceList & s, 00165 AlphabetPtr alphabet, 00166 std::string file_name) 00167 { 00168 std::ifstream input(file_name.c_str()); 00169 if(!input.good()) 00170 { 00171 std::cerr << "Can not open file " << file_name << std::endl; 00172 exit(-1); 00173 } 00174 while(!input.eof()) 00175 { 00176 SequenceEntry inseq(alphabet); 00177 input >> inseq; 00178 if((inseq.getSequence()).size() > 0) 00179 s.push_back(inseq.getSequence()); 00180 } 00181 input.close(); 00182 } 00183 00184 void readMapFromFile(std::map<std::string, double> & s, 00185 std::string file_name) 00186 { 00187 std::ifstream input(file_name.c_str()); 00188 if(!input.good()) 00189 { 00190 std::cerr << "Can not open file " << file_name << std::endl; 00191 exit(-1); 00192 } 00193 std::string line; 00194 while(!input.eof()) 00195 { 00196 std::getline(input, line, '\n'); 00197 std::vector <std::string> x; 00198 boost::regex separator("\t"); 00199 split_regex(line, x, separator); 00200 if(x.size() >= 2) { 00201 std::string key = x[0]; 00202 int value = atof((x[1]).c_str()); 00203 s[key] = value; 00204 } 00205 } 00206 input.close(); 00207 } 00208 00209 00210 00211 int mod(int D, int d) { 00212 int r = D%d; 00213 if (r < 0) { 00214 if (d > 0) r = r + d; 00215 else r = r - d; 00216 } 00217 return r; 00218 } 00219 00220 void trim_spaces (std::string & s) { 00221 int k; 00222 for(k = s.size() - 1; k >= 0; k--) 00223 { 00224 if(!isspace(s[k])) 00225 break; 00226 } 00227 s = s.substr(0, k+1); 00228 for(k = 0; k < (int)(s.size()); k++) 00229 { 00230 if(!isspace(s[k])) 00231 break; 00232 } 00233 s = s.substr(k, (s.size()) - k); 00234 } 00235 void split_regex (const std::string & s, std::vector <std::string> & result, const boost::regex & re) 00236 { 00237 boost::sregex_token_iterator i(s.begin(), s.end(), re, -1); 00238 boost::sregex_token_iterator j; 00239 while(i != j) 00240 result.push_back(*i++); 00241 } 00242 00243 double lookup (double x){ 00244 assert (x >= 0.00f); 00245 assert (x <= 7.5); 00246 //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f; 00247 if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f; 00248 if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f; 00249 if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f; 00250 assert (x <= 7.5); 00251 return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f; 00252 00253 //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f; 00254 } 00255 00256 double log_sum (double x, double y){ 00257 if (x < y) return (x <= -2e20 || y - x >= 7.5) ? y : lookup(y-x) + x; 00258 return (y <= -2e20 || x - y >= 7.5) ? x : lookup(x-y) + y; 00259 } 00260 00261 00262 /* double log_sum( double log_a, double log_b) 00263 { 00264 double min = log_a; 00265 double diff; 00266 if(min > log_b) 00267 { 00268 diff = log_b - log_a; 00269 if(diff != diff) 00270 return log_a; 00271 return log_a + lookup(diff); 00272 //return log_a + log(1 + exp(diff)); 00273 } 00274 else 00275 { 00276 diff = log_a - log_b; 00277 if(diff != diff) 00278 return log_b; 00279 return log_b + lookup(diff); 00280 //return log_b + log(1 + exp(diff)); 00281 } 00282 }*/ 00283 00284 00285 double safe_division(double a, double b) 00286 { 00287 if((b < 1) && (a > b * (std::numeric_limits<double>::max)())) 00288 { 00289 return (std::numeric_limits<double>::max)(); 00290 } 00291 else if (((b > 1) && (a < b*(std::numeric_limits<double>::min)()) )|| (a == 0)) 00292 { 00293 return 0; 00294 } 00295 else 00296 { 00297 return a/b; 00298 } 00299 00300 } 00301 00302 00303 bool close(double a, double b, double tolerance) 00304 { 00305 double diff = fabs(a-b); 00306 00307 double div1 = safe_division(diff, fabs(a)); 00308 double div2 = safe_division(diff, fabs(b)); 00309 if( (div1 <= tolerance) && (div2 <= tolerance)) 00310 { 00311 return true; 00312 } 00313 return false; 00314 } 00315 00316 00317 00318 00319 double kernel_density_estimation(double x, double bw, const DoubleVector &data){ 00320 double result = 0.0; 00321 for(unsigned int i = 0; i < data.size(); i++) { 00322 double x_xi = (double)(x) - (double)data[i]; 00323 result += epanechnikov(x_xi, bw); 00324 } 00325 result /= data.size(); 00326 return result; 00327 } 00328 00329 00330 double kernel_density_estimation_gaussian(double x, double bw, const DoubleVector &data){ 00331 double result = 0.0; 00332 for(unsigned int i = 0; i < data.size(); i++) { 00333 double x_xi = (double)(x) - (double)data[i]; 00334 double y = (x_xi / bw); 00335 result += 1/sqrt(2*M_PI) * exp( -y*y/2) ; 00336 } 00337 result /= data.size()*bw; 00338 return result; 00339 } 00340 00341 double sj_bandwidth(const DoubleVector &data){ 00342 double n = (double) data.size(); 00343 int nb = 1000; 00344 double d = 1.0; 00345 double variance = var(data); 00346 DoubleVector count; 00347 band_den_bin ((int)n, nb, &d, data, count); 00348 double scale = _min(std::sqrt(variance), iqr(data)/1.349); 00349 double b = 1.23 * scale * pow(n, (-1.0/9.0)); 00350 double c1 = 1.0 / (2.0*sqrt(M_PI) * n); 00351 double td; 00352 band_phi6_bin((int)n, (int)count.size(), d, count, b, &td); 00353 td = -td; 00354 if(td < 0 || td != td){ 00355 // cerr << "sj_bandwidth (WARNING). Dataset very sparse !!!\n" << endl; 00356 return 0.001; 00357 } 00358 double sdh ; 00359 band_phi4_bin((int)n, (int)count.size(), d, count, pow(2.394/(n*td), (1.0/7.0)), &sdh); 00360 return pow((c1/sdh), 1.0/5.0); 00361 } 00362 00363 00364 /*========*/ 00365 00366 00367 }