ToPS
|
00001 /* 00002 * SmoothedHistogramBurge.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "SmoothedHistogramBurge.hpp" 00026 #include "DiscreteIIDModel.hpp" 00027 #include "util.hpp" 00028 namespace tops { 00029 00030 00031 ProbabilisticModelPtr SmoothedHistogramBurge::create( ProbabilisticModelParameters & parameters) const 00032 { 00033 00034 00035 ProbabilisticModelParameterValuePtr training_set_parameter = 00036 parameters.getMandatoryParameterValue("training_set"); 00037 ProbabilisticModelParameterValuePtr cpar = 00038 parameters.getMandatoryParameterValue("C"); 00039 ProbabilisticModelParameterValuePtr maxlengthp = 00040 parameters.getOptionalParameterValue("max_length"); 00041 long max = 15000; 00042 if(maxlengthp != NULL) 00043 max = maxlengthp->getInt(); 00044 00045 if((training_set_parameter == NULL)||(cpar == NULL)) { 00046 std::cerr << help () << std::endl; 00047 ProbabilisticModelPtr nullmodel; 00048 exit(-1); 00049 return nullmodel; 00050 } 00051 double C = cpar->getDouble(); 00052 DoubleVector data; 00053 AlphabetPtr alpha = AlphabetPtr(new Alphabet());; 00054 SequenceList sample_set; 00055 readSequencesFromFile(sample_set, alpha, training_set_parameter->getString()); 00056 00057 for(int i = 0; i < (int)sample_set.size();i++) 00058 for(int j = 0; j < (int) sample_set[i].size(); j++) 00059 data.push_back(sample_set[i][j]); 00060 00061 typedef std::map<long,double> Lengths; 00062 int N = data.size(); 00063 Lengths counter; 00064 Lengths sum; 00065 Lengths::const_iterator iter; 00066 for(int i = 0; i < N; i++){ 00067 iter = counter.find((long)data[i]); 00068 if(iter == counter.end()) 00069 counter[(long)data[i]] = 1.0; 00070 else 00071 counter[(long)data[i]] += 1.0; 00072 } 00073 double total = 0.0; 00074 for (int k = 1; k <= max; k++){ 00075 int start = k - 10; 00076 int end = k + 10; 00077 if(start < 0) start = 0; 00078 sum[k] = 0.0; 00079 for(int x = start; x < end; x++){ 00080 iter = counter.find((long)x); 00081 if(iter != counter.end() && iter->second > 0.0){ 00082 double nx = iter->second; 00083 double mean = x+1.0; 00084 double sd = sqrt(2*((double)(x+1.0))*C/nx); 00085 double px2 = 0.5*(1 + _erf((((double)k+1.5) - mean))/ (sd*sqrt(2.0))); 00086 double px1 = 0.5*(1 + _erf((((double)k+0.5) - mean))/ (sd*sqrt(2.0))); 00087 assert(nx > 0.0); 00088 assert(mean > 0.0); 00089 assert(sd > 0.0); 00090 sum[k] += nx*(px2 - px1); 00091 } 00092 } 00093 sum[k] = sum[k]/N; 00094 total = total+ sum[k]; 00095 } 00096 double epsilon = 1e-5; 00097 total = total/(1 - max*epsilon); 00098 00099 DoubleVector prob; 00100 prob.resize(max+1); 00101 for (int k = 1; k <= max; k++){ 00102 prob[k] = epsilon + sum[k]/total; 00103 } 00104 00105 ProbabilisticModelParameters pars; 00106 pars.add("probabilities", ProbabilisticModelParameterValuePtr (new DoubleVectorParameterValue(prob))); 00107 pars.add("alphabet", alpha->getParameterValue()); 00108 DiscreteIIDModelPtr result = 00109 DiscreteIIDModelPtr(new DiscreteIIDModel()); 00110 result->initialize(pars); 00111 return result; 00112 } 00113 00114 }