ToPS
|
00001 /* 00002 * SmoothedHistogramStanke.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "SmoothedHistogramStanke.hpp" 00026 #include "DiscreteIIDModel.hpp" 00027 #include "util.hpp" 00028 namespace tops { 00029 00030 00031 ProbabilisticModelPtr SmoothedHistogramStanke::create( ProbabilisticModelParameters & parameters) const 00032 { 00033 ProbabilisticModelParameterValuePtr training_set_parameter = 00034 parameters.getMandatoryParameterValue("training_set"); 00035 ProbabilisticModelParameterValuePtr maxlengthp = 00036 parameters.getOptionalParameterValue("max_length"); 00037 ProbabilisticModelParameterValuePtr mp = 00038 parameters.getOptionalParameterValue("m"); 00039 ProbabilisticModelParameterValuePtr slopep = 00040 parameters.getOptionalParameterValue("slope"); 00041 00042 ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights"); 00043 std::map <std::string, double> weights; 00044 if(weightspar != NULL) { 00045 readMapFromFile(weights, weightspar->getString()); 00046 } 00047 00048 double a = 0.5; 00049 int m = 8; 00050 00051 if(mp != NULL) 00052 m = mp->getInt(); 00053 00054 if(slopep != NULL) 00055 a = slopep->getDouble(); 00056 00057 long max = 15000; 00058 if(maxlengthp != NULL) 00059 max = maxlengthp->getInt(); 00060 int L = max; 00061 max = max + 4 * a * max; 00062 00063 if(training_set_parameter == NULL) { 00064 ProbabilisticModelPtr nullmodel; 00065 exit(-1); 00066 return nullmodel; 00067 } 00068 00069 DoubleVector data; 00070 AlphabetPtr alpha = AlphabetPtr(new Alphabet());; 00071 SequenceEntryList sample_set; 00072 readSequencesFromFile(sample_set, alpha, training_set_parameter->getString()); 00073 for(int i = 0; i < (int)sample_set.size();i++) { 00074 int rep = 1; 00075 if(weights.find(sample_set[i]->getName()) != weights.end()) 00076 rep = (weights.find(sample_set[i]->getName()))->second; 00077 00078 for(int j = 0; j < (int) (sample_set[i]->getSequence()).size(); j++) { 00079 for(int k = 0;k< rep; k++) 00080 data.push_back((sample_set[i]->getSequence())[j]); 00081 } 00082 } 00083 std::map<long,double> sum; 00084 double total = 0.0; 00085 std::map<long,double> d; 00086 std::map<long,int> counter; 00087 DoubleVector prob; 00088 00089 00090 vector<double> pi; 00091 pi.resize(L); 00092 00093 if(data.size() > 0) 00094 { 00095 for(int i = 0; i < (int)data.size(); i++){ 00096 if(counter.find((long)data[i]) == counter.end()) 00097 counter[(long)data[i]] = 1.0; 00098 else 00099 counter[(long)data[i]] += 1.0; 00100 } 00101 00102 00103 00104 double count_left = 0; 00105 double count_right = 0; 00106 00107 for(int pos = 0; (pos < L) && (pos < max) ; pos +=1) 00108 { 00109 int bwd = (int) (.01+ (a / pow(L, 1.0/5.0) ) * (double)pos); 00110 if(bwd <= 0) 00111 bwd = 1; 00112 for(int j = pos - bwd + 1; (j <= pos + bwd -1) ; j++) 00113 { 00114 if (! (j >= 0 && j < L)) 00115 continue; 00116 if(j <= pos) 00117 count_left += (counter[j]) ? 1: 0; 00118 if(j >= pos) 00119 count_right += (counter[j])? 1: 0; 00120 } 00121 00122 while (count_left < m && count_right < m && bwd < L) 00123 { 00124 bwd ++; 00125 if(pos + bwd -1 < L) 00126 count_left += counter[pos + bwd - 1] ? 1:0; 00127 if(pos - bwd + 1 >= 0) 00128 count_right += counter[pos + bwd - 1] ? 1:0; 00129 } 00130 if(pos < L) 00131 pi[pos] += kernel_normal((double)0, (double)bwd) * counter[pos]; 00132 bool negligible = false; 00133 int j=1; 00134 while (!negligible && (pos-j>=0 || pos+j<L)){ 00135 double wj = kernel_normal(j, bwd) * (counter[pos] ); 00136 if (pos-j>=0 && pos-j<(int)pi.size() ) { 00137 pi[pos-j] += wj; 00138 } 00139 if (pos+j<(int)pi.size() && pos+j>=0) { 00140 pi[pos+j] += wj; 00141 } 00142 negligible = (wj < 1e-20); 00143 j++; 00144 } 00145 } 00146 #if 1 00147 double total = 0; 00148 for (long k = 0; k < (int)pi.size(); k++){ 00149 total += pi[k]; 00150 } 00151 prob.resize(L); 00152 for (long k = 0; k < (int)pi.size(); k++){ 00153 prob[k] = pi[k]/(total) ; 00154 } 00155 #endif 00156 } 00157 ProbabilisticModelParameters pars; 00158 pars.add("probabilities", ProbabilisticModelParameterValuePtr (new DoubleVectorParameterValue(prob))); 00159 pars.add("alphabet", alpha->getParameterValue()); 00160 DiscreteIIDModelPtr result = 00161 DiscreteIIDModelPtr(new DiscreteIIDModel()); 00162 result->initialize(pars); 00163 return result; 00164 } 00165 00166 }