ToPS: TrainSimilarityBasedSequenceWeighting.cpp Source File

00001 /*
00002  *       TrainSimilarityBasedSequenceWeighting.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ProbabilisticModel.hpp"
00026 #include "SimilarityBasedSequenceWeighting.hpp"
00027 #include "TrainSimilarityBasedSequenceWeighting.hpp"
00028 #include "ConfigurationReader.hpp"
00029 #include "ContextTree.hpp"
00030 #include "VariableLengthMarkovChain.hpp"
00031 #include "Symbol.hpp"
00032 #include "SequenceFactory.hpp"
00033 #include "util.hpp"
00034 namespace tops {
00035 
00036   double test (ProbabilisticModelPtr m, std::string q, int max)
00037   {
00038     if(q.size() >= max)
00039       {
00040         SequenceFactory f(m->alphabet());
00041         std::stringstream s;
00042         for (int i = 0; i < q.size(); i++)
00043           {
00044             if(i == 0)
00045               s << q[i];
00046             else
00047               s << " " << q[i];
00048           }
00049         Sequence x = f.createSequence(s.str());
00050         double p = exp(m->evaluate(x, 0, x.size()-1));
00051         std::cerr << q << " " << p  << std::endl;
00052         return p;
00053       }
00054     else
00055       {
00056         double sum = 0.0;
00057         for(int i = 0 ; i < m->alphabet()->size(); i++)
00058           {
00059             std::stringstream s;
00060             s <<q<< m->alphabet()->getSymbol(i)->name() ;
00061             sum += test(m, s.str(), max);
00062           }
00063         return sum;
00064       }
00065   }
00066 
00067   double calculate_normalizer (int skip_length,int skip_offset, int max_length, std::map<std::string, double > & counter, int alphabet_size)
00068   {
00069     int npatterns_differ_1 = 0;
00070     npatterns_differ_1 = (alphabet_size - 1) * (max_length - skip_length);
00071     if(skip_length < 0)
00072       npatterns_differ_1 = (alphabet_size - 1) * (max_length );
00073     double sum = 0.0;
00074     std::map<std::string, double>::const_iterator it;
00075     std::map<std::string, double>::const_iterator it2;
00076     for(it = counter.begin(); it != counter.end(); it++)
00077       {
00078         sum += it->second;
00079 
00080         int diff = 0;
00081         int np_differ_1  = 0;
00082 
00083         for(it2 = counter.begin(); it2 != counter.end(); it2++)
00084           {
00085             std::string a = it->first;
00086             std::string b = it2->first;
00087             for(int i = 0; i < max_length; i++)
00088               if((i >= skip_offset) && (i <= skip_offset+skip_length)){
00089                 if(a[i] != b[i])
00090                   diff+=2;
00091               }else if(a[i] != b[i])
00092                 diff++;
00093 
00094             if(diff == 1) {
00095               //sum += it2->second * 0.001;
00096               np_differ_1 ++;
00097             }
00098           }
00099         sum += 0.001*it->second*(npatterns_differ_1 - np_differ_1);
00100       }
00101     return sum;
00102   }
00103 
00104 
00105 
00106 
00107 
00108 
00109 ProbabilisticModelPtr TrainSimilarityBasedSequenceWeighting::create(
00110                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00111                 int & sample_size) const {
00112         ProbabilisticModelParameterValuePtr trainpar =
00113                         parameters.getMandatoryParameterValue("training_set");
00114         ProbabilisticModelParameterValuePtr alphapar =
00115                         parameters.getMandatoryParameterValue("alphabet");
00116 
00117         ProbabilisticModelParameterValuePtr offsetpar = parameters.getOptionalParameterValue("skip_offset");
00118         ProbabilisticModelParameterValuePtr skiplengthpar = parameters.getOptionalParameterValue("skip_length");
00119         ProbabilisticModelParameterValuePtr skipseqpar = parameters.getOptionalParameterValue("skip_sequence");
00120 
00121         if ((trainpar == NULL) || (alphapar == NULL) ) {
00122           std::cerr << help() << std::endl;
00123           exit(-1);
00124         }
00125         int skip_length = -1;
00126         int skip_offset  = -1;
00127         std::string skip_seq;
00128         if((offsetpar != NULL) && (skiplengthpar != NULL))
00129           {
00130             skip_offset = offsetpar->getInt();
00131             skip_length = skiplengthpar->getInt();
00132             skip_seq = skiplengthpar -> getString();
00133 
00134           }
00135 
00136         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00137         alphabet ->initializeFromVector(alphapar->getStringVector());
00138         SequenceEntryList sample_set;
00139         readSequencesFromFile(sample_set, alphabet, trainpar->getString());
00140         std::map<std::string, double> counter;
00141         int min_length = 999999999;
00142         for(int i = 0; i < (int) sample_set.size(); i++)
00143           {
00144             std::stringstream o;
00145             for(int j = 0; j < (int)(sample_set[i]->getSequence()).size();j++)
00146               {
00147                 o << alphabet->getSymbol((sample_set[i]->getSequence())[j])->name();
00148               }
00149             if(counter.find(o.str()) == counter.end())
00150               {
00151                 counter[o.str()] = 1;
00152               }
00153             else
00154               {
00155                 counter[o.str()] += 1;
00156               }
00157             if((int)o.str().size() < min_length)
00158               min_length = o.str().size();
00159           }
00160         std::string q;
00161         double normalize = calculate_normalizer(skip_length,skip_offset,  min_length, counter, alphabet->size());
00162 
00163         ProbabilisticModelParameters pars;
00164         pars.add("alphabet", alphapar);
00165         pars.add("counter", DoubleMapParameterValuePtr(new DoubleMapParameterValue(counter)));
00166         pars.add("normalizer", DoubleParameterValuePtr(new DoubleParameterValue(normalize)));
00167         if(skiplengthpar != NULL && offsetpar != NULL) {
00168           pars.add("skip_length", skiplengthpar);
00169           pars.add("skip_offset", offsetpar);
00170           pars.add("skip_sequence", skipseqpar);
00171         }
00172         ProbabilisticModelPtr m = SimilarityBasedSequenceWeightingPtr(new SimilarityBasedSequenceWeighting());
00173         m->initialize(pars);
00174 
00175         //      std::string x;
00176         //      std::cerr << "SUM: " << test(m, x, min_length)<< std::endl;
00177 
00178         loglikelihood = 0.0;
00179         sample_size = 0;
00180         for (int i = 0; i < (int) sample_set.size(); i++) {
00181           loglikelihood
00182             += m->evaluate((sample_set[i]->getSequence()), 0, (sample_set[i]->getSequence()).size() - 1);
00183           sample_size += (sample_set[i]->getSequence()).size();
00184         }
00185         return m;
00186 }
00187   ProbabilisticModelPtr TrainSimilarityBasedSequenceWeighting::create(
00188                                                                       ProbabilisticModelParameters & parameters) const {
00189     double loglike;
00190     int samplesize;
00191     return create(parameters, loglike, samplesize);
00192 
00193   }
00194 
00195 }
00196 ;