ToPS
TrainWeightArrayModel.cpp
00001 /*
00002  *       TrainWeightArrayModel.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ContextTree.hpp"
00026 #include "TrainWeightArrayModel.hpp"
00027 #include "InhomogeneousMarkovChain.hpp"
00028 #include "ProbabilisticModelParameter.hpp"
00029 #include "ProbabilisticModelCreatorClient.hpp"
00030 #include "SequenceFactory.hpp"
00031 namespace tops{
00032 
00033   ProbabilisticModelPtr TrainWeightArrayModel::create( ProbabilisticModelParameters & parameters, double & loglikelihood, int & sample_size) const
00034   {
00035     ProbabilisticModelParameterValuePtr alphabetpar = parameters.getMandatoryParameterValue("alphabet");
00036     ProbabilisticModelParameterValuePtr trainingsetpar = parameters.getMandatoryParameterValue("training_set");
00037     ProbabilisticModelParameterValuePtr orderpar = parameters.getMandatoryParameterValue("order");
00038     ProbabilisticModelParameterValuePtr lengthpar = parameters.getMandatoryParameterValue("length");
00039     ProbabilisticModelParameterValuePtr offsetpar = parameters.getOptionalParameterValue("offset");
00040     ProbabilisticModelParameterValuePtr vicinitypar = parameters.getOptionalParameterValue("vicinity_length");
00041     ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts");
00042     ProbabilisticModelParameterValuePtr fixseqpar = parameters.getOptionalParameterValue("fixed_sequence");
00043     ProbabilisticModelParameterValuePtr fixseqpospar = parameters.getOptionalParameterValue("fixed_sequence_position");
00044     ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori");
00045 
00046         ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights");
00047         std::map <std::string, double> weights;
00048         if(weightspar != NULL) {
00049           readMapFromFile(weights, weightspar->getString());
00050         }
00051 
00052 
00053     if((alphabetpar == NULL) ||
00054        (trainingsetpar == NULL) ||
00055        (lengthpar == NULL) ||
00056        (orderpar == NULL) )
00057       {
00058         std::cerr << help() << std::endl;
00059         exit(-1);
00060       }
00061     int offset = 0;
00062     double pseudocounts = 0;
00063     if(pseudocountspar != NULL)
00064         pseudocounts = pseudocountspar->getDouble();
00065     ProbabilisticModelPtr apriori;
00066     if(aprioripar != NULL)
00067       {
00068          ProbabilisticModelCreatorClient c;
00069          apriori = c.create(aprioripar->getString());
00070       }
00071 
00072     if(offsetpar != NULL)
00073       offset = offsetpar ->getInt();
00074     if(pseudocountspar != NULL)
00075       pseudocounts = pseudocountspar->getDouble();
00076 
00077     AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00078     alphabet->initializeFromVector(alphabetpar ->getStringVector());
00079     SequenceEntryList sample_set;
00080     readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString());
00081     int vicinity = 0;
00082     if( vicinitypar != NULL)
00083       vicinity = vicinitypar->getInt();
00084     int order = orderpar->getInt();
00085     int length = lengthpar->getInt();
00086     std::vector <ContextTreePtr> positional_distribution;
00087     positional_distribution.resize(length);
00088     sample_size = 0;
00089     bool fixseq = false;
00090     Sequence fixed;
00091     int fixed_pos = 0;
00092     if((fixseqpar != NULL) && (fixseqpospar != NULL)) {
00093       std::string seqstr = fixseqpar->getString();
00094       SequenceFactory factory(alphabet);
00095       fixed = factory.createSequence(seqstr);
00096       fixed_pos = fixseqpospar->getInt();
00097       fixseq = true;
00098     }
00099 
00100     for(int i = 0; i < length; i++)
00101       {
00102         SequenceEntryList positionalSample;
00103         int o = i;
00104         if(o > order) o = order;
00105         for(int j = 0; j < (int)sample_set.size(); j++)
00106           {
00107               for(int k = -vicinity; k <= vicinity; k++)
00108                   {
00109                       Sequence s;
00110                       s.resize(o+1);
00111                       int l = 0;
00112                       int m = i - o + k+ offset;
00113 
00114                       if(m < 0)
00115                           continue;
00116                       if((m + o) >= (int)(sample_set[j]->getSequence()).size())
00117                           continue;
00118 
00119 
00120                       if(fixseq) {
00121                           while( (m < (int) (sample_set[j]->getSequence()).size()) && (l <= o))
00122                               {
00123                                   s[l] = (sample_set[j]->getSequence())[m];
00124                                   if(fixseq && (fixed_pos <= (m-k)) && ( (m-k) <= fixed_pos + fixed.size()-1)){
00125                                       int p = m-fixed_pos-k;
00126                                       if((p >= 0) && (p < fixed.size()))
00127                                           s[l] = fixed[p];
00128                                   }
00129                                   l++; m++;
00130                               }
00131                       } else {
00132                           while( (m < (int) (sample_set[j]->getSequence()).size()) && (l <= o))
00133                               {
00134                                   s[l] = (sample_set[j]->getSequence())[m];
00135                                   l++; m++;
00136                               }
00137                       }
00138 
00139                       SequenceEntryPtr entry = SequenceEntryPtr(new SequenceEntry(alphabet));
00140                       entry->setSequence(s);
00141                       entry->setName(sample_set[j]->getName());
00142                       positionalSample.push_back(entry);
00143                   }
00144           }
00145         if(fixseq && (fixed_pos <= i) && ((int)i <= ((int)fixed_pos + (int)fixed.size() - 1))){
00146             ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00147             tree->initializeCounter(positionalSample, o, 0, weights);
00148             tree->normalize();
00149             positional_distribution[i] = tree;
00150         } else {
00151             ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00152 
00153             if(apriori != NULL  && apriori->factorable() != NULL){
00154               tree->initializeCounter(positionalSample, o, pseudocounts, weights);
00155                 tree->normalize(apriori, pseudocounts);
00156             } else {
00157               tree->initializeCounter(positionalSample, o, pseudocounts, weights);
00158                 tree->normalize();
00159 
00160             }
00161             positional_distribution[i] = tree;
00162         }
00163 
00164       }
00165     InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new    InhomogeneousMarkovChain());
00166     model->setPositionSpecificDistribution(positional_distribution);
00167     model->phased(0);
00168     model->setAlphabet(alphabet);
00169     loglikelihood = 0.0;
00170 
00171     for(int j = 0; j < (int)sample_set.size(); j++)
00172         {
00173         sample_size += (sample_set[j]->getSequence()).size();
00174         double v = model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size()-1);
00175         if (v <= -HUGE)
00176             continue;
00177 
00178         loglikelihood += v;
00179       }
00180     return model;
00181 
00182   }
00183   ProbabilisticModelPtr TrainWeightArrayModel::create( ProbabilisticModelParameters & parameters) const
00184   {
00185     int size;
00186     double loglikelihood;
00187     return create(parameters, loglikelihood, size);
00188   }
00189 }