ToPS
TrainVariableLengthInhomogeneousMarkovChain.cpp
00001 /*
00002  *       TrainVariableLengthInhomogeneousMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ContextTree.hpp"
00026 #include "TrainVariableLengthInhomogeneousMarkovChain.hpp"
00027 #include "InhomogeneousMarkovChain.hpp"
00028 #include "ProbabilisticModelParameter.hpp"
00029 namespace tops {
00030 
00031 ProbabilisticModelPtr TrainVariableLengthInhomogeneousMarkovChain::create(
00032                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00033                 int & sample_size) const {
00034         ProbabilisticModelParameterValuePtr alphabetpar =
00035                         parameters.getMandatoryParameterValue("alphabet");
00036         ProbabilisticModelParameterValuePtr trainingsetpar =
00037                         parameters.getMandatoryParameterValue("training_set");
00038         ProbabilisticModelParameterValuePtr cutpar =
00039                         parameters.getMandatoryParameterValue("cut");
00040         ProbabilisticModelParameterValuePtr lengthpar =
00041                         parameters.getMandatoryParameterValue("length");
00042         ProbabilisticModelParameterValuePtr phasedpar =
00043                         parameters.getMandatoryParameterValue("phased");
00044 
00045         if ((alphabetpar == NULL) || (trainingsetpar == NULL)
00046             || (lengthpar == NULL) || (cutpar == NULL) || (phasedpar == NULL)) {
00047                 std::cerr << help() << std::endl;
00048                 exit(-1);
00049         }
00050         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00051         alphabet->initializeFromVector(alphabetpar ->getStringVector());
00052         SequenceEntryList sample_set;
00053         readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString());
00054         std::vector<std::string> cuts = cutpar->getStringVector();
00055         int length = lengthpar->getInt();
00056         std::vector<ContextTreePtr> positional_distribution;
00057         int phased = phasedpar->getInt();
00058         positional_distribution.resize(length);
00059         sample_size = 0;
00060         if ((int) cuts.size() < length) {
00061                 std::cerr
00062                                 << "ERROR: The number of cuts is smaller than the sequence length "
00063                                 << std::endl;
00064                 exit(-1);
00065         }
00066 
00067 
00068         for (int i = 0; i < length; i++) {
00069                 SequenceEntryList positionalSample;
00070                 int o = i;
00071                 for (int j = 0; j < (int) sample_set.size(); j++) {
00072                         if(!phased) {
00073                           Sequence s;
00074                           s.resize(o + 1);
00075                           int l = 0;
00076                           int m = i - o;
00077                           if (m < 0)
00078                             m = 0;
00079 
00080                           while ((m < (int) (sample_set[j]->getSequence()).size()) && (l <= o)) {
00081                             s[l] = (sample_set[j]->getSequence())[m];
00082                             l++;
00083                             m++;
00084                           }
00085                           SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet));
00086                           entry->setSequence(s);
00087                           positionalSample.push_back(entry);
00088                         } else {
00089                           o = int(log( (double) (sample_set[j]->getSequence()).size()));
00090                           int m = i - o;
00091                           if(m < 0) { m = 0; o = o -i;}
00092                           int last_m = m;
00093                           while((m < (int) (sample_set[j]->getSequence()).size())) {
00094                             Sequence s;
00095                             s.resize(o+1);
00096                             int l = 0;
00097                             while ((m < (int) (sample_set[j]->getSequence()).size()) && (l <= o)) {
00098                               s[l] = (sample_set[j]->getSequence())[m];
00099                               l++;
00100                               m++;
00101                             }
00102                             m = last_m + length;
00103                             last_m = m;
00104                             SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet));
00105                             entry->setSequence(s);
00106                             positionalSample.push_back(entry);
00107                           }
00108                         }
00109                 }
00110                 ProbabilisticModelParameterValuePtr c = parameters.getOptionalParameterValue(cuts[i]);
00111                 if(c == NULL){
00112                   std::cerr << "Undefined cut value for " << cuts[i] << std::endl;
00113                   exit(-1);
00114                 }
00115                 double delta =
00116                                 (parameters.getOptionalParameterValue(cuts[i]))->getDouble();
00117                 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00118                 tree->initializeContextTreeRissanen(positionalSample);
00119                 tree->pruneTree(delta);
00120                 tree->removeContextNotUsed();
00121                 tree->normalize();
00122                 positional_distribution[i] = tree;
00123         }
00124         InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain());
00125         model->setPositionSpecificDistribution(positional_distribution);
00126         model->phased(0);
00127         model->setAlphabet(alphabet);
00128         model->phased(phased);
00129         loglikelihood = 0.0;
00130 
00131         for (int j = 0; j < (int) sample_set.size(); j++) {
00132           sample_size += (sample_set[j]->getSequence()).size();
00133           loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size()
00134                                            - 1);
00135         }
00136         return model;
00137 
00138 }
00139 ProbabilisticModelPtr TrainVariableLengthInhomogeneousMarkovChain::create(
00140                 ProbabilisticModelParameters & parameters) const {
00141         int size;
00142         double loglikelihood;
00143         return create(parameters, loglikelihood, size);
00144 }
00145 }