ToPS
TrainInterpolatedPhasedMarkovChain.cpp
00001 /*
00002  *       TrainInterpolatedPhasedMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "ContextTree.hpp"
00026 #include "TrainInterpolatedPhasedMarkovChain.hpp"
00027 #include "InhomogeneousMarkovChain.hpp"
00028 #include "ProbabilisticModelParameter.hpp"
00029 #include "ProbabilisticModelCreatorClient.hpp"
00030 namespace tops {
00031 
00032     ProbabilisticModelPtr TrainInterpolatedPhasedMarkovChain::create(
00033                                                                      ProbabilisticModelParameters & parameters, double & loglikelihood,
00034                                                                      int & sample_size) const {
00035         ProbabilisticModelParameterValuePtr alphabetpar =
00036             parameters.getMandatoryParameterValue("alphabet");
00037         ProbabilisticModelParameterValuePtr trainingsetpar =
00038             parameters.getMandatoryParameterValue("training_set");
00039         ProbabilisticModelParameterValuePtr trainingsetpar1 =
00040             parameters.getOptionalParameterValue("training_set_1");
00041         ProbabilisticModelParameterValuePtr trainingsetpar2 =
00042             parameters.getOptionalParameterValue("training_set_2");
00043 
00044 
00045         ProbabilisticModelParameterValuePtr orderpar =
00046             parameters.getMandatoryParameterValue("order");
00047         ProbabilisticModelParameterValuePtr number_of_phasespar =
00048             parameters.getMandatoryParameterValue("number_of_phases");
00049         ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts");
00050         ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori");
00051         ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights");
00052         std::map <std::string, double> weights;
00053         if(weightspar != NULL) {
00054           readMapFromFile(weights, weightspar->getString());
00055         }
00056 
00057         double pseudocounts = 0;
00058         if(pseudocountspar != NULL)
00059             pseudocounts = pseudocountspar->getDouble();
00060         ProbabilisticModelPtr apriori;
00061         if(aprioripar != NULL)
00062             {
00063                 ProbabilisticModelCreatorClient c;
00064                 apriori = c.create(aprioripar->getString());
00065             }
00066 
00067 
00068         if ((alphabetpar == NULL) || (trainingsetpar == NULL)
00069             || (orderpar == NULL) || (number_of_phasespar == NULL)) {
00070             std::cerr << help() << std::endl;
00071             exit(-1);
00072         }
00073         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00074         alphabet->initializeFromVector(alphabetpar ->getStringVector());
00075         SequenceEntryList sample_set;
00076         readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString());
00077         SequenceEntryList sample_set_1;
00078         if(trainingsetpar1 != NULL)
00079             {
00080                 readSequencesFromFile(sample_set_1, alphabet, trainingsetpar1->getString());
00081             }
00082         SequenceEntryList sample_set_2;
00083         if(trainingsetpar2 != NULL)
00084             {
00085                 readSequencesFromFile(sample_set_2, alphabet, trainingsetpar1->getString());
00086             }
00087 
00088         int order = orderpar->getInt();
00089         int length = number_of_phasespar->getInt() ;
00090         std::vector<ContextTreePtr> positional_distribution;
00091         positional_distribution.resize(length);
00092         sample_size = 0;
00093 
00094         for (int i = 0; i < length; i++) {
00095             SequenceEntryList positionalSample;
00096             for(int j = 0; j < (int)sample_set.size(); j++)
00097                 {
00098                     int nseq = 0;
00099                     std::string name = sample_set[j]->getName();
00100                     while(true)
00101                         {
00102                             int start = (length) * nseq - order + i;
00103                             if(start < 0) {
00104                                 nseq++;
00105                                 continue;
00106                             }
00107                             int end = (length) * nseq + i;
00108                             if(end >= (int)(sample_set[j]->getSequence()).size())
00109                                 break;
00110                             Sequence s;
00111                             for(int k = start; k <= end; k++)
00112                                 s.push_back((sample_set[j]->getSequence())[k]);
00113                             SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet));
00114                             entry->setSequence(s);
00115                             entry->setName(name);
00116                             positionalSample.push_back(entry);
00117                             nseq++;
00118                         }
00119                 }
00120 
00121 
00122 
00123 
00124 
00125             for(int j = 0; j < (int)sample_set_1.size(); j++)
00126                 {
00127                     int nseq = 0;
00128                     std::string name = sample_set_1[j]->getName();
00129                     while(true)
00130                         {
00131                             int start = (length) * nseq - order + i -1;
00132                             if(start < 0) {
00133                                 nseq++;
00134                                 continue;
00135                             }
00136                             int end = (length) * nseq + i -1;
00137                             if(end >= (int)(sample_set_1[j]->getSequence()).size())
00138                                 break;
00139                             Sequence s;
00140                             for(int k = start; k <= end; k++)
00141                                 s.push_back((sample_set_1[j]->getSequence())[k]);
00142                             SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet));
00143                             entry->setSequence(s);
00144                             entry->setName(name);
00145                             positionalSample.push_back(entry);
00146                             nseq++;
00147                         }
00148                 }
00149 
00150             for(int j = 0; j < (int)sample_set_2.size(); j++)
00151                 {
00152                     int nseq = 0;
00153                     std::string name = sample_set_2[j]->getName();
00154                     while(true)
00155                         {
00156                             int start = (length) * nseq - order + i -2;
00157                             if(start < 0) {
00158                                 nseq++;
00159                                 continue;
00160                             }
00161                             int end = (length) * nseq + i -2;
00162                             if(end >= (int)(sample_set_2[j]->getSequence()).size())
00163                                 break;
00164                             Sequence s;
00165                             for(int k = start; k <= end; k++)
00166                                 s.push_back((sample_set_2[j]->getSequence())[k]);
00167                             SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet));
00168                             entry->setSequence(s);
00169                             entry->setName(name);
00170                             positionalSample.push_back(entry);
00171                             nseq++;
00172                         }
00173                 }
00174 
00175             ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00176 
00177             if(apriori != NULL){
00178               tree->initializeCounter(positionalSample, order, 0, weights);
00179                 tree->pruneTreeSmallSampleSize(400);
00180                 tree->normalize(apriori, pseudocounts, i);
00181 
00182             } else {
00183               tree->initializeCounter(positionalSample, order, pseudocounts, weights);
00184                 tree->pruneTreeSmallSampleSize(400);
00185                 tree->normalize();
00186             }
00187 
00188             positional_distribution[i] = tree;
00189         }
00190         InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(
00191                                                                         new InhomogeneousMarkovChain());
00192         model->setPositionSpecificDistribution(positional_distribution);
00193         model->phased(1);
00194         model->setAlphabet(alphabet);
00195         loglikelihood = 0.0;
00196 
00197         for (int j = 0; j < (int) sample_set.size(); j++) {
00198             sample_size += (sample_set[j]->getSequence()).size();
00199 
00200             loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size()
00201                                              - 1);
00202         }
00203         return model;
00204 
00205     }
00206     ProbabilisticModelPtr TrainInterpolatedPhasedMarkovChain::create(
00207                                                                      ProbabilisticModelParameters & parameters) const {
00208         int size;
00209         double loglikelihood;
00210         return create(parameters, loglikelihood, size);
00211     }
00212 }
00213