ToPS: TrainPhasedMarkovChain.cpp Source File

00001 /*
00002  *       TrainPhasedMarkovChain.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 /*
00026  * TrainPhasedMarkovChain.cpp
00027  *
00028  *  Created on: 4/Mai/2009
00029  *      Author: yoshiaki
00030  */
00031 #include "ContextTree.hpp"
00032 #include "TrainPhasedMarkovChain.hpp"
00033 #include "InhomogeneousMarkovChain.hpp"
00034 #include "ProbabilisticModelParameter.hpp"
00035 #include "ProbabilisticModelCreatorClient.hpp"
00036 namespace tops {
00037 
00038 ProbabilisticModelPtr TrainPhasedMarkovChain::create(
00039                 ProbabilisticModelParameters & parameters, double & loglikelihood,
00040                 int & sample_size) const {
00041         ProbabilisticModelParameterValuePtr alphabetpar =
00042                         parameters.getMandatoryParameterValue("alphabet");
00043         ProbabilisticModelParameterValuePtr trainingsetpar =
00044                         parameters.getMandatoryParameterValue("training_set");
00045         ProbabilisticModelParameterValuePtr orderpar =
00046                         parameters.getMandatoryParameterValue("order");
00047         ProbabilisticModelParameterValuePtr number_of_phasespar =
00048           parameters.getMandatoryParameterValue("number_of_phases");
00049     ProbabilisticModelParameterValuePtr pseudocountspar = parameters.getOptionalParameterValue("pseudo_counts");
00050         ProbabilisticModelParameterValuePtr aprioripar = parameters.getOptionalParameterValue("apriori");
00051         ProbabilisticModelParameterValuePtr weightspar = parameters.getOptionalParameterValue("weights");
00052         std::map <std::string, double> weights;
00053         if(weightspar != NULL) {
00054           readMapFromFile(weights, weightspar->getString());
00055         }
00056 
00057     double pseudocounts = 0;
00058     if(pseudocountspar != NULL)
00059         pseudocounts = pseudocountspar->getDouble();
00060     ProbabilisticModelPtr apriori;
00061     if(aprioripar != NULL)
00062         {
00063             ProbabilisticModelCreatorClient c;
00064             apriori = c.create(aprioripar->getString());
00065         }
00066 
00067 
00068 
00069 
00070         if ((alphabetpar == NULL) || (trainingsetpar == NULL)
00071             || (orderpar == NULL) || (number_of_phasespar == NULL)) {
00072           std::cerr << help() << std::endl;
00073           exit(-1);
00074         }
00075         AlphabetPtr alphabet = AlphabetPtr(new Alphabet());
00076         alphabet->initializeFromVector(alphabetpar ->getStringVector());
00077         SequenceEntryList sample_set;
00078         readSequencesFromFile(sample_set, alphabet, trainingsetpar->getString());
00079         int order = orderpar->getInt();
00080         int length = number_of_phasespar->getInt() ;
00081         std::vector<ContextTreePtr> positional_distribution;
00082         positional_distribution.resize(length);
00083         sample_size = 0;
00084 
00085         for (int i = 0; i < length; i++) {
00086                 SequenceEntryList positionalSample;
00087                 for(int j = 0; j < (int)sample_set.size(); j++)
00088                 {
00089                   int nseq = 0;
00090                   while(true)
00091                     {
00092                       int start = (length) * nseq - order + i;
00093                       if(start < 0) {
00094                         nseq++;
00095                         continue;
00096                       }
00097                       int end = (length) * nseq + i;
00098                       if(end >= (int)(sample_set[j]->getSequence()).size())
00099                         break;
00100                       Sequence s;
00101                       for(int k = start; k <= end; k++)
00102                         s.push_back((sample_set[j]->getSequence())[k]);
00103                       SequenceEntryPtr entry = SequenceEntryPtr (new SequenceEntry(alphabet));
00104                       entry->setSequence(s);
00105                       positionalSample.push_back(entry);
00106                       nseq++;
00107                     }
00108                 }
00109                 ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet));
00110 
00111                 if(apriori != NULL){
00112                   tree->initializeCounter(sample_set, orderpar->getInt(), 0, weights);
00113                     tree->normalize(apriori, pseudocounts, i);
00114                 } else {
00115 
00116                   tree->initializeCounter(sample_set, orderpar->getInt(), pseudocounts, weights);
00117                     tree->normalize();
00118                 }
00119 
00120 
00121 
00122                 positional_distribution[i] = tree;
00123 
00124         }
00125         InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(
00126                         new InhomogeneousMarkovChain());
00127         model->setPositionSpecificDistribution(positional_distribution);
00128         model->phased(1);
00129         model->setAlphabet(alphabet);
00130         loglikelihood = 0.0;
00131 
00132         for (int j = 0; j < (int) sample_set.size(); j++) {
00133           sample_size += (sample_set[j]->getSequence()).size();
00134 
00135           loglikelihood += model->evaluate((sample_set[j]->getSequence()), 0, (sample_set[j]->getSequence()).size()
00136                                 - 1);
00137         }
00138         return model;
00139 
00140 }
00141 ProbabilisticModelPtr TrainPhasedMarkovChain::create(
00142                 ProbabilisticModelParameters & parameters) const {
00143         int size;
00144         double loglikelihood;
00145         return create(parameters, loglikelihood, size);
00146 }
00147 }
00148