ToPS
SequenceFormat.cpp
00001 /*
00002  *       SequenceFormat.cpp
00003  *
00004  *       Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br>
00005  *                      Ígor Bonádio <ibonadio@ime.usp.br>
00006  *                      Vitor Onuchic <vitoronuchic@gmail.com>
00007  *                      Alan Mitchell Durham <aland@usp.br>
00008  *
00009  *       This program is free software; you can redistribute it and/or modify
00010  *       it under the terms of the GNU  General Public License as published by
00011  *       the Free Software Foundation; either version 3 of the License, or
00012  *       (at your option) any later version.
00013  *
00014  *       This program is distributed in the hope that it will be useful,
00015  *       but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *       GNU General Public License for more details.
00018  *
00019  *       You should have received a copy of the GNU General Public License
00020  *       along with this program; if not, write to the Free Software
00021  *       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00022  *       MA 02110-1301, USA.
00023  */
00024 
00025 #include "SequenceFormat.hpp"
00026 #include "SequenceEntry.hpp"
00027 #include "SequenceFactory.hpp"
00028 #include "util.hpp"
00029 #include "Alphabet.hpp"
00030 #include "Symbol.hpp"
00031 #include <stdlib.h>
00032 #include <sys/types.h>
00033 
00034 
00035 
00036 namespace tops {
00037 
00038   std::ostream & SequenceFormat::saveSequence (std::ostream & stream, SequenceEntry & out){
00039     stream << out.getName() << ":\t" ;
00040     if((out.getAlphabet())->size() > 0)
00041       for(int i = 0; i < (int)out.getSequence().size(); i+=1){
00042           stream << (out.getAlphabet())->getSymbol((out.getSequence())[i])->name() << out.getSeparator();
00043       }
00044     else
00045       for(int i = 0; i < (int)out.getSequence().size(); i+=1){
00046         stream << (out.getSequence())[i] << out.getSeparator();
00047       }
00048     stream << std::endl;
00049     return stream;
00050 
00051   }
00052   std::istream & SequenceFormat::readSequence (std::istream & stream, SequenceEntry & in) {
00053     std::string line;
00054     std::string header;
00055     std::string sequence;
00056     SequenceFactory factory(in.getAlphabet());
00057     in.getSequence() = factory.createSequence(sequence);
00058 
00059     if(stream.bad()){
00060       return stream;
00061     }
00062     while(!stream.eof()){
00063       getline(stream,line,'\n');
00064 
00065       // Ignores blank lines
00066       unsigned int i;
00067       for(i = 0; i < line.length();  i++)
00068         if(!isspace(line[i]) && line[i] != '\n'){
00069           break;
00070         }
00071       if(line[i] == '\0'){
00072         continue;
00073       }
00074       trim_spaces(line);
00075 
00076       std::vector<std::string> seq_entry;
00077       boost::regex separator(":");
00078       split_regex(line, seq_entry, separator);
00079       if(seq_entry.size() < 2)
00080         {
00081           std::cerr << "Invalid sequence format !" << std::endl;
00082           exit(-1);
00083         }
00084       std::string seqname = "";
00085       int last = seq_entry.size()-1;
00086       if(seq_entry.size() >= 2)
00087           {
00088               seqname = seq_entry[0];
00089           }
00090 
00091       boost::regex sep(" ");
00092       in.setName( seqname);
00093       std::string description;
00094       trim_spaces(seq_entry[last]);
00095       std::vector<int> invalid;
00096       in.setSequence(factory.createSequence(seq_entry[last], invalid));
00097       in.setInvalidPositions(invalid);
00098       return stream;
00099     }
00100     return stream;
00101   }
00102   std::ostream & FastaSequenceFormat::saveSequence (std::ostream & stream, SequenceEntry & out)
00103   {
00104     stream << ">" << out.getName() << std::endl;
00105     if((out.getAlphabet())->size() > 0)
00106       for(int i = 0; i < (int)out.getSequence().size(); i+=1){
00107         stream << (out.getAlphabet())->getSymbol((out.getSequence())[i])->name() ;
00108       }
00109     else
00110       for(int i = 0; i < (int)out.getSequence().size(); i+=1)
00111         stream << (out.getSequence())[i] << out.getSeparator();
00112     stream << std::endl;
00113     return stream;
00114   }
00115   std::istream & FastaSequenceFormat::readSequence (std::istream & stream, SequenceEntry & in)
00116   {
00117     std::string line;
00118     std::string sequence ;
00119     bool firstSeq = true;
00120     if(_nextFastaHeader.length() > 0) {
00121       _currentFastaHeader = _nextFastaHeader;
00122       firstSeq = false;
00123     }
00124     while (!stream.eof()) {
00125       if(!std::getline(stream, line, '\n'))
00126          continue;
00127       unsigned int i;
00128       for (i = 0; i < line.length(); i++)
00129         if(!isspace(line[i]) && (line[i] != '\n'))
00130           break;
00131       if(line.length() <= 0 || i == line.length())
00132         continue;
00133 
00134       if(line[0] == '>') {
00135         if(firstSeq == true){
00136           _currentFastaHeader = line;
00137           firstSeq =false;
00138           continue;
00139         }
00140         else {
00141           _nextFastaHeader = line;
00142           break;
00143         }
00144       }
00145       sequence += line;
00146     }
00147     int j = 0;
00148     for (int i = 0; i < (int)sequence.length(); i++)
00149       {
00150         if(isspace(sequence[i]) || (sequence[i] == '\n') )
00151            continue;
00152         sequence[j] = sequence[i];
00153         j++;
00154       }
00155     sequence = sequence.substr(0,j);
00156     if(sequence.length () <= 0)
00157         return stream;
00158 
00159     SequenceFactory factory(in.getAlphabet());
00160     trim_spaces(sequence);
00161     std::vector <int> invalid;
00162     in.setSequence(factory.createSequenceRemovedSpaces(sequence, invalid));
00163     in.setInvalidPositions(invalid);
00164     boost::regex sep(" ");
00165     std::vector <std::string> name_and_description;
00166     split_regex(_currentFastaHeader, name_and_description, sep);
00167 
00168     in.setName( name_and_description[0].substr(1, name_and_description[0].length()-1));
00169     std::string description;
00170     for(int i = 1; i < (int)name_and_description.size(); i++)
00171       description += name_and_description[i];
00172     in.setDescription ( description );
00173 
00174     return stream;
00175   }
00176 
00177   SequenceFormatManagerPtr SequenceFormatManager::_inst ;
00178   SequenceFormatManagerPtr SequenceFormatManager::instance() {
00179     if(!_inst) {
00180       _inst = SequenceFormatManagerPtr(new SequenceFormatManager());
00181     }
00182     return _inst;
00183   }
00184 }