ToPS
|
00001 /* 00002 * SequenceFormat.cpp 00003 * 00004 * Copyright 2011 Andre Yoshiaki Kashiwabara <akashiwabara@usp.br> 00005 * Ígor Bonádio <ibonadio@ime.usp.br> 00006 * Vitor Onuchic <vitoronuchic@gmail.com> 00007 * Alan Mitchell Durham <aland@usp.br> 00008 * 00009 * This program is free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 3 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * This program is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU General Public License 00020 * along with this program; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00022 * MA 02110-1301, USA. 00023 */ 00024 00025 #include "SequenceFormat.hpp" 00026 #include "SequenceEntry.hpp" 00027 #include "SequenceFactory.hpp" 00028 #include "util.hpp" 00029 #include "Alphabet.hpp" 00030 #include "Symbol.hpp" 00031 #include <stdlib.h> 00032 #include <sys/types.h> 00033 00034 00035 00036 namespace tops { 00037 00038 std::ostream & SequenceFormat::saveSequence (std::ostream & stream, SequenceEntry & out){ 00039 stream << out.getName() << ":\t" ; 00040 if((out.getAlphabet())->size() > 0) 00041 for(int i = 0; i < (int)out.getSequence().size(); i+=1){ 00042 stream << (out.getAlphabet())->getSymbol((out.getSequence())[i])->name() << out.getSeparator(); 00043 } 00044 else 00045 for(int i = 0; i < (int)out.getSequence().size(); i+=1){ 00046 stream << (out.getSequence())[i] << out.getSeparator(); 00047 } 00048 stream << std::endl; 00049 return stream; 00050 00051 } 00052 std::istream & SequenceFormat::readSequence (std::istream & stream, SequenceEntry & in) { 00053 std::string line; 00054 std::string header; 00055 std::string sequence; 00056 SequenceFactory factory(in.getAlphabet()); 00057 in.getSequence() = factory.createSequence(sequence); 00058 00059 if(stream.bad()){ 00060 return stream; 00061 } 00062 while(!stream.eof()){ 00063 getline(stream,line,'\n'); 00064 00065 // Ignores blank lines 00066 unsigned int i; 00067 for(i = 0; i < line.length(); i++) 00068 if(!isspace(line[i]) && line[i] != '\n'){ 00069 break; 00070 } 00071 if(line[i] == '\0'){ 00072 continue; 00073 } 00074 trim_spaces(line); 00075 00076 std::vector<std::string> seq_entry; 00077 boost::regex separator(":"); 00078 split_regex(line, seq_entry, separator); 00079 if(seq_entry.size() < 2) 00080 { 00081 std::cerr << "Invalid sequence format !" << std::endl; 00082 exit(-1); 00083 } 00084 std::string seqname = ""; 00085 int last = seq_entry.size()-1; 00086 if(seq_entry.size() >= 2) 00087 { 00088 seqname = seq_entry[0]; 00089 } 00090 00091 boost::regex sep(" "); 00092 in.setName( seqname); 00093 std::string description; 00094 trim_spaces(seq_entry[last]); 00095 std::vector<int> invalid; 00096 in.setSequence(factory.createSequence(seq_entry[last], invalid)); 00097 in.setInvalidPositions(invalid); 00098 return stream; 00099 } 00100 return stream; 00101 } 00102 std::ostream & FastaSequenceFormat::saveSequence (std::ostream & stream, SequenceEntry & out) 00103 { 00104 stream << ">" << out.getName() << std::endl; 00105 if((out.getAlphabet())->size() > 0) 00106 for(int i = 0; i < (int)out.getSequence().size(); i+=1){ 00107 stream << (out.getAlphabet())->getSymbol((out.getSequence())[i])->name() ; 00108 } 00109 else 00110 for(int i = 0; i < (int)out.getSequence().size(); i+=1) 00111 stream << (out.getSequence())[i] << out.getSeparator(); 00112 stream << std::endl; 00113 return stream; 00114 } 00115 std::istream & FastaSequenceFormat::readSequence (std::istream & stream, SequenceEntry & in) 00116 { 00117 std::string line; 00118 std::string sequence ; 00119 bool firstSeq = true; 00120 if(_nextFastaHeader.length() > 0) { 00121 _currentFastaHeader = _nextFastaHeader; 00122 firstSeq = false; 00123 } 00124 while (!stream.eof()) { 00125 if(!std::getline(stream, line, '\n')) 00126 continue; 00127 unsigned int i; 00128 for (i = 0; i < line.length(); i++) 00129 if(!isspace(line[i]) && (line[i] != '\n')) 00130 break; 00131 if(line.length() <= 0 || i == line.length()) 00132 continue; 00133 00134 if(line[0] == '>') { 00135 if(firstSeq == true){ 00136 _currentFastaHeader = line; 00137 firstSeq =false; 00138 continue; 00139 } 00140 else { 00141 _nextFastaHeader = line; 00142 break; 00143 } 00144 } 00145 sequence += line; 00146 } 00147 int j = 0; 00148 for (int i = 0; i < (int)sequence.length(); i++) 00149 { 00150 if(isspace(sequence[i]) || (sequence[i] == '\n') ) 00151 continue; 00152 sequence[j] = sequence[i]; 00153 j++; 00154 } 00155 sequence = sequence.substr(0,j); 00156 if(sequence.length () <= 0) 00157 return stream; 00158 00159 SequenceFactory factory(in.getAlphabet()); 00160 trim_spaces(sequence); 00161 std::vector <int> invalid; 00162 in.setSequence(factory.createSequenceRemovedSpaces(sequence, invalid)); 00163 in.setInvalidPositions(invalid); 00164 boost::regex sep(" "); 00165 std::vector <std::string> name_and_description; 00166 split_regex(_currentFastaHeader, name_and_description, sep); 00167 00168 in.setName( name_and_description[0].substr(1, name_and_description[0].length()-1)); 00169 std::string description; 00170 for(int i = 1; i < (int)name_and_description.size(); i++) 00171 description += name_and_description[i]; 00172 in.setDescription ( description ); 00173 00174 return stream; 00175 } 00176 00177 SequenceFormatManagerPtr SequenceFormatManager::_inst ; 00178 SequenceFormatManagerPtr SequenceFormatManager::instance() { 00179 if(!_inst) { 00180 _inst = SequenceFormatManagerPtr(new SequenceFormatManager()); 00181 } 00182 return _inst; 00183 } 00184 }