[med-svn] [Git][med-team/libfastahack][upstream] New upstream version 1.0.0+dfsg
Michael R. Crusoe
gitlab at salsa.debian.org
Wed Sep 4 08:44:55 BST 2019
Michael R. Crusoe pushed to branch upstream at Debian Med / libfastahack
Commits:
45c8227d by Michael R. Crusoe at 2019-09-04T06:37:29Z
New upstream version 1.0.0+dfsg
- - - - -
2 changed files:
- Fasta.cpp
- + tests/crlf.fasta
Changes:
=====================================
Fasta.cpp
=====================================
@@ -3,7 +3,7 @@
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 9 February 2010 (EG)
+// Last modified: 26 February 2019 (AMN)
// ---------------------------------------------------------------------------
#include "Fasta.h"
@@ -87,6 +87,28 @@ ostream& operator<<(ostream& output, FastaIndex& fastaIndex) {
for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) {
output << *fit << endl;
}
+ return output;
+}
+
+// Read a line from in into line. Line endings ('\r', '\n', etc.) are not
+// written to line but is counted in bytes, which will hold the total bytes
+// consumed. Supports both '\n' and '\r\n' line endings. Returns true if data
+// was read, and false on EOF before anything could be read.
+bool getlineCounting(istream& in, string& line, int& bytes) {
+ bytes = 0;
+ line.clear();
+ for(int got = in.get(); got != EOF; got = in.get()) {
+ bytes++;
+ if (got == '\n') {
+ // Line is over, but we read something (the '\n')
+ return true;
+ } else if (got != '\r') {
+ // Anything other than a '\r' is real data.
+ line.push_back((char)got);
+ }
+ // '\r' is skipped, but still counted in bytes
+ }
+ return !line.empty();
}
void FastaIndex::indexReference(string refname) {
@@ -100,6 +122,7 @@ void FastaIndex::indexReference(string refname) {
FastaIndexEntry entry; // an entry buffer used in processing
entry.clear();
int line_length = 0;
+ int line_bytes = 0;
long long offset = 0; // byte offset from start of file
long long line_number = 0; // current line number
bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file
@@ -112,18 +135,20 @@ void FastaIndex::indexReference(string refname) {
ifstream refFile;
refFile.open(refname.c_str());
if (refFile.is_open()) {
- while (getline(refFile, line)) {
+ while (getlineCounting(refFile, line, line_bytes)) {
++line_number;
line_length = line.length();
if (line[0] == ';') {
// fasta comment, skip
} else if (line[0] == '+') {
// fastq quality header
- getline(refFile, line);
- line_length = line.length();
- offset += line_length + 1;
- // get and don't handle the quality line
- getline(refFile, line);
+
+ // account for header offset
+ offset += line_bytes;
+
+ // read in quality line so its offset will be accounted for too
+ // TODO: we don't support the quality offset field of the FAI format
+ getlineCounting(refFile, line, line_bytes);
line_length = line.length();
} else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header
// if we aren't on the first entry, push the last sequence into the index
@@ -139,7 +164,6 @@ void FastaIndex::indexReference(string refname) {
entry.offset = offset;
entry.length += line_length;
if (entry.line_len) {
- //entry.line_len = entry.line_len ? entry.line_len : line_length + 1;
if (mismatchedLineLengths || emptyLine) {
if (line_length == 0) {
emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
@@ -157,18 +181,18 @@ void FastaIndex::indexReference(string refname) {
// this flag is set here and checked on the next line
// because we may have reached the end of the sequence, in
// which case a mismatched line length is OK
- if (entry.line_len != line_length + 1) {
+ if (entry.line_len != line_bytes) {
mismatchedLineLengths = true;
if (line_length == 0) {
emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
}
}
} else {
- entry.line_len = line_length + 1; // first line
+ entry.line_len = line_bytes; // first line
+ entry.line_blen = line_length;
}
- entry.line_blen = entry.line_len - 1;
}
- offset += line_length + 1;
+ offset += line_bytes;
}
// we've hit the end of the fasta file!
// flush the last entry
@@ -249,8 +273,9 @@ FastaReference::~FastaReference(void) {
string FastaReference::getSequence(string seqname) {
FastaIndexEntry entry = index->entry(seqname);
- int newlines_in_sequence = entry.length / entry.line_blen;
- int seqlen = newlines_in_sequence + entry.length;
+ int bytes_per_newline = entry.line_len - entry.line_blen;
+ int newline_bytes_in_sequence = entry.length / entry.line_blen * bytes_per_newline;
+ int seqlen = newline_bytes_in_sequence + entry.length;
char* seq = (char*) calloc (seqlen + 1, sizeof(char));
fseek64(file, entry.offset, SEEK_SET);
string s;
@@ -258,6 +283,7 @@ string FastaReference::getSequence(string seqname) {
seq[seqlen] = '\0';
char* pbegin = seq;
char* pend = seq + (seqlen/sizeof(char));
+ pend = remove(pbegin, pend, '\r');
pend = remove(pbegin, pend, '\n');
pend = remove(pbegin, pend, '\0');
s = seq;
@@ -299,7 +325,8 @@ string FastaReference::getSubSequence(string seqname, int start, int length) {
int newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0;
int newlines_by_end = (start + length - 1) / entry.line_blen;
int newlines_inside = newlines_by_end - newlines_before;
- int seqlen = length + newlines_inside;
+ int bytes_per_newline = entry.line_len - entry.line_blen;
+ int seqlen = length + newlines_inside * bytes_per_newline;
char* seq = (char*) calloc (seqlen + 1, sizeof(char));
fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET);
string s;
@@ -307,6 +334,7 @@ string FastaReference::getSubSequence(string seqname, int start, int length) {
seq[seqlen] = '\0';
char* pbegin = seq;
char* pend = seq + (seqlen/sizeof(char));
+ pend = remove(pbegin, pend, '\r');
pend = remove(pbegin, pend, '\n');
pend = remove(pbegin, pend, '\0');
s = seq;
=====================================
tests/crlf.fasta
=====================================
@@ -0,0 +1,7 @@
+>chr
+C
+A
+A
+G
+T
+C
View it on GitLab: https://salsa.debian.org/med-team/libfastahack/commit/45c8227d6e4a8656c363aed15a0a3fd7c179d00c
--
View it on GitLab: https://salsa.debian.org/med-team/libfastahack/commit/45c8227d6e4a8656c363aed15a0a3fd7c179d00c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190904/83c44d64/attachment-0001.html>
More information about the debian-med-commit
mailing list