[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 1.2.2
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Wed Oct 9 20:16:10 BST 2024
Étienne Mollier pushed to branch upstream at Debian Med / python-dnaio
Commits:
40879644 by Étienne Mollier at 2024-10-09T20:49:49+02:00
New upstream version 1.2.2
- - - - -
5 changed files:
- CHANGES.rst
- src/dnaio/_core.pyx
- src/dnaio/bam.h
- + tests/data/missing_quals.bam
- tests/test_internal.py
Changes:
=====================================
CHANGES.rst
=====================================
@@ -2,6 +2,11 @@
Changelog
=========
+v1.2.2 (2024-10-04)
+-------------------
+* :pr:`139`: Fix an error that occurred when decoding BAM records with missing
+ quality values. These are now converted to ``None``.
+
v1.2.1 (2024-06-17)
-------------------
=====================================
src/dnaio/_core.pyx
=====================================
@@ -824,10 +824,14 @@ cdef class BamIter:
bam_qual_start = bam_seq_start + encoded_seq_length
name = PyUnicode_New(name_length, 127)
sequence = PyUnicode_New(seq_length, 127)
- qualities = PyUnicode_New(seq_length, 127)
memcpy(<uint8_t *>PyUnicode_DATA(name), bam_name_start, name_length)
decode_bam_sequence(<uint8_t *>PyUnicode_DATA(sequence), bam_seq_start, seq_length)
- decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length)
+ if seq_length and bam_qual_start[0] == 0xff:
+ # 0xff means qualities are missing.
+ qualities = None
+ else:
+ qualities = PyUnicode_New(seq_length, 127)
+ decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length)
seq_record = SequenceRecord.__new__(SequenceRecord)
seq_record._name = name
seq_record._sequence = sequence
=====================================
src/dnaio/bam.h
=====================================
@@ -13,6 +13,7 @@
#endif
#define COMPILER_HAS_TARGET (GCC_AT_LEAST(4, 8) || CLANG_COMPILER_HAS(__target__))
+#define COMPILER_HAS_CONSTRUCTOR (__GNUC__ || CLANG_COMPILER_HAS(constructor))
#define COMPILER_HAS_OPTIMIZE (GCC_AT_LEAST(4,4) || CLANG_COMPILER_HAS(optimize))
#if defined(__x86_64__) || defined(_M_X64)
@@ -56,7 +57,11 @@ decode_bam_sequence_default(uint8_t *dest, const uint8_t *encoded_sequence, size
}
}
-#if COMPILER_HAS_TARGET && BUILD_IS_X86_64
+static void (*decode_bam_sequence)(
+ uint8_t *dest, const uint8_t *encoded_sequence, size_t length
+) = decode_bam_sequence_default;
+
+#if COMPILER_HAS_TARGET && COMPILER_HAS_CONSTRUCTOR && BUILD_IS_X86_64
__attribute__((__target__("ssse3")))
static void
decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
@@ -66,79 +71,41 @@ decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t
const uint8_t *dest_end_ptr = dest + length;
uint8_t *dest_cursor = dest;
const uint8_t *encoded_cursor = encoded_sequence;
- const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i));
- __m128i first_upper_shuffle = _mm_setr_epi8(
- 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
- __m128i first_lower_shuffle = _mm_setr_epi8(
- 0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
- __m128i second_upper_shuffle = _mm_setr_epi8(
- 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
- __m128i second_lower_shuffle = _mm_setr_epi8(
- 0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
+ const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i) - 1);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup);
- /* Work on 16 encoded characters at the time resulting in 32 decoded characters
- Examples are given for 8 encoded characters A until H to keep it readable.
- Encoded stored as |AB|CD|EF|GH|
- Shuffle into |AB|00|CD|00|EF|00|GH|00| and
- |00|AB|00|CD|00|EF|00|GH|
- Shift upper to the right resulting into
- |0A|B0|0C|D0|0E|F0|0G|H0| and
- |00|AB|00|CD|00|EF|00|GH|
- Merge with or resulting into (X stands for garbage)
- |0A|XB|0C|XD|0E|XF|0G|XH|
- Bitwise and with 0b1111 leads to:
- |0A|0B|0C|0D|0E|0F|0G|0H|
- We can use the resulting 4-bit integers as indexes for the shuffle of
- the nucleotide lookup. */
+ /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
+ as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
+ together with the pshufb instruction as a lookup table. The most efficient
+ the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
+ The lookup can then be performed and the resulting vectors can be
+ interleaved again using the unpack instructions. */
while (dest_cursor < dest_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor);
-
- __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
- __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
- __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
- __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
- __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
- __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
+ __m128i encoded_upper = _mm_srli_epi64(encoded, 4);
+ encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
+ __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
+ __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
+ __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
+ __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
+ __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
_mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides);
-
- __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
- __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
- __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
- __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
- __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
- __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides);
-
encoded_cursor += sizeof(__m128i);
dest_cursor += 2 * sizeof(__m128i);
}
decode_bam_sequence_default(dest_cursor, encoded_cursor, dest_end_ptr - dest_cursor);
}
-static void (*decode_bam_sequence)(
- uint8_t *dest, const uint8_t *encoded_sequence, size_t length);
-
-/* Simple dispatcher function, updates the function pointer after testing the
- CPU capabilities. After this, the dispatcher function is not needed anymore. */
-static void decode_bam_sequence_dispatch(
- uint8_t *dest, const uint8_t *encoded_sequence, size_t length) {
+/* Constructor functions run at dynamic link time. This checks the CPU capabilities
+ and updates the function pointer accordingly. */
+__attribute__((constructor))
+static void decode_bam_sequence_dispatch(void) {
if (__builtin_cpu_supports("ssse3")) {
decode_bam_sequence = decode_bam_sequence_ssse3;
}
else {
decode_bam_sequence = decode_bam_sequence_default;
}
- decode_bam_sequence(dest, encoded_sequence, length);
-}
-
-static void (*decode_bam_sequence)(
- uint8_t *dest, const uint8_t *encoded_sequence, size_t length
-) = decode_bam_sequence_dispatch;
-
-#else
-static inline void decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
-{
- decode_bam_sequence_default(dest, encoded_sequence, length);
}
#endif
=====================================
tests/data/missing_quals.bam
=====================================
Binary files /dev/null and b/tests/data/missing_quals.bam differ
=====================================
tests/test_internal.py
=====================================
@@ -803,3 +803,13 @@ class TestBamReader:
with pytest.raises(NotImplementedError) as error:
next(it)
assert error.match("unmapped single reads")
+
+ def test_parse_bam_missing_quals(self):
+ bam = TEST_DATA / "missing_quals.bam"
+ reader = BamReader(str(bam))
+ records = list(reader)
+ assert len(records) == 1
+ record = records[0]
+ assert record.sequence == "GATTACA"
+ # Phred scores should default to 0 + 33 == ord('!') when missing.
+ assert record.qualities is None
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/40879644044aca1c849a2d9722d5d3acaa3d0f87
--
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/40879644044aca1c849a2d9722d5d3acaa3d0f87
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20241009/f89f310f/attachment-0001.htm>
More information about the debian-med-commit
mailing list