[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 1.2.2

Wed Oct 9 20:16:10 BST 2024


Étienne Mollier pushed to branch upstream at Debian Med / python-dnaio


Commits:
40879644 by Étienne Mollier at 2024-10-09T20:49:49+02:00
New upstream version 1.2.2
- - - - -


5 changed files:

- CHANGES.rst
- src/dnaio/_core.pyx
- src/dnaio/bam.h
- + tests/data/missing_quals.bam
- tests/test_internal.py


Changes:

=====================================
CHANGES.rst
=====================================
@@ -2,6 +2,11 @@
 Changelog
 =========
 
+v1.2.2 (2024-10-04)
+-------------------
+* :pr:`139`: Fix an error that occurred when decoding BAM records with missing
+  quality values. These are now converted to ``None``.
+
 v1.2.1 (2024-06-17)
 -------------------
 


=====================================
src/dnaio/_core.pyx
=====================================
@@ -824,10 +824,14 @@ cdef class BamIter:
             bam_qual_start = bam_seq_start + encoded_seq_length
             name = PyUnicode_New(name_length, 127)
             sequence = PyUnicode_New(seq_length, 127)
-            qualities = PyUnicode_New(seq_length, 127)
             memcpy(<uint8_t *>PyUnicode_DATA(name), bam_name_start, name_length)
             decode_bam_sequence(<uint8_t *>PyUnicode_DATA(sequence), bam_seq_start, seq_length)
-            decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length)
+            if seq_length and bam_qual_start[0] == 0xff:
+                # 0xff means qualities are missing.
+                qualities = None
+            else:
+                qualities = PyUnicode_New(seq_length, 127)
+                decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length)
             seq_record = SequenceRecord.__new__(SequenceRecord)
             seq_record._name = name
             seq_record._sequence = sequence


=====================================
src/dnaio/bam.h
=====================================
@@ -13,6 +13,7 @@
 #endif
 
 #define COMPILER_HAS_TARGET (GCC_AT_LEAST(4, 8) || CLANG_COMPILER_HAS(__target__))
+#define COMPILER_HAS_CONSTRUCTOR (__GNUC__ || CLANG_COMPILER_HAS(constructor))
 #define COMPILER_HAS_OPTIMIZE (GCC_AT_LEAST(4,4) || CLANG_COMPILER_HAS(optimize))
 
 #if defined(__x86_64__) || defined(_M_X64)
@@ -56,7 +57,11 @@ decode_bam_sequence_default(uint8_t *dest, const uint8_t *encoded_sequence, size
     }
 }
 
-#if COMPILER_HAS_TARGET && BUILD_IS_X86_64
+static void (*decode_bam_sequence)(
+    uint8_t *dest, const uint8_t *encoded_sequence, size_t length
+) = decode_bam_sequence_default;
+
+#if COMPILER_HAS_TARGET && COMPILER_HAS_CONSTRUCTOR && BUILD_IS_X86_64
 __attribute__((__target__("ssse3")))
 static void 
 decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t length) 
@@ -66,79 +71,41 @@ decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t
     const uint8_t *dest_end_ptr = dest + length;
     uint8_t *dest_cursor = dest;
     const uint8_t *encoded_cursor = encoded_sequence;
-    const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i));
-    __m128i first_upper_shuffle = _mm_setr_epi8(
-        0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
-    __m128i first_lower_shuffle = _mm_setr_epi8(
-        0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
-    __m128i second_upper_shuffle = _mm_setr_epi8(
-        8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
-    __m128i second_lower_shuffle = _mm_setr_epi8(
-        0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
+    const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i) - 1);
     __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup);
-    /* Work on 16 encoded characters at the time resulting in 32 decoded characters
-       Examples are given for 8 encoded characters A until H to keep it readable.
-        Encoded stored as |AB|CD|EF|GH|
-        Shuffle into |AB|00|CD|00|EF|00|GH|00| and
-                     |00|AB|00|CD|00|EF|00|GH|
-        Shift upper to the right resulting into
-                     |0A|B0|0C|D0|0E|F0|0G|H0| and
-                     |00|AB|00|CD|00|EF|00|GH|
-        Merge with or resulting into (X stands for garbage)
-                     |0A|XB|0C|XD|0E|XF|0G|XH|
-        Bitwise and with 0b1111 leads to:
-                     |0A|0B|0C|0D|0E|0F|0G|0H|
-        We can use the resulting 4-bit integers as indexes for the shuffle of
-        the nucleotide lookup. */
+    /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
+       as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
+       together with the pshufb instruction as a lookup table. The most efficient
+       the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
+       The lookup can then be performed and the resulting vectors can be
+       interleaved again using the unpack instructions. */
     while (dest_cursor < dest_vec_end_ptr) {
         __m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor);
-
-        __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
-        __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
-        __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
-        __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
-        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
-        __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
+        __m128i encoded_upper = _mm_srli_epi64(encoded, 4);
+        encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
+        __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
+        __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
+        __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
+        __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
+        __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
         _mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides);
-
-        __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
-        __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
-        __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
-        __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
-        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
-        __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
         _mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides);
-
         encoded_cursor += sizeof(__m128i);
         dest_cursor += 2 * sizeof(__m128i);
     }
     decode_bam_sequence_default(dest_cursor, encoded_cursor, dest_end_ptr - dest_cursor);
 }
 
-static void (*decode_bam_sequence)(
-    uint8_t *dest, const uint8_t *encoded_sequence, size_t length);
-
-/* Simple dispatcher function, updates the function pointer after testing the
-   CPU capabilities. After this, the dispatcher function is not needed anymore. */
-static void decode_bam_sequence_dispatch(
-        uint8_t *dest, const uint8_t *encoded_sequence, size_t length) {
+/* Constructor functions run at dynamic link time. This checks the CPU capabilities 
+   and updates the function pointer accordingly. */
+__attribute__((constructor))
+static void decode_bam_sequence_dispatch(void) {
     if (__builtin_cpu_supports("ssse3")) {
         decode_bam_sequence = decode_bam_sequence_ssse3;
     }
     else {
         decode_bam_sequence = decode_bam_sequence_default;
     }
-    decode_bam_sequence(dest, encoded_sequence, length);
-}
-
-static void (*decode_bam_sequence)(
-    uint8_t *dest, const uint8_t *encoded_sequence, size_t length
-) = decode_bam_sequence_dispatch;
-
-#else
-static inline void decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length) 
-{
-    decode_bam_sequence_default(dest, encoded_sequence, length);
 }
 #endif 
 


=====================================
tests/data/missing_quals.bam
=====================================
Binary files /dev/null and b/tests/data/missing_quals.bam differ


=====================================
tests/test_internal.py
=====================================
@@ -803,3 +803,13 @@ class TestBamReader:
         with pytest.raises(NotImplementedError) as error:
             next(it)
         assert error.match("unmapped single reads")
+
+    def test_parse_bam_missing_quals(self):
+        bam = TEST_DATA / "missing_quals.bam"
+        reader = BamReader(str(bam))
+        records = list(reader)
+        assert len(records) == 1
+        record = records[0]
+        assert record.sequence == "GATTACA"
+        # Phred scores should default to 0 + 33 == ord('!') when missing.
+        assert record.qualities is None



View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/40879644044aca1c849a2d9722d5d3acaa3d0f87

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/40879644044aca1c849a2d9722d5d3acaa3d0f87
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20241009/f89f310f/attachment-0001.htm>