[med-svn] [Git][med-team/seqmagick][upstream] New upstream version 0.8.2

Sat Nov 14 19:04:06 GMT 2020


Étienne Mollier pushed to branch upstream at Debian Med / seqmagick


Commits:
3e36ac3b by Étienne Mollier at 2020-11-14T19:37:22+01:00
New upstream version 0.8.2
- - - - -


19 changed files:

- .travis.yml
- README.rst
- docs/changelog.rst
- docs/index.rst
- examples/apply-function/myfunctions.py
- requirements-rtd.txt
- requirements.txt
- seqmagick/subcommands/convert.py
- seqmagick/subcommands/primer_trim.py
- seqmagick/subcommands/quality_filter.py
- + seqmagick/test/integration/data/input6.fasta
- seqmagick/test/integration/data/output2.nex
- + seqmagick/test/integration/data/output3.nex
- + seqmagick/test/integration/data/output4.nex
- seqmagick/test/integration/test_convert.py
- seqmagick/test/test_primer_trim.py
- seqmagick/test/test_subcommands_quality_filter.py
- seqmagick/test/test_transform.py
- seqmagick/transform.py


Changes:

=====================================
.travis.yml
=====================================
@@ -3,7 +3,7 @@ python:
   # - "2.7"
   # - "pypy"
   # - "3.4"
-  - "3.5"
+  # - "3.5"
   - "3.6"
   - "3.7"
   - "3.8"
@@ -12,7 +12,7 @@ python:
 # BioPython doesn't always play well with pip install.
 install:
   - "if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install -q numpy; fi"
-  - "pip install -q biopython nose"
+  - "pip install -q biopython nose pygtrie"
   - "pip install ."
 
 script:


=====================================
README.rst
=====================================
@@ -19,7 +19,7 @@ there is one that takes arguments::
 Requirements
 ============
 
-* Python >= 3.4
+* Python >= 3.5
 * biopython >= 1.70
 
 Installation
@@ -29,7 +29,7 @@ Use pip::
 
    pip install seqmagick
 
-Note that as of version 0.7.0, this package requires Python 3.4+. If
+Note that as of version 0.8.0, this package requires Python 3.5+. If
 you want to use the most recent version compatible with Python 2.7::
 
   pip install seqmagick==0.6.2


=====================================
docs/changelog.rst
=====================================
@@ -1,6 +1,15 @@
 Changes for seqmagick
 =====================
 
+0.8.0
+-----
+
+* Supports Python 3.5+
+* Drops support for Python 3.4
+* Fix issue: "seqmagick with no params gives KeyError:None" [GH-77]
+* Fix for Biopython 1.71 dual coding support [GH-76]; also fixes issue: "Translation error with new BioPython" [GH-79]
+* Send logging to stderr, not stdout [GH-75]
+
 0.7.0
 -----
 


=====================================
docs/index.rst
=====================================
@@ -59,10 +59,10 @@ To install the bleeding edge version::
 
     pip install git+https://github.com/fhcrc/seqmagick.git@master#egg-info=seqmagick
 
-Note that as of version 0.7.0, this package requires Python 3.4+. If
+Note that as of version 0.8.0, this package requires Python 3.5+. If
 you want to use the most recent version compatible with Python 2.7::
 
-  pip install seqmagick==1.6.2
+  pip install seqmagick==0.6.2
 
 Use
 ===


=====================================
examples/apply-function/myfunctions.py
=====================================
@@ -14,6 +14,6 @@ def hash_starts_numeric(records):
     their sha-1 hash.
     """
     for record in records:
-        seq_hash = hashlib.sha1(str(record.seq)).hexdigest()
+        seq_hash = hashlib.sha1(str(record.seq).encode('utf-8')).hexdigest()
         if seq_hash[0].isdigit():
             yield record


=====================================
requirements-rtd.txt
=====================================
@@ -1,2 +1,3 @@
 biopython
 sphinx
+pygtrie


=====================================
requirements.txt
=====================================
@@ -1,4 +1,5 @@
 biopython>=1.70
+pygtrie
 
 # for development
 wheel


=====================================
seqmagick/subcommands/convert.py
=====================================
@@ -6,8 +6,7 @@ import functools
 import logging
 import random
 
-from Bio import Alphabet, SeqIO
-from Bio.Alphabet import IUPAC
+from Bio import SeqIO
 from Bio.SeqIO import FastaIO
 from seqmagick import transform
 from seqmagick.fileformat import from_handle
@@ -15,11 +14,11 @@ from seqmagick.fileformat import from_handle
 from . import common
 
 ALPHABETS = {
-        'dna': Alphabet.generic_dna,
-        'dna-ambiguous': IUPAC.ambiguous_dna,
-        'protein': Alphabet.generic_protein,
-        'rna': Alphabet.generic_rna,
-        'rna-ambiguous': IUPAC.ambiguous_rna,
+    "dna": "DNA",
+    "dna-ambiguous": "DNA",
+    "rna": "RNA",
+    "rna-ambiguous": "RNA",
+    "protein": "protein",
 }
 
 def add_options(parser):
@@ -230,6 +229,13 @@ def build_parser(parser):
 
     return parser
 
+
+def append_annotation_iterator(records_iterator, alphabet):
+    for record in records_iterator:
+        record.annotations["molecule_type"] = ALPHABETS[alphabet]
+        yield record
+
+
 def transform_file(source_file, destination_file, arguments):
     # Get just the file name, useful for naming the temporary file.
     source_file_type = (arguments.input_format or from_handle(source_file))
@@ -249,8 +255,7 @@ def transform_file(source_file, destination_file, arguments):
                 direction=directions[direction])
     else:
         # Unsorted iterator.
-        records = SeqIO.parse(source_file, source_file_type,
-                alphabet=ALPHABETS.get(arguments.alphabet))
+        records = SeqIO.parse(source_file, source_file_type)
 
 
     #########################################
@@ -315,6 +320,9 @@ def transform_file(source_file, destination_file, arguments):
         # loading the entire sequence file up into memory.
         logging.info("Applying transformations, writing to %s",
                 destination_file)
+        # Append datatype annotation, mandatory for Nexus files conversion.
+        if arguments.alphabet != None:
+            records = append_annotation_iterator(records, arguments.alphabet)
         SeqIO.write(records, destination_file, destination_file_type)
 
 


=====================================
seqmagick/subcommands/primer_trim.py
=====================================
@@ -7,8 +7,7 @@ import logging
 import operator
 import sys
 
-from Bio import Alphabet, SeqIO, pairwise2
-from Bio.Alphabet import IUPAC
+from Bio import SeqIO, pairwise2
 from Bio.Seq import Seq
 
 from seqmagick import transform, fileformat
@@ -298,8 +297,7 @@ def action(arguments):
     with arguments.source_file:
         sequences = SeqIO.parse(
             arguments.source_file,
-            source_format,
-            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))
+            source_format)
 
         # Locate primers
         (forward_start, forward_end), (reverse_start, reverse_end) = locate_primers(
@@ -319,8 +317,7 @@ def action(arguments):
         arguments.source_file.seek(0)
         sequences = SeqIO.parse(
             arguments.source_file,
-            source_format,
-            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))
+            source_format)
 
         # Apply the transformation
         prune_action = _ACTIONS[arguments.prune_action]


=====================================
seqmagick/subcommands/quality_filter.py
=====================================
@@ -11,16 +11,29 @@ import sys
 import time
 
 from Bio import SeqIO
-try:
-    from Bio import trie, triefind
-except ImportError:
-    trie = None
-    triefind = None
+import pygtrie as trie
 from Bio.SeqIO import QualityIO
 
 from seqmagick import fileformat, __version__
 from .common import typed_range, FileType
 
+
+def trie_match(string, trie):
+    def has_prefix(teststring, trie):
+        for key in trie.keys():
+            if key.startswith(teststring):
+                return True
+        return False
+    longest = None
+    for i in range(len(string)):
+        substr = string[:i + 1]
+        if not has_prefix(substr, trie):
+            break
+        if trie.has_key(substr):
+            longest = substr
+    return longest
+
+
 # Default minimummean quality score
 DEFAULT_MEAN_SCORE = 25.0
 
@@ -626,7 +639,7 @@ class PrimerBarcodeFilter(BaseFilter):
         self.trie = trie
 
     def filter_record(self, record):
-        m = triefind.match(str(record.seq), self.trie)
+        m = trie_match(str(record.seq), self.trie)
         if m:
             if self.listener:
                 self.listener(
@@ -646,7 +659,7 @@ def parse_barcode_file(fp, primer=None, header=False):
 
     Any additional columns are ignored
     """
-    tr = trie.trie()
+    tr = trie.StringTrie()
     reader = csv.reader(fp)
 
     if header:
@@ -680,10 +693,6 @@ def action(arguments):
         raise ValueError("--quality-window-mean-qual specified without "
                          "--quality-window")
 
-    if trie is None or triefind is None:
-        raise ValueError(
-            'Missing Bio.trie and/or Bio.triefind modules. Cannot continue')
-
     filters = []
     input_type = fileformat.from_handle(arguments.sequence_file)
     output_type = fileformat.from_handle(arguments.output_file)


=====================================
seqmagick/test/integration/data/input6.fasta
=====================================
@@ -0,0 +1,6 @@
+>test1 test sequence 1 RNA
+AC-GU
+>test2 test sequence 2
+A-AAA
+>test3 sequence 3
+A---A


=====================================
seqmagick/test/integration/data/output2.nex
=====================================
@@ -1,7 +1,7 @@
 #NEXUS
 begin data;
-	dimensions ntax=3 nchar=5;
-	format datatype=dna missing=? gap=-;
+dimensions ntax=3 nchar=5;
+format datatype=dna missing=? gap=-;
 matrix
 test1 AC-GT
 test2 A-AAA


=====================================
seqmagick/test/integration/data/output3.nex
=====================================
@@ -0,0 +1,10 @@
+#NEXUS
+begin data;
+dimensions ntax=3 nchar=5;
+format datatype=rna missing=? gap=-;
+matrix
+test1 AC-GU
+test2 A-AAA
+test3 A---A
+;
+end;


=====================================
seqmagick/test/integration/data/output4.nex
=====================================
@@ -0,0 +1,10 @@
+#NEXUS
+begin data;
+dimensions ntax=3 nchar=5;
+format datatype=protein missing=? gap=-;
+matrix
+test1 AC-GT
+test2 A-AAA
+test3 A---A
+;
+end;


=====================================
seqmagick/test/integration/test_convert.py
=====================================
@@ -97,6 +97,20 @@ class ConvertToNexusTestCase(CommandLineTestMixIn, unittest.TestCase):
     command = 'convert {input} {output} --output-format nexus --alphabet dna-ambiguous'
 
 
+class ConvertToNexusRNATestCase(CommandLineTestMixIn, unittest.TestCase):
+    in_suffix = '.fasta'
+    input_path = p('input6.fasta')
+    expected_path = p('output3.nex')
+    command = 'convert {input} {output} --output-format nexus --alphabet rna'
+
+
+class ConvertToNexusProteinTestCase(CommandLineTestMixIn, unittest.TestCase):
+    in_suffix = '.fasta'
+    input_path = p('input2.fasta')
+    expected_path = p('output4.nex')
+    command = 'convert {input} {output} --output-format nexus --alphabet protein'
+
+
 class ConvertUngapCutTestCase(CommandLineTestMixIn, unittest.TestCase):
     in_suffix = '.fasta'
     out_suffix = '.fasta'


=====================================
seqmagick/test/test_primer_trim.py
=====================================
@@ -3,7 +3,6 @@ Tests for primer trim
 """
 import unittest
 
-from Bio import Alphabet
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 
@@ -70,8 +69,7 @@ class HammingDistanceTestCase(unittest.TestCase):
 
 
 def _alignment_record(sequence):
-    return SeqRecord(
-        Seq(sequence, alphabet=Alphabet.Gapped(Alphabet.generic_dna)))
+    return SeqRecord(Seq(sequence))
 
 
 class LocatePrimersTestCase(unittest.TestCase):


=====================================
seqmagick/test/test_subcommands_quality_filter.py
=====================================
@@ -7,8 +7,6 @@ from Bio.SeqRecord import SeqRecord
 
 from seqmagick.subcommands import quality_filter
 
-from Bio import triefind
-
 IS_PYPY = hasattr(sys, 'pypy_version_info')
 
 
@@ -194,7 +192,6 @@ class MaxLengthFilterTestCase(unittest.TestCase):
                          [i.id for i in actual])
 
 
-@unittest.skipIf(IS_PYPY, "Bio.trie not available on pypy.")
 class PrimerBarcodeFilterTestCase(unittest.TestCase):
     def setUp(self):
         self.sequences = [
@@ -240,7 +237,6 @@ class RecordEventListenerTestCase(unittest.TestCase):
         self.assertEqual(events, [1, 5])
 
 
-@unittest.skipIf(IS_PYPY, "Bio.trie not available on pypy.")
 class BarcodePrimerTrieTestCase(unittest.TestCase):
     def setUp(self):
         self.barcode_str = """p1d1bc205,TACTAGCG,CATTGCCTATG
@@ -258,9 +254,9 @@ p1d1bc213,TACGAGAC,CAYGGCTA"""
         res = quality_filter.parse_barcode_file(self.fp, primer='CATTGCCTATG')
         self.assertEqual(9, len(list(res.keys())))
         self.assertEqual('p1d1bc210', res['TACAGTCGCATTGCCTATG'])
-        self.assertEqual(None, triefind.match('TACAGTCGCATTGCCTAT', res))
+        self.assertEqual(None, quality_filter.trie_match('TACAGTCGCATTGCCTAT', res))
         self.assertEqual('TACAGTCGCATTGCCTATG',
-                         triefind.match('TACAGTCGCATTGCCTATGCTACCTA', res))
+                         quality_filter.trie_match('TACAGTCGCATTGCCTATGCTACCTA', res))
 
     def test_primer_in_file(self):
         res = quality_filter.parse_barcode_file(self.fp, primer=None)


=====================================
seqmagick/test/test_transform.py
=====================================
@@ -7,7 +7,7 @@ import functools
 import logging
 import unittest
 
-from Bio import Alphabet, SeqIO
+from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
 from Bio.Seq import Seq
 
@@ -16,16 +16,13 @@ from seqmagick import transform
 logging.basicConfig(level=logging.FATAL)
 
 def _alignment_record(sequence):
-    return SeqRecord(Seq(sequence,
-        alphabet=Alphabet.Gapped(Alphabet.generic_dna)))
+    return SeqRecord(Seq(sequence))
 
-def seqrecord(sequence_id, sequence_text, alphabet=Alphabet.generic_dna,
-              description=None):
+def seqrecord(sequence_id, sequence_text, description=None):
     """
     Quick shortcut to make a SeqRecord
     """
-    record = SeqRecord(Seq(sequence_text, alphabet),
-                       id=sequence_id)
+    record = SeqRecord(Seq(sequence_text), id=sequence_id)
     if description:
         record.description = description
     return record


=====================================
seqmagick/transform.py
=====================================
@@ -13,7 +13,6 @@ import tempfile
 import random
 
 from Bio import SeqIO
-from Bio.Alphabet import IUPAC
 from Bio.Data import CodonTable
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
@@ -63,8 +62,7 @@ def dashes_cleanup(records, prune_chars='.:?~'):
         "Applying _dashes_cleanup: converting any of '{}' to '-'.".format(prune_chars))
     translation_table = {ord(c): '-' for c in prune_chars}
     for record in records:
-        record.seq = Seq(str(record.seq).translate(translation_table),
-                         record.seq.alphabet)
+        record.seq = Seq(str(record.seq).translate(translation_table))
         yield record
 
 
@@ -169,8 +167,7 @@ def isolate_region(sequences, start, end, gap_char='-'):
         seq = sequence.seq
         start_gap = gap_char * start
         end_gap = gap_char * (len(seq) - end)
-        seq = Seq(start_gap + str(seq[start:end]) + end_gap,
-                alphabet=seq.alphabet)
+        seq = Seq(start_gap + str(seq[start:end]) + end_gap)
         sequence.seq = seq
         yield sequence
 
@@ -191,7 +188,7 @@ def drop_columns(records, slices):
         drop = set(i for slice in slices
                    for i in range(*slice.indices(len(record))))
         keep = [i not in drop for i in range(len(record))]
-        record.seq = Seq(''.join(itertools.compress(record.seq, keep)), record.seq.alphabet)
+        record.seq = Seq(''.join(itertools.compress(record.seq, keep)))
         yield record
 
 def multi_cut_sequences(records, slices):
@@ -690,7 +687,6 @@ def translate(records, translate):
     to_stop = translate.endswith('stop')
 
     source_type = translate[:3]
-    alphabet = {'dna': IUPAC.ambiguous_dna, 'rna': IUPAC.ambiguous_rna}[source_type]
 
     # Get a translation table
     table = {'dna': CodonTable.ambiguous_dna_by_name["Standard"],
@@ -704,7 +700,7 @@ def translate(records, translate):
 
     for record in records:
         sequence = str(record.seq)
-        seq = Seq(sequence, alphabet)
+        seq = Seq(sequence)
         protein = seq.translate(table, to_stop=to_stop)
         yield SeqRecord(protein, id=record.id, description=record.description)
 



View it on GitLab: https://salsa.debian.org/med-team/seqmagick/-/commit/3e36ac3bf6d53643ba93cafda0481629d1b4b6c3

-- 
View it on GitLab: https://salsa.debian.org/med-team/seqmagick/-/commit/3e36ac3bf6d53643ba93cafda0481629d1b4b6c3
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201114/00aa2561/attachment-0001.html>