[med-svn] [Git][med-team/seqmagick][upstream] New upstream version 0.8.2
Étienne Mollier
gitlab at salsa.debian.org
Sat Nov 14 19:04:06 GMT 2020
Étienne Mollier pushed to branch upstream at Debian Med / seqmagick
Commits:
3e36ac3b by Étienne Mollier at 2020-11-14T19:37:22+01:00
New upstream version 0.8.2
- - - - -
19 changed files:
- .travis.yml
- README.rst
- docs/changelog.rst
- docs/index.rst
- examples/apply-function/myfunctions.py
- requirements-rtd.txt
- requirements.txt
- seqmagick/subcommands/convert.py
- seqmagick/subcommands/primer_trim.py
- seqmagick/subcommands/quality_filter.py
- + seqmagick/test/integration/data/input6.fasta
- seqmagick/test/integration/data/output2.nex
- + seqmagick/test/integration/data/output3.nex
- + seqmagick/test/integration/data/output4.nex
- seqmagick/test/integration/test_convert.py
- seqmagick/test/test_primer_trim.py
- seqmagick/test/test_subcommands_quality_filter.py
- seqmagick/test/test_transform.py
- seqmagick/transform.py
Changes:
=====================================
.travis.yml
=====================================
@@ -3,7 +3,7 @@ python:
# - "2.7"
# - "pypy"
# - "3.4"
- - "3.5"
+ # - "3.5"
- "3.6"
- "3.7"
- "3.8"
@@ -12,7 +12,7 @@ python:
# BioPython doesn't always play well with pip install.
install:
- "if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install -q numpy; fi"
- - "pip install -q biopython nose"
+ - "pip install -q biopython nose pygtrie"
- "pip install ."
script:
=====================================
README.rst
=====================================
@@ -19,7 +19,7 @@ there is one that takes arguments::
Requirements
============
-* Python >= 3.4
+* Python >= 3.5
* biopython >= 1.70
Installation
@@ -29,7 +29,7 @@ Use pip::
pip install seqmagick
-Note that as of version 0.7.0, this package requires Python 3.4+. If
+Note that as of version 0.8.0, this package requires Python 3.5+. If
you want to use the most recent version compatible with Python 2.7::
pip install seqmagick==0.6.2
=====================================
docs/changelog.rst
=====================================
@@ -1,6 +1,15 @@
Changes for seqmagick
=====================
+0.8.0
+-----
+
+* Supports Python 3.5+
+* Drops support for Python 3.4
+* Fix issue: "seqmagick with no params gives KeyError:None" [GH-77]
+* Fix for Biopython 1.71 dual coding support [GH-76]; also fixes issue: "Translation error with new BioPython" [GH-79]
+* Send logging to stderr, not stdout [GH-75]
+
0.7.0
-----
=====================================
docs/index.rst
=====================================
@@ -59,10 +59,10 @@ To install the bleeding edge version::
pip install git+https://github.com/fhcrc/seqmagick.git@master#egg-info=seqmagick
-Note that as of version 0.7.0, this package requires Python 3.4+. If
+Note that as of version 0.8.0, this package requires Python 3.5+. If
you want to use the most recent version compatible with Python 2.7::
- pip install seqmagick==1.6.2
+ pip install seqmagick==0.6.2
Use
===
=====================================
examples/apply-function/myfunctions.py
=====================================
@@ -14,6 +14,6 @@ def hash_starts_numeric(records):
their sha-1 hash.
"""
for record in records:
- seq_hash = hashlib.sha1(str(record.seq)).hexdigest()
+ seq_hash = hashlib.sha1(str(record.seq).encode('utf-8')).hexdigest()
if seq_hash[0].isdigit():
yield record
=====================================
requirements-rtd.txt
=====================================
@@ -1,2 +1,3 @@
biopython
sphinx
+pygtrie
=====================================
requirements.txt
=====================================
@@ -1,4 +1,5 @@
biopython>=1.70
+pygtrie
# for development
wheel
=====================================
seqmagick/subcommands/convert.py
=====================================
@@ -6,8 +6,7 @@ import functools
import logging
import random
-from Bio import Alphabet, SeqIO
-from Bio.Alphabet import IUPAC
+from Bio import SeqIO
from Bio.SeqIO import FastaIO
from seqmagick import transform
from seqmagick.fileformat import from_handle
@@ -15,11 +14,11 @@ from seqmagick.fileformat import from_handle
from . import common
ALPHABETS = {
- 'dna': Alphabet.generic_dna,
- 'dna-ambiguous': IUPAC.ambiguous_dna,
- 'protein': Alphabet.generic_protein,
- 'rna': Alphabet.generic_rna,
- 'rna-ambiguous': IUPAC.ambiguous_rna,
+ "dna": "DNA",
+ "dna-ambiguous": "DNA",
+ "rna": "RNA",
+ "rna-ambiguous": "RNA",
+ "protein": "protein",
}
def add_options(parser):
@@ -230,6 +229,13 @@ def build_parser(parser):
return parser
+
+def append_annotation_iterator(records_iterator, alphabet):
+ for record in records_iterator:
+ record.annotations["molecule_type"] = ALPHABETS[alphabet]
+ yield record
+
+
def transform_file(source_file, destination_file, arguments):
# Get just the file name, useful for naming the temporary file.
source_file_type = (arguments.input_format or from_handle(source_file))
@@ -249,8 +255,7 @@ def transform_file(source_file, destination_file, arguments):
direction=directions[direction])
else:
# Unsorted iterator.
- records = SeqIO.parse(source_file, source_file_type,
- alphabet=ALPHABETS.get(arguments.alphabet))
+ records = SeqIO.parse(source_file, source_file_type)
#########################################
@@ -315,6 +320,9 @@ def transform_file(source_file, destination_file, arguments):
# loading the entire sequence file up into memory.
logging.info("Applying transformations, writing to %s",
destination_file)
+ # Append datatype annotation, mandatory for Nexus files conversion.
+ if arguments.alphabet != None:
+ records = append_annotation_iterator(records, arguments.alphabet)
SeqIO.write(records, destination_file, destination_file_type)
=====================================
seqmagick/subcommands/primer_trim.py
=====================================
@@ -7,8 +7,7 @@ import logging
import operator
import sys
-from Bio import Alphabet, SeqIO, pairwise2
-from Bio.Alphabet import IUPAC
+from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from seqmagick import transform, fileformat
@@ -298,8 +297,7 @@ def action(arguments):
with arguments.source_file:
sequences = SeqIO.parse(
arguments.source_file,
- source_format,
- alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))
+ source_format)
# Locate primers
(forward_start, forward_end), (reverse_start, reverse_end) = locate_primers(
@@ -319,8 +317,7 @@ def action(arguments):
arguments.source_file.seek(0)
sequences = SeqIO.parse(
arguments.source_file,
- source_format,
- alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))
+ source_format)
# Apply the transformation
prune_action = _ACTIONS[arguments.prune_action]
=====================================
seqmagick/subcommands/quality_filter.py
=====================================
@@ -11,16 +11,29 @@ import sys
import time
from Bio import SeqIO
-try:
- from Bio import trie, triefind
-except ImportError:
- trie = None
- triefind = None
+import pygtrie as trie
from Bio.SeqIO import QualityIO
from seqmagick import fileformat, __version__
from .common import typed_range, FileType
+
+def trie_match(string, trie):
+ def has_prefix(teststring, trie):
+ for key in trie.keys():
+ if key.startswith(teststring):
+ return True
+ return False
+ longest = None
+ for i in range(len(string)):
+ substr = string[:i + 1]
+ if not has_prefix(substr, trie):
+ break
+ if trie.has_key(substr):
+ longest = substr
+ return longest
+
+
# Default minimummean quality score
DEFAULT_MEAN_SCORE = 25.0
@@ -626,7 +639,7 @@ class PrimerBarcodeFilter(BaseFilter):
self.trie = trie
def filter_record(self, record):
- m = triefind.match(str(record.seq), self.trie)
+ m = trie_match(str(record.seq), self.trie)
if m:
if self.listener:
self.listener(
@@ -646,7 +659,7 @@ def parse_barcode_file(fp, primer=None, header=False):
Any additional columns are ignored
"""
- tr = trie.trie()
+ tr = trie.StringTrie()
reader = csv.reader(fp)
if header:
@@ -680,10 +693,6 @@ def action(arguments):
raise ValueError("--quality-window-mean-qual specified without "
"--quality-window")
- if trie is None or triefind is None:
- raise ValueError(
- 'Missing Bio.trie and/or Bio.triefind modules. Cannot continue')
-
filters = []
input_type = fileformat.from_handle(arguments.sequence_file)
output_type = fileformat.from_handle(arguments.output_file)
=====================================
seqmagick/test/integration/data/input6.fasta
=====================================
@@ -0,0 +1,6 @@
+>test1 test sequence 1 RNA
+AC-GU
+>test2 test sequence 2
+A-AAA
+>test3 sequence 3
+A---A
=====================================
seqmagick/test/integration/data/output2.nex
=====================================
@@ -1,7 +1,7 @@
#NEXUS
begin data;
- dimensions ntax=3 nchar=5;
- format datatype=dna missing=? gap=-;
+dimensions ntax=3 nchar=5;
+format datatype=dna missing=? gap=-;
matrix
test1 AC-GT
test2 A-AAA
=====================================
seqmagick/test/integration/data/output3.nex
=====================================
@@ -0,0 +1,10 @@
+#NEXUS
+begin data;
+dimensions ntax=3 nchar=5;
+format datatype=rna missing=? gap=-;
+matrix
+test1 AC-GU
+test2 A-AAA
+test3 A---A
+;
+end;
=====================================
seqmagick/test/integration/data/output4.nex
=====================================
@@ -0,0 +1,10 @@
+#NEXUS
+begin data;
+dimensions ntax=3 nchar=5;
+format datatype=protein missing=? gap=-;
+matrix
+test1 AC-GT
+test2 A-AAA
+test3 A---A
+;
+end;
=====================================
seqmagick/test/integration/test_convert.py
=====================================
@@ -97,6 +97,20 @@ class ConvertToNexusTestCase(CommandLineTestMixIn, unittest.TestCase):
command = 'convert {input} {output} --output-format nexus --alphabet dna-ambiguous'
+class ConvertToNexusRNATestCase(CommandLineTestMixIn, unittest.TestCase):
+ in_suffix = '.fasta'
+ input_path = p('input6.fasta')
+ expected_path = p('output3.nex')
+ command = 'convert {input} {output} --output-format nexus --alphabet rna'
+
+
+class ConvertToNexusProteinTestCase(CommandLineTestMixIn, unittest.TestCase):
+ in_suffix = '.fasta'
+ input_path = p('input2.fasta')
+ expected_path = p('output4.nex')
+ command = 'convert {input} {output} --output-format nexus --alphabet protein'
+
+
class ConvertUngapCutTestCase(CommandLineTestMixIn, unittest.TestCase):
in_suffix = '.fasta'
out_suffix = '.fasta'
=====================================
seqmagick/test/test_primer_trim.py
=====================================
@@ -3,7 +3,6 @@ Tests for primer trim
"""
import unittest
-from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
@@ -70,8 +69,7 @@ class HammingDistanceTestCase(unittest.TestCase):
def _alignment_record(sequence):
- return SeqRecord(
- Seq(sequence, alphabet=Alphabet.Gapped(Alphabet.generic_dna)))
+ return SeqRecord(Seq(sequence))
class LocatePrimersTestCase(unittest.TestCase):
=====================================
seqmagick/test/test_subcommands_quality_filter.py
=====================================
@@ -7,8 +7,6 @@ from Bio.SeqRecord import SeqRecord
from seqmagick.subcommands import quality_filter
-from Bio import triefind
-
IS_PYPY = hasattr(sys, 'pypy_version_info')
@@ -194,7 +192,6 @@ class MaxLengthFilterTestCase(unittest.TestCase):
[i.id for i in actual])
-@unittest.skipIf(IS_PYPY, "Bio.trie not available on pypy.")
class PrimerBarcodeFilterTestCase(unittest.TestCase):
def setUp(self):
self.sequences = [
@@ -240,7 +237,6 @@ class RecordEventListenerTestCase(unittest.TestCase):
self.assertEqual(events, [1, 5])
-@unittest.skipIf(IS_PYPY, "Bio.trie not available on pypy.")
class BarcodePrimerTrieTestCase(unittest.TestCase):
def setUp(self):
self.barcode_str = """p1d1bc205,TACTAGCG,CATTGCCTATG
@@ -258,9 +254,9 @@ p1d1bc213,TACGAGAC,CAYGGCTA"""
res = quality_filter.parse_barcode_file(self.fp, primer='CATTGCCTATG')
self.assertEqual(9, len(list(res.keys())))
self.assertEqual('p1d1bc210', res['TACAGTCGCATTGCCTATG'])
- self.assertEqual(None, triefind.match('TACAGTCGCATTGCCTAT', res))
+ self.assertEqual(None, quality_filter.trie_match('TACAGTCGCATTGCCTAT', res))
self.assertEqual('TACAGTCGCATTGCCTATG',
- triefind.match('TACAGTCGCATTGCCTATGCTACCTA', res))
+ quality_filter.trie_match('TACAGTCGCATTGCCTATGCTACCTA', res))
def test_primer_in_file(self):
res = quality_filter.parse_barcode_file(self.fp, primer=None)
=====================================
seqmagick/test/test_transform.py
=====================================
@@ -7,7 +7,7 @@ import functools
import logging
import unittest
-from Bio import Alphabet, SeqIO
+from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
@@ -16,16 +16,13 @@ from seqmagick import transform
logging.basicConfig(level=logging.FATAL)
def _alignment_record(sequence):
- return SeqRecord(Seq(sequence,
- alphabet=Alphabet.Gapped(Alphabet.generic_dna)))
+ return SeqRecord(Seq(sequence))
-def seqrecord(sequence_id, sequence_text, alphabet=Alphabet.generic_dna,
- description=None):
+def seqrecord(sequence_id, sequence_text, description=None):
"""
Quick shortcut to make a SeqRecord
"""
- record = SeqRecord(Seq(sequence_text, alphabet),
- id=sequence_id)
+ record = SeqRecord(Seq(sequence_text), id=sequence_id)
if description:
record.description = description
return record
=====================================
seqmagick/transform.py
=====================================
@@ -13,7 +13,6 @@ import tempfile
import random
from Bio import SeqIO
-from Bio.Alphabet import IUPAC
from Bio.Data import CodonTable
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
@@ -63,8 +62,7 @@ def dashes_cleanup(records, prune_chars='.:?~'):
"Applying _dashes_cleanup: converting any of '{}' to '-'.".format(prune_chars))
translation_table = {ord(c): '-' for c in prune_chars}
for record in records:
- record.seq = Seq(str(record.seq).translate(translation_table),
- record.seq.alphabet)
+ record.seq = Seq(str(record.seq).translate(translation_table))
yield record
@@ -169,8 +167,7 @@ def isolate_region(sequences, start, end, gap_char='-'):
seq = sequence.seq
start_gap = gap_char * start
end_gap = gap_char * (len(seq) - end)
- seq = Seq(start_gap + str(seq[start:end]) + end_gap,
- alphabet=seq.alphabet)
+ seq = Seq(start_gap + str(seq[start:end]) + end_gap)
sequence.seq = seq
yield sequence
@@ -191,7 +188,7 @@ def drop_columns(records, slices):
drop = set(i for slice in slices
for i in range(*slice.indices(len(record))))
keep = [i not in drop for i in range(len(record))]
- record.seq = Seq(''.join(itertools.compress(record.seq, keep)), record.seq.alphabet)
+ record.seq = Seq(''.join(itertools.compress(record.seq, keep)))
yield record
def multi_cut_sequences(records, slices):
@@ -690,7 +687,6 @@ def translate(records, translate):
to_stop = translate.endswith('stop')
source_type = translate[:3]
- alphabet = {'dna': IUPAC.ambiguous_dna, 'rna': IUPAC.ambiguous_rna}[source_type]
# Get a translation table
table = {'dna': CodonTable.ambiguous_dna_by_name["Standard"],
@@ -704,7 +700,7 @@ def translate(records, translate):
for record in records:
sequence = str(record.seq)
- seq = Seq(sequence, alphabet)
+ seq = Seq(sequence)
protein = seq.translate(table, to_stop=to_stop)
yield SeqRecord(protein, id=record.id, description=record.description)
View it on GitLab: https://salsa.debian.org/med-team/seqmagick/-/commit/3e36ac3bf6d53643ba93cafda0481629d1b4b6c3
--
View it on GitLab: https://salsa.debian.org/med-team/seqmagick/-/commit/3e36ac3bf6d53643ba93cafda0481629d1b4b6c3
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201114/00aa2561/attachment-0001.html>
More information about the debian-med-commit mailing list