[med-svn] [Git][med-team/fastaq][upstream] New upstream version 3.17.0
Sascha Steinbiss
gitlab at salsa.debian.org
Thu Feb 22 10:06:08 UTC 2018
Sascha Steinbiss pushed to branch upstream at Debian Med / fastaq
Commits:
e4dd7b7f by Sascha Steinbiss at 2018-02-22T10:22:16+01:00
New upstream version 3.17.0
- - - - -
3 changed files:
- pyfastaq/sequences.py
- pyfastaq/tests/sequences_test.py
- setup.py
Changes:
=====================================
pyfastaq/sequences.py
=====================================
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -1,9 +1,8 @@
import copy
import re
-import string
import random
import itertools
-
+from collections import Counter
from pyfastaq import utils, intervals, genetic_codes
class Error (Exception): pass
@@ -465,6 +464,47 @@ class Fasta:
'''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
+ def gc_content(self, as_decimal=True):
+ """Returns the GC content for the sequence.
+ Notes:
+ This method ignores N when calculating the length of the sequence.
+ It does not, however, ignore other ambiguous bases. Of the ambiguous
+ bases, only S (G or C) is counted as GC. In this sense the method is
+ conservative with its calculation.
+
+ Args:
+ as_decimal (bool): Return the result as a decimal. Setting to False
+ will return as a percentage, e.g. for the sequence GCAT it will
+ return 0.5 by default and 50.0 if set to False.
+
+ Returns:
+ float: GC content calculated as the number of G, C, and S divided
+ by the number of (non-N) bases (length).
+
+ """
+ gc_total = 0.0
+ num_bases = 0.0
+ n_tuple = tuple('nN')
+ accepted_bases = tuple('cCgGsS')
+
+ # Counter tallies each distinct character (case-sensitively); both cases are listed in the tuples above.
+ for base, count in Counter(self.seq).items():
+
+ # don't count N in the number of bases
+ if base not in n_tuple:
+ num_bases += count
+
+ if base in accepted_bases: # S is a G or C
+ gc_total += count
+
+ gc_content = gc_total / num_bases
+
+ if not as_decimal: # return as percentage
+ gc_content *= 100
+
+ return gc_content
+
+
class Embl(Fasta):
'''Exactly the same as Fasta, but reading seqs from a file works differently'''
=====================================
pyfastaq/tests/sequences_test.py
=====================================
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -520,6 +520,19 @@ class TestFasta(unittest.TestCase):
fa = sequences.Fasta('name', 'A')
fa.split_capillary_id()
+ def test_gc_content(self):
+ """Test GC content calculation works as expected"""
+ tests = [
+ (sequences.Fasta('ID', 'cgCG'), 1.0),
+ (sequences.Fasta('ID', 'tTaA'), 0.0),
+ (sequences.Fasta('ID', 'GCAT'), 0.5),
+ (sequences.Fasta('ID', 'GCATNN'), 0.5),
+ (sequences.Fasta('ID', 'GCATNNS'), 0.6),
+ (sequences.Fasta('ID', 'GCATNNSK'), 0.5)
+ ]
+ for test, answer in tests:
+ self.assertAlmostEqual(test.gc_content(), answer)
+ self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100)
class TestEmbl(unittest.TestCase):
def test_get_id_from_header_line(self):
=====================================
setup.py
=====================================
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
setup(
name='pyfastaq',
- version='3.16.0',
+ version='3.17.0',
description='Script to manipulate FASTA and FASTQ files, plus API for developers',
packages = find_packages(),
author='Martin Hunt',
View it on GitLab: https://salsa.debian.org/med-team/fastaq/commit/e4dd7b7f6d4f01fc0b193e609338be560d7e3bb8
---
View it on GitLab: https://salsa.debian.org/med-team/fastaq/commit/e4dd7b7f6d4f01fc0b193e609338be560d7e3bb8
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alioth.debian.org/pipermail/debian-med-commit/attachments/20180222/e17ff1f1/attachment-0001.html>
More information about the debian-med-commit
mailing list