[med-svn] [Git][med-team/python-nanomath][upstream] 2 commits: New upstream version 1.0.0

Steffen Möller gitlab at salsa.debian.org
Tue Sep 1 18:32:13 BST 2020



Steffen Möller pushed to branch upstream at Debian Med / python-nanomath


Commits:
fe1a2b9e by Steffen Moeller at 2020-08-21T21:59:14+02:00
New upstream version 1.0.0
- - - - -
a6312d87 by Steffen Moeller at 2020-09-01T19:27:53+02:00
New upstream version 1.0.1+ds
- - - - -


8 changed files:

- PKG-INFO
- README.rst
- nanomath.egg-info/PKG-INFO
- − nanomath.egg-info/SOURCES.txt
- − nanomath.egg-info/requires.txt
- nanomath/nanomath.py
- nanomath/version.py
- setup.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,11 +1,11 @@
 Metadata-Version: 1.2
 Name: nanomath
-Version: 0.23.2
+Version: 1.0.1
 Summary: A few simple math function for other Oxford Nanopore processing scripts
 Home-page: https://github.com/wdecoster/nanomath
 Author: Wouter De Coster
 Author-email: decosterwouter@gmail.com
-License: MIT
+License: GPLv3
 Description: # nanomath
         This module provides a few simple math and statistics functions for other scripts processing Oxford Nanopore sequencing data
         


=====================================
README.rst
=====================================
@@ -24,14 +24,14 @@ INSTALLATION
 
 .. code:: bash
 
-    pip install nanomath
+   pip install nanomath
 
 | or
 | |install with conda|
 
 ::
 
-    conda install -c bioconda nanomath
+   conda install -c bioconda nanomath
 
 STATUS
 ------


=====================================
nanomath.egg-info/PKG-INFO
=====================================
@@ -1,11 +1,11 @@
 Metadata-Version: 1.2
 Name: nanomath
-Version: 0.23.2
+Version: 1.0.1
 Summary: A few simple math function for other Oxford Nanopore processing scripts
 Home-page: https://github.com/wdecoster/nanomath
 Author: Wouter De Coster
 Author-email: decosterwouter@gmail.com
-License: MIT
+License: GPLv3
 Description: # nanomath
         This module provides a few simple math and statistics functions for other scripts processing Oxford Nanopore sequencing data
         


=====================================
nanomath.egg-info/SOURCES.txt deleted
=====================================
@@ -1,15 +0,0 @@
-LICENSE
-MANIFEST.in
-README.md
-README.rst
-setup.cfg
-setup.py
-nanomath/__init__.py
-nanomath/nanomath.py
-nanomath/test_nanomath.py
-nanomath/version.py
-nanomath.egg-info/PKG-INFO
-nanomath.egg-info/SOURCES.txt
-nanomath.egg-info/dependency_links.txt
-nanomath.egg-info/requires.txt
-nanomath.egg-info/top_level.txt
\ No newline at end of file


=====================================
nanomath.egg-info/requires.txt deleted
=====================================
@@ -1,2 +0,0 @@
-pandas
-numpy>1.8


=====================================
nanomath/nanomath.py
=====================================
@@ -20,14 +20,16 @@ calc_read_stats(dataframe)
 """
 
 import numpy as np
-from math import log
 import sys
+from deprecated import deprecated
+from math import log
 
 
 class Stats(object):
     def __init__(self, df):
         self.number_of_reads = len(df)
         self.number_of_bases = np.sum(df["lengths"])
+        self._with_readIDs = "readIDs" in df
         if "aligned_lengths" in df:
             self.number_of_bases_aligned = np.sum(df["aligned_lengths"])
         self.median_read_length = np.median(df["lengths"])
@@ -39,16 +41,71 @@ class Stats(object):
         if "channelIDs" in df:
             self.active_channels = np.unique(df["channelIDs"]).size
         if "quals" in df:
-            self.qualgroups = [5, 7, 10, 12, 15]  # needs 5 elements in current implementation
+            self._qualgroups = [5, 7, 10, 12, 15]  # needs 5 elements in current implementation
             self.mean_qual = np.mean(df["quals"])
             self.median_qual = np.median(df["quals"])
-            self.top5_lengths = get_top_5(df=df,
-                                          col="lengths",
-                                          values=["lengths", "quals"])
-            self.top5_quals = get_top_5(df=df,
-                                        col="quals",
-                                        values=["quals", "lengths"])
-            self.reads_above_qual = [reads_above_qual(df, q) for q in self.qualgroups]
+            self._top5_lengths = get_top_5(df=df,
+                                           col="lengths",
+                                           values=["lengths", "quals"])
+            self._top5_quals = get_top_5(df=df,
+                                         col="quals",
+                                         values=["quals", "lengths"])
+            self._reads_above_qual = [reads_above_qual(df, q) for q in self._qualgroups]
+
+    def long_features_as_string(self):
+        """formatting long features to a string to print for legacy stats output"""
+        self.top5_lengths = self.long_feature_as_string_top5(self._top5_lengths)
+        self.top5_quals = self.long_feature_as_string_top5(self._top5_quals)
+        self.reads_above_qual = self.long_feature_as_string_above_qual(self._reads_above_qual)
+
+    def long_feature_as_string_top5(self, field):
+        """for legacy stats output"""
+        if self._with_readIDs:
+            return [str(round(i, ndigits=1)) + " (" +
+                    str(round(j, ndigits=1)) + "; " + k + ")" for i, j, k in field]
+        else:
+            return [str(round(i, ndigits=1)) + " (" +
+                    str(round(j, ndigits=1)) + ")" for i, j in field]
+
+    def long_feature_as_string_above_qual(self, field):
+        """for legacy stats output"""
+        return [self.format_above_qual_line(entry) for entry in field]
+
+    def format_above_qual_line(self, entry):
+        """for legacy stats output"""
+        numberAboveQ, megAboveQ = entry
+        return "{} ({}%) {}Mb".format(numberAboveQ,
+                                      round(100 * (numberAboveQ / self.number_of_reads),
+                                            ndigits=1),
+                                      round(megAboveQ, ndigits=1))
+
+    def to_dict(self):
+        """for tsv stats output"""
+        statdict = self.__dict__
+        for key, value in statdict.items():
+            if not key.startswith('_'):
+                if not isinstance(value, int):
+                    statdict[key] = '{:.1f}'.format(value)
+        self.unwind_long_features_top5(feature='_top5_lengths', name='longest_read_(with_Q)')
+        self.unwind_long_features_top5(feature='_top5_quals', name='highest_Q_read_(with_length)')
+        self.unwind_long_features_above_qual(feature='_reads_above_qual', name='Reads')
+        return {k: v for k, v in statdict.items() if not k.startswith('_')}
+
+    def unwind_long_features_top5(self, feature, name):
+        """for tsv stats output"""
+        for entry, label in zip(self.__dict__[feature], range(1, 6)):
+            self.__dict__[name + ':' + str(label)] = '{} ({})'.format(round(entry[0], ndigits=1),
+                                                                      round(entry[1], ndigits=1))
+
+    def unwind_long_features_above_qual(self, feature, name):
+        """for tsv stats output"""
+        for entry, label in zip(self.__dict__[feature],
+                                ['>Q{}:'.format(q) for q in self._qualgroups]):
+            numberAboveQ, megAboveQ = entry
+            percentage = 100 * (numberAboveQ / float(self.number_of_reads))
+            self.__dict__[name + ' ' + label] = "{} ({}%) {}Mb".format(numberAboveQ,
+                                                                       round(percentage, ndigits=1),
+                                                                       round(megAboveQ, ndigits=1))
 
 
 def get_N50(readlengths):
@@ -59,20 +116,19 @@ def get_N50(readlengths):
     return readlengths[np.where(np.cumsum(readlengths) >= 0.5 * np.sum(readlengths))[0][0]]
 
 
+@deprecated
 def remove_length_outliers(df, columnname):
     """Remove records with length-outliers above 3 standard deviations from the median."""
     return df[df[columnname] < (np.median(df[columnname]) + 3 * np.std(df[columnname]))]
 
 
-def phred_to_percent(phred):
-    return 100 * (1 - 10 ** (phred / -10))
-
-
+@deprecated
 def errs_tab(n):
     """Generate list of error rates for qualities less than equal than n."""
     return [10**(q / -10) for q in range(n+1)]
 
 
+@deprecated
 def ave_qual(quals, qround=False, tab=errs_tab(128)):
     """Calculate average basecall quality of a read.
 
@@ -91,44 +147,22 @@ def ave_qual(quals, qround=False, tab=errs_tab(128)):
         return None
 
 
-def median_qual(quals):
-    """Receive the integer quality scores of a read and return the median quality for that read."""
-    return np.median(quals)
-
-
 def get_top_5(df, col, values):
     if "readIDs" in df:
         values.append("readIDs")
-    res = df.sort_values(col, ascending=False) \
+    return df.sort_values(col, ascending=False) \
         .head(5)[values] \
         .reset_index(drop=True) \
         .itertuples(index=False, name=None)
-    if "readIDs" in df:
-        return [str(round(i, ndigits=1)) + " (" +
-                str(round(j, ndigits=1)) + "; " + k + ")" for i, j, k in res]
-    else:
-        return [str(round(i, ndigits=1)) + " (" +
-                str(round(j, ndigits=1)) + ")" for i, j in res]
 
 
 def reads_above_qual(df, qual):
     numberAboveQ = np.sum(df["quals"] > qual)
     megAboveQ = np.sum(df.loc[df["quals"] > qual, "lengths"]) / 1e6
-    return "{} ({}%) {}Mb".format(numberAboveQ,
-                                  round(100 * (numberAboveQ / len(df.index)), ndigits=1),
-                                  round(megAboveQ, ndigits=1))
-
-
-def feature_list(stats, feature, index=None, padding=15):
-    if index is None:
-        return ' '.join(['{:>{},.1f}'.format(s.__dict__[feature], padding) for s in stats])
-    else:
-        return '\t'.join([str(s.__dict__[feature][index]) if len(s.__dict__[feature]) > index
-                          else "NA"
-                          for s in stats])
+    return numberAboveQ, megAboveQ
 
 
-def write_stats(datadfs, outputfile, names=[]):
+def write_stats(datadfs, outputfile, names=[], as_tsv=False):
     """Call calculation functions and write stats file.
 
     This function takes a list of DataFrames,
@@ -140,6 +174,26 @@ def write_stats(datadfs, outputfile, names=[]):
         output = open(outputfile, 'wt')
 
     stats = [Stats(df) for df in datadfs]
+
+    if as_tsv:
+        import pandas as pd
+        df = pd.DataFrame([s.to_dict() for s in stats]).transpose()
+        df.index.name = 'Metrics'
+        if names:
+            df.columns = names
+        else:
+            df.columns = ['dataset']
+        output.write(df.to_csv(sep='\t'))
+        return df
+    else:
+        write_stats_legacy(stats, names, output, datadfs)
+
+
+def write_stats_legacy(stats, names, output, datadfs):
+    """
+    Legacy method to write out stats.
+    Will add padding to pretty print the table, and contain section headers
+    """
     features = {
         "Number of reads": "number_of_reads",
         "Total bases": "number_of_bases",
@@ -170,16 +224,29 @@ def write_stats(datadfs, outputfile, names=[]):
         except KeyError:
             pass
     if all(["quals" in df for df in datadfs]):
+        for s in stats:
+            s.long_features_as_string()
         long_features = {
             "Top 5 longest reads and their mean basecall quality score":
             ["top5_lengths", range(1, 6)],
             "Top 5 highest mean basecall quality scores and their read lengths":
             ["top5_quals", range(1, 6)],
             "Number, percentage and megabases of reads above quality cutoffs":
-            ["reads_above_qual", [">Q" + str(q) for q in stats[0].qualgroups]],
+            ["reads_above_qual", [">Q" + str(q) for q in stats[0]._qualgroups]],
         }
         for lf in sorted(long_features.keys()):
             output.write(lf + "\n")
-            for i in range(5):
+            for index in range(5):
                 output.write("{}:\t{}\n".format(
-                    long_features[lf][1][i], feature_list(stats, long_features[lf][0], index=i)))
+                    long_features[lf][1][index], feature_list(stats=stats,
+                                                              feature=long_features[lf][0],
+                                                              index=index)))
+
+
+def feature_list(stats, feature, index=None, padding=15):
+    if index is None:
+        return ' '.join(['{:>{},.1f}'.format(s.__dict__[feature], padding) for s in stats])
+    else:
+        return '\t'.join([str(s.__dict__[feature][index]) if len(s.__dict__[feature]) > index
+                          else "NA"
+                          for s in stats])


=====================================
nanomath/version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.23.2"
+__version__ = "1.0.1"


=====================================
setup.py
=====================================
@@ -15,7 +15,7 @@ setup(
     url='https://github.com/wdecoster/nanomath',
     author='Wouter De Coster',
     author_email='decosterwouter@gmail.com',
-    license='MIT',
+    license='GPLv3',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Science/Research',
@@ -29,6 +29,6 @@ setup(
     keywords='nanopore sequencing plotting quality control',
     packages=find_packages(),
     python_requires='>=3',
-    install_requires=['pandas', 'numpy>1.8', ],
+    install_requires=['pandas', 'numpy>1.8', 'Python-Deprecated'],
     package_dir={'nanomath': 'nanomath'},
     data_files=[("", ["LICENSE"])])



View it on GitLab: https://salsa.debian.org/med-team/python-nanomath/-/compare/da27f0263a64e9de105a1696cdb1f8544fbe6214...a6312d8761c00a17a19e5bf4e550d0db413de77a

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-nanomath/-/compare/da27f0263a64e9de105a1696cdb1f8544fbe6214...a6312d8761c00a17a19e5bf4e550d0db413de77a
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200901/c58ecee3/attachment-0001.html>


More information about the debian-med-commit mailing list