[med-svn] [Git][med-team/python-nanomath][upstream] 2 commits: New upstream version 1.0.0
Steffen Möller
gitlab at salsa.debian.org
Tue Sep 1 18:32:13 BST 2020
Steffen Möller pushed to branch upstream at Debian Med / python-nanomath
Commits:
fe1a2b9e by Steffen Moeller at 2020-08-21T21:59:14+02:00
New upstream version 1.0.0
- - - - -
a6312d87 by Steffen Moeller at 2020-09-01T19:27:53+02:00
New upstream version 1.0.1+ds
- - - - -
8 changed files:
- PKG-INFO
- README.rst
- nanomath.egg-info/PKG-INFO
- − nanomath.egg-info/SOURCES.txt
- − nanomath.egg-info/requires.txt
- nanomath/nanomath.py
- nanomath/version.py
- setup.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,11 +1,11 @@
Metadata-Version: 1.2
Name: nanomath
-Version: 0.23.2
+Version: 1.0.1
Summary: A few simple math function for other Oxford Nanopore processing scripts
Home-page: https://github.com/wdecoster/nanomath
Author: Wouter De Coster
Author-email: decosterwouter at gmail.com
-License: MIT
+License: GPLv3
Description: # nanomath
This module provides a few simple math and statistics functions for other scripts processing Oxford Nanopore sequencing data
=====================================
README.rst
=====================================
@@ -24,14 +24,14 @@ INSTALLATION
.. code:: bash
- pip install nanomath
+ pip install nanomath
| or
| |install with conda|
::
- conda install -c bioconda nanomath
+ conda install -c bioconda nanomath
STATUS
------
=====================================
nanomath.egg-info/PKG-INFO
=====================================
@@ -1,11 +1,11 @@
Metadata-Version: 1.2
Name: nanomath
-Version: 0.23.2
+Version: 1.0.1
Summary: A few simple math function for other Oxford Nanopore processing scripts
Home-page: https://github.com/wdecoster/nanomath
Author: Wouter De Coster
Author-email: decosterwouter at gmail.com
-License: MIT
+License: GPLv3
Description: # nanomath
This module provides a few simple math and statistics functions for other scripts processing Oxford Nanopore sequencing data
=====================================
nanomath.egg-info/SOURCES.txt deleted
=====================================
@@ -1,15 +0,0 @@
-LICENSE
-MANIFEST.in
-README.md
-README.rst
-setup.cfg
-setup.py
-nanomath/__init__.py
-nanomath/nanomath.py
-nanomath/test_nanomath.py
-nanomath/version.py
-nanomath.egg-info/PKG-INFO
-nanomath.egg-info/SOURCES.txt
-nanomath.egg-info/dependency_links.txt
-nanomath.egg-info/requires.txt
-nanomath.egg-info/top_level.txt
\ No newline at end of file
=====================================
nanomath.egg-info/requires.txt deleted
=====================================
@@ -1,2 +0,0 @@
-pandas
-numpy>1.8
=====================================
nanomath/nanomath.py
=====================================
@@ -20,14 +20,16 @@ calc_read_stats(dataframe)
"""
import numpy as np
-from math import log
import sys
+from deprecated import deprecated
+from math import log
class Stats(object):
def __init__(self, df):
self.number_of_reads = len(df)
self.number_of_bases = np.sum(df["lengths"])
+ self._with_readIDs = "readIDs" in df
if "aligned_lengths" in df:
self.number_of_bases_aligned = np.sum(df["aligned_lengths"])
self.median_read_length = np.median(df["lengths"])
@@ -39,16 +41,71 @@ class Stats(object):
if "channelIDs" in df:
self.active_channels = np.unique(df["channelIDs"]).size
if "quals" in df:
- self.qualgroups = [5, 7, 10, 12, 15] # needs 5 elements in current implementation
+ self._qualgroups = [5, 7, 10, 12, 15] # needs 5 elements in current implementation
self.mean_qual = np.mean(df["quals"])
self.median_qual = np.median(df["quals"])
- self.top5_lengths = get_top_5(df=df,
- col="lengths",
- values=["lengths", "quals"])
- self.top5_quals = get_top_5(df=df,
- col="quals",
- values=["quals", "lengths"])
- self.reads_above_qual = [reads_above_qual(df, q) for q in self.qualgroups]
+ self._top5_lengths = get_top_5(df=df,
+ col="lengths",
+ values=["lengths", "quals"])
+ self._top5_quals = get_top_5(df=df,
+ col="quals",
+ values=["quals", "lengths"])
+ self._reads_above_qual = [reads_above_qual(df, q) for q in self._qualgroups]
+
+ def long_features_as_string(self):
+ """formatting long features to a string to print for legacy stats output"""
+ self.top5_lengths = self.long_feature_as_string_top5(self._top5_lengths)
+ self.top5_quals = self.long_feature_as_string_top5(self._top5_quals)
+ self.reads_above_qual = self.long_feature_as_string_above_qual(self._reads_above_qual)
+
+ def long_feature_as_string_top5(self, field):
+ """for legacy stats output"""
+ if self._with_readIDs:
+ return [str(round(i, ndigits=1)) + " (" +
+ str(round(j, ndigits=1)) + "; " + k + ")" for i, j, k in field]
+ else:
+ return [str(round(i, ndigits=1)) + " (" +
+ str(round(j, ndigits=1)) + ")" for i, j in field]
+
+ def long_feature_as_string_above_qual(self, field):
+ """for legacy stats output"""
+ return [self.format_above_qual_line(entry) for entry in field]
+
+ def format_above_qual_line(self, entry):
+ """for legacy stats output"""
+ numberAboveQ, megAboveQ = entry
+ return "{} ({}%) {}Mb".format(numberAboveQ,
+ round(100 * (numberAboveQ / self.number_of_reads),
+ ndigits=1),
+ round(megAboveQ, ndigits=1))
+
+ def to_dict(self):
+ """for tsv stats output"""
+ statdict = self.__dict__
+ for key, value in statdict.items():
+ if not key.startswith('_'):
+ if not isinstance(value, int):
+ statdict[key] = '{:.1f}'.format(value)
+ self.unwind_long_features_top5(feature='_top5_lengths', name='longest_read_(with_Q)')
+ self.unwind_long_features_top5(feature='_top5_quals', name='highest_Q_read_(with_length)')
+ self.unwind_long_features_above_qual(feature='_reads_above_qual', name='Reads')
+ return {k: v for k, v in statdict.items() if not k.startswith('_')}
+
+ def unwind_long_features_top5(self, feature, name):
+ """for tsv stats output"""
+ for entry, label in zip(self.__dict__[feature], range(1, 6)):
+ self.__dict__[name + ':' + str(label)] = '{} ({})'.format(round(entry[0], ndigits=1),
+ round(entry[1], ndigits=1))
+
+ def unwind_long_features_above_qual(self, feature, name):
+ """for tsv stats output"""
+ for entry, label in zip(self.__dict__[feature],
+ ['>Q{}:'.format(q) for q in self._qualgroups]):
+ numberAboveQ, megAboveQ = entry
+ percentage = 100 * (numberAboveQ / float(self.number_of_reads))
+ self.__dict__[name + ' ' + label] = "{} ({}%) {}Mb".format(numberAboveQ,
+ round(percentage, ndigits=1),
+ round(megAboveQ, ndigits=1))
def get_N50(readlengths):
@@ -59,20 +116,19 @@ def get_N50(readlengths):
return readlengths[np.where(np.cumsum(readlengths) >= 0.5 * np.sum(readlengths))[0][0]]
+@deprecated
def remove_length_outliers(df, columnname):
"""Remove records with length-outliers above 3 standard deviations from the median."""
return df[df[columnname] < (np.median(df[columnname]) + 3 * np.std(df[columnname]))]
-def phred_to_percent(phred):
- return 100 * (1 - 10 ** (phred / -10))
-
-
+@deprecated
def errs_tab(n):
"""Generate list of error rates for qualities less than equal than n."""
return [10**(q / -10) for q in range(n+1)]
+@deprecated
def ave_qual(quals, qround=False, tab=errs_tab(128)):
"""Calculate average basecall quality of a read.
@@ -91,44 +147,22 @@ def ave_qual(quals, qround=False, tab=errs_tab(128)):
return None
-def median_qual(quals):
- """Receive the integer quality scores of a read and return the median quality for that read."""
- return np.median(quals)
-
-
def get_top_5(df, col, values):
if "readIDs" in df:
values.append("readIDs")
- res = df.sort_values(col, ascending=False) \
+ return df.sort_values(col, ascending=False) \
.head(5)[values] \
.reset_index(drop=True) \
.itertuples(index=False, name=None)
- if "readIDs" in df:
- return [str(round(i, ndigits=1)) + " (" +
- str(round(j, ndigits=1)) + "; " + k + ")" for i, j, k in res]
- else:
- return [str(round(i, ndigits=1)) + " (" +
- str(round(j, ndigits=1)) + ")" for i, j in res]
def reads_above_qual(df, qual):
numberAboveQ = np.sum(df["quals"] > qual)
megAboveQ = np.sum(df.loc[df["quals"] > qual, "lengths"]) / 1e6
- return "{} ({}%) {}Mb".format(numberAboveQ,
- round(100 * (numberAboveQ / len(df.index)), ndigits=1),
- round(megAboveQ, ndigits=1))
-
-
-def feature_list(stats, feature, index=None, padding=15):
- if index is None:
- return ' '.join(['{:>{},.1f}'.format(s.__dict__[feature], padding) for s in stats])
- else:
- return '\t'.join([str(s.__dict__[feature][index]) if len(s.__dict__[feature]) > index
- else "NA"
- for s in stats])
+ return numberAboveQ, megAboveQ
-def write_stats(datadfs, outputfile, names=[]):
+def write_stats(datadfs, outputfile, names=[], as_tsv=False):
"""Call calculation functions and write stats file.
This function takes a list of DataFrames,
@@ -140,6 +174,26 @@ def write_stats(datadfs, outputfile, names=[]):
output = open(outputfile, 'wt')
stats = [Stats(df) for df in datadfs]
+
+ if as_tsv:
+ import pandas as pd
+ df = pd.DataFrame([s.to_dict() for s in stats]).transpose()
+ df.index.name = 'Metrics'
+ if names:
+ df.columns = names
+ else:
+ df.columns = ['dataset']
+ output.write(df.to_csv(sep='\t'))
+ return df
+ else:
+ write_stats_legacy(stats, names, output, datadfs)
+
+
+def write_stats_legacy(stats, names, output, datadfs):
+ """
+ Legacy method to write out stats.
+ Will add padding to pretty print the table, and contain section headers
+ """
features = {
"Number of reads": "number_of_reads",
"Total bases": "number_of_bases",
@@ -170,16 +224,29 @@ def write_stats(datadfs, outputfile, names=[]):
except KeyError:
pass
if all(["quals" in df for df in datadfs]):
+ for s in stats:
+ s.long_features_as_string()
long_features = {
"Top 5 longest reads and their mean basecall quality score":
["top5_lengths", range(1, 6)],
"Top 5 highest mean basecall quality scores and their read lengths":
["top5_quals", range(1, 6)],
"Number, percentage and megabases of reads above quality cutoffs":
- ["reads_above_qual", [">Q" + str(q) for q in stats[0].qualgroups]],
+ ["reads_above_qual", [">Q" + str(q) for q in stats[0]._qualgroups]],
}
for lf in sorted(long_features.keys()):
output.write(lf + "\n")
- for i in range(5):
+ for index in range(5):
output.write("{}:\t{}\n".format(
- long_features[lf][1][i], feature_list(stats, long_features[lf][0], index=i)))
+ long_features[lf][1][index], feature_list(stats=stats,
+ feature=long_features[lf][0],
+ index=index)))
+
+
+def feature_list(stats, feature, index=None, padding=15):
+ if index is None:
+ return ' '.join(['{:>{},.1f}'.format(s.__dict__[feature], padding) for s in stats])
+ else:
+ return '\t'.join([str(s.__dict__[feature][index]) if len(s.__dict__[feature]) > index
+ else "NA"
+ for s in stats])
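The reworked nanomath.py keeps the raw values on the Stats object (the underscore-prefixed fields) and adds a to_dict()-based TSV path to write_stats() alongside the legacy padded report. A minimal usage sketch in Python, assuming a pandas DataFrame with the "lengths" and "quals" columns the code above expects; the read lengths, quality scores and output file names below are made up for illustration:

    import pandas as pd
    from nanomath.nanomath import write_stats

    # Hypothetical per-read data; "lengths" is required, "quals" enables the
    # quality-based metrics (top-5 tables, reads above quality cutoffs).
    df = pd.DataFrame({
        "lengths": [500, 1200, 15000, 800, 2300, 9100],
        "quals": [9.5, 11.2, 12.8, 7.3, 10.1, 13.4],
    })

    # New in 1.0.x: as_tsv=True routes everything through Stats.to_dict() and
    # writes a tab-separated "Metrics" table with one column per dataset name.
    write_stats(datadfs=[df], outputfile="NanoStats.tsv", names=["sample1"], as_tsv=True)

    # The default still produces the pretty-printed legacy report, now handled
    # by the separate write_stats_legacy() function.
    write_stats(datadfs=[df], outputfile="NanoStats.txt", names=["sample1"])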
=====================================
nanomath/version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.23.2"
+__version__ = "1.0.1"
=====================================
setup.py
=====================================
@@ -15,7 +15,7 @@ setup(
url='https://github.com/wdecoster/nanomath',
author='Wouter De Coster',
author_email='decosterwouter at gmail.com',
- license='MIT',
+ license='GPLv3',
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Science/Research',
@@ -29,6 +29,6 @@ setup(
keywords='nanopore sequencing plotting quality control',
packages=find_packages(),
python_requires='>=3',
- install_requires=['pandas', 'numpy>1.8', ],
+ install_requires=['pandas', 'numpy>1.8', 'Python-Deprecated'],
package_dir={'nanomath': 'nanomath'},
data_files=[("", ["LICENSE"])])
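The new Python-Deprecated entry in install_requires provides the importable "deprecated" module that nanomath.py now uses for its @deprecated decorators (the distribution name differs from the import name). A minimal sketch of what the decorator does, reusing the errs_tab() helper from the diff above; the warnings filter is only there to make the effect visible and is not part of nanomath:

    import warnings
    from deprecated import deprecated

    @deprecated  # same bare form as applied to errs_tab(), ave_qual() and remove_length_outliers() above
    def errs_tab(n):
        """Generate a list of error rates for qualities up to n."""
        return [10 ** (q / -10) for q in range(n + 1)]

    warnings.simplefilter("default", DeprecationWarning)  # make DeprecationWarnings visible
    tab = errs_tab(128)  # still returns the table, but now emits a DeprecationWarning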
View it on GitLab: https://salsa.debian.org/med-team/python-nanomath/-/compare/da27f0263a64e9de105a1696cdb1f8544fbe6214...a6312d8761c00a17a19e5bf4e550d0db413de77a