[med-svn] [Git][med-team/python-nanoget][master] 3 commits: New upstream version 1.16.1
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Mon Nov 1 21:35:19 GMT 2021
Nilesh Patra pushed to branch master at Debian Med / python-nanoget
Commits:
803c8e5f by Nilesh Patra at 2021-11-02T03:00:52+05:30
New upstream version 1.16.1
- - - - -
ce579a12 by Nilesh Patra at 2021-11-02T03:00:52+05:30
Update upstream source from tag 'upstream/1.16.1'
Update to upstream version '1.16.1'
with Debian dir 873858bf6aa8b921c86ff2b2e2d3c8943baf06c6
- - - - -
bf5011c7 by Nilesh Patra at 2021-11-02T03:01:27+05:30
Upload to unstable
- - - - -
9 changed files:
- − .travis.yml
- debian/changelog
- nanoget/extraction_functions.py
- nanoget/nanoget.py
- nanoget/utils.py
- nanoget/version.py
- + scripts/create_feather.py
- scripts/test.py
- setup.py
Changes:
=====================================
.travis.yml deleted
=====================================
@@ -1,23 +0,0 @@
-language: python
-
-python:
- - "3.6"
- - "3.7"
- - "3.8"
-
-before_install:
- - cp README.md README.rst
- - pip install flake8
-
-install:
- - pip install -e .
-
-script:
- - bash scripts/test.sh
- - flake8 nanoget/nanoget.py
-
-notifications:
- email: false
- webhooks:
- urls:
- - https://webhooks.gitter.im/e/4b1c45cea6826ce475c2
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+python-nanoget (1.16.1-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version 1.16.1
+
+ -- Nilesh Patra <nilesh at debian.org> Tue, 02 Nov 2021 03:01:08 +0530
+
python-nanoget (1.12.2-5) unstable; urgency=medium
* Team Upload.
=====================================
nanoget/extraction_functions.py
=====================================
@@ -4,10 +4,10 @@ import nanoget.utils as ut
import pandas as pd
import sys
import pysam
-import nanomath
import re
from Bio import SeqIO
import concurrent.futures as cfutures
+from itertools import repeat
def process_summary(summaryfile, **kwargs):
@@ -128,7 +128,7 @@ def process_ubam(bam, **kwargs):
samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
logging.info("Nanoget: No index for bam file could be found, created index.")
datadf = pd.DataFrame(
- data=[(read.query_name, nanomath.ave_qual(read.query_qualities), read.query_length)
+ data=[(read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
for read in samfile.fetch(until_eof=True)],
columns=["readIDs", "quals", "lengths"]) \
.dropna(axis='columns', how='all') \
@@ -154,20 +154,29 @@ def process_bam(bam, **kwargs):
logging.info("Nanoget: Starting to collect statistics from bam file {}.".format(bam))
samfile = check_bam(bam)
chromosomes = samfile.references
- if len(chromosomes) > 100:
- params = [(bam, None)]
- logging.info("Nanoget: lots of contigs (>100), not running in separate processes")
- else:
- params = zip([bam] * len(chromosomes), chromosomes)
- with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
+ if len(chromosomes) > 100 or kwargs["huge"]:
+ logging.info("Nanoget: lots of contigs (>100) or --huge, not running in separate processes")
datadf = pd.DataFrame(
- data=[res for sublist in executor.map(extract_from_bam, params) for res in sublist],
+ data=extract_from_bam(bam, None, kwargs["keep_supp"]),
columns=["readIDs", "quals", "aligned_quals", "lengths",
"aligned_lengths", "mapQ", "percentIdentity"]) \
.dropna(axis='columns', how='all') \
.dropna(axis='index', how='any')
- logging.info("Nanoget: bam {} contains {} primary alignments.".format(
- bam, datadf["lengths"].size))
+
+ else:
+ unit = chromosomes
+ with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
+ datadf = pd.DataFrame(
+ data=[res for sublist in executor.map(extract_from_bam,
+ repeat(bam),
+ unit,
+ repeat(kwargs["keep_supp"]))
+ for res in sublist],
+ columns=["readIDs", "quals", "aligned_quals", "lengths",
+ "aligned_lengths", "mapQ", "percentIdentity"]) \
+ .dropna(axis='columns', how='all') \
+ .dropna(axis='index', how='any')
+ logging.info(f"Nanoget: bam {bam} contains {datadf['lengths'].size} primary alignments.")
return ut.reduce_memory_usage(datadf)
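
A note on the dispatch above: extract_from_bam used to take a single (bam, chromosome) tuple, so callers had to zip the arguments together; the new code instead passes each argument as its own iterable to executor.map, with itertools.repeat supplying the values that stay constant across chromosomes. A minimal, self-contained sketch of that pattern (the worker and file name below are placeholders, not the nanoget functions):

import concurrent.futures as cfutures
from itertools import repeat

def worker(path, chromosome, keep_supp):
    # stand-in for extract_from_bam: returns one result list per chromosome
    return [(path, chromosome, keep_supp)]

if __name__ == '__main__':
    chromosomes = ["chr1", "chr2", "chr3"]
    with cfutures.ProcessPoolExecutor(max_workers=2) as executor:
        # map() zips the three iterables and stops at the shortest one,
        # so the infinite repeat() iterators are bounded by `chromosomes`
        results = [res
                   for sublist in executor.map(worker,
                                               repeat("reads.bam"),
                                               chromosomes,
                                               repeat(True))
                   for res in sublist]
    print(results)
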
@@ -187,20 +196,25 @@ def process_cram(cram, **kwargs):
logging.info("Nanoget: Starting to collect statistics from cram file {}.".format(cram))
samfile = check_bam(cram, samtype="cram")
chromosomes = samfile.references
- params = zip([cram] * len(chromosomes), chromosomes)
+ if len(chromosomes) > 100:
+ unit = [None]
+ logging.info("Nanoget: lots of contigs (>100), not running in separate processes")
+ else:
+ unit = chromosomes
with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
datadf = pd.DataFrame(
- data=[res for sublist in executor.map(extract_from_bam, params) for res in sublist],
+ data=[res for sublist in executor.map(extract_from_bam,
+ repeat(cram), unit, repeat(kwargs["keep_supp"]))
+ for res in sublist],
columns=["readIDs", "quals", "aligned_quals", "lengths",
"aligned_lengths", "mapQ", "percentIdentity"]) \
.dropna(axis='columns', how='all') \
.dropna(axis='index', how='any')
- logging.info("Nanoget: cram {} contains {} primary alignments.".format(
- cram, datadf["lengths"].size))
+ logging.info(f"Nanoget: cram {cram} contains {datadf['lengths'].size} primary alignments.")
return ut.reduce_memory_usage(datadf)
-def extract_from_bam(params):
+def extract_from_bam(bam, chromosome, keep_supplementary=True):
"""Extracts metrics from bam.
Worker function per chromosome
@@ -212,18 +226,29 @@ def extract_from_bam(params):
-mapping qualities
-edit distances to the reference genome scaled by read length
"""
- bam, chromosome = params
samfile = pysam.AlignmentFile(bam, "rb")
- return [
- (read.query_name,
- nanomath.ave_qual(read.query_qualities),
- nanomath.ave_qual(read.query_alignment_qualities),
- read.query_length,
- read.query_alignment_length,
- read.mapping_quality,
- get_pID(read))
- for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
- if not read.is_secondary and not read.is_unmapped]
+ if keep_supplementary:
+ return [
+ (read.query_name,
+ ut.ave_qual(read.query_qualities),
+ ut.ave_qual(read.query_alignment_qualities),
+ read.query_length,
+ read.query_alignment_length,
+ read.mapping_quality,
+ get_pID(read))
+ for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
+ if not read.is_secondary and not read.is_unmapped]
+ else:
+ return [
+ (read.query_name,
+ ut.ave_qual(read.query_qualities),
+ ut.ave_qual(read.query_alignment_qualities),
+ read.query_length,
+ read.query_alignment_length,
+ read.mapping_quality,
+ get_pID(read))
+ for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
+ if not read.is_secondary and not read.is_unmapped and not read.is_supplementary]
def get_pID(read):
@@ -310,7 +335,7 @@ def extract_from_fastq(fq):
Return average quality and read length
"""
for rec in SeqIO.parse(fq, "fastq"):
- yield nanomath.ave_qual(rec.letter_annotations["phred_quality"]), len(rec)
+ yield ut.ave_qual(rec.letter_annotations["phred_quality"]), len(rec)
def stream_fastq_full(fastq, threads):
@@ -336,8 +361,8 @@ def extract_all_from_fastq(rec):
"""
return (rec.id,
len(rec),
- nanomath.ave_qual(rec.letter_annotations["phred_quality"]),
- nanomath.median_qual(rec.letter_annotations["phred_quality"]))
+ ut.ave_qual(rec.letter_annotations["phred_quality"]),
+ None)
def info_to_dict(info):
@@ -364,7 +389,7 @@ def process_fastq_rich(fastq, **kwargs):
try:
read_info = info_to_dict(record.description)
res.append(
- (nanomath.ave_qual(record.letter_annotations["phred_quality"]),
+ (ut.ave_qual(record.letter_annotations["phred_quality"]),
len(record),
read_info["ch"],
read_info["start_time"],
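
For reference, the two comprehensions in the new extract_from_bam differ only in the supplementary-alignment check; the read filter boils down to a single predicate over pysam's alignment flags. A sketch of that predicate factored out as a standalone function (this factoring is illustrative, not what upstream ships):

def is_wanted(read, keep_supplementary=True):
    """Keep primary, mapped reads; optionally drop supplementary alignments."""
    if read.is_secondary or read.is_unmapped:
        return False
    if not keep_supplementary and read.is_supplementary:
        return False
    return True

# usage sketch, assuming an indexed "reads.bam" is available:
# import pysam
# samfile = pysam.AlignmentFile("reads.bam", "rb")
# primary_only = [r for r in samfile.fetch() if is_wanted(r, keep_supplementary=False)]
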
=====================================
nanoget/nanoget.py
=====================================
@@ -27,7 +27,7 @@ import nanoget.extraction_functions as ex
def get_input(source, files, threads=4, readtype="1D",
- combine="simple", names=None, barcoded=False, huge=False):
+ combine="simple", names=None, barcoded=False, huge=False, keep_supp=True):
"""Get input and process accordingly.
Data can be:
@@ -79,13 +79,17 @@ def get_input(source, files, threads=4, readtype="1D",
datadf = proc_functions[source](files[0],
threads=threadsleft,
readtype=readtype,
- barcoded=barcoded)
+ barcoded=barcoded,
+ keep_supp=keep_supp,
+ huge=True)
else:
with cfutures.ProcessPoolExecutor(max_workers=filethreads) as executor:
extraction_function = partial(proc_functions[source],
threads=threadsleft,
readtype=readtype,
- barcoded=barcoded)
+ barcoded=barcoded,
+ keep_supp=keep_supp,
+ huge=False)
datadf = combine_dfs(
dfs=[out for out in executor.map(extraction_function, files)],
names=names or files,
@@ -95,13 +99,13 @@ def get_input(source, files, threads=4, readtype="1D",
datadf = calculate_start_time(datadf)
logging.info("Nanoget: Gathered all metrics of {} reads".format(len(datadf)))
if len(datadf) == 0:
- logging.critical("Nanoget: no reads retrieved.".format(len(datadf)))
+ logging.critical("Nanoget: no reads retrieved.")
sys.exit("Fatal: No reads found in input.")
else:
return datadf
-def combine_dfs(dfs, names, method):
+def combine_dfs(dfs, names=None, method='simple'):
"""Combine dataframes.
Combination is either done simple by just concatenating the DataFrames
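
The new keep_supp keyword in get_input defaults to True, so existing callers keep their old behaviour; passing False drops supplementary alignments from bam/cram input. A usage sketch mirroring scripts/test.py (the nanotest path is the test-data checkout used there):

import nanoget

# primary + supplementary alignments (previous behaviour, still the default)
df_all = nanoget.get_input("bam", ["nanotest/alignment.bam"])

# primary alignments only
df_primary = nanoget.get_input("bam", ["nanotest/alignment.bam"], keep_supp=False)
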
=====================================
nanoget/utils.py
=====================================
@@ -2,6 +2,7 @@ import sys
import logging
import pandas as pd
from os import path as opath
+from math import log
def reduce_memory_usage(df):
@@ -32,3 +33,26 @@ def check_existance(f):
if not opath.isfile(f):
logging.error("Nanoget: File provided doesn't exist or the path is incorrect: {}".format(f))
sys.exit("File provided doesn't exist or the path is incorrect: {}".format(f))
+
+
+def errs_tab(n):
+    """Generate a list of error rates for qualities less than or equal to n."""
+ return [10**(q / -10) for q in range(n+1)]
+
+
+def ave_qual(quals, qround=False, tab=errs_tab(128)):
+ """Calculate average basecall quality of a read.
+
+ Receive the integer quality scores of a read and return the average quality for that read
+ First convert Phred scores to probabilities,
+ calculate average error probability
+ convert average back to Phred scale
+ """
+ if quals:
+ mq = -10 * log(sum([tab[q] for q in quals]) / len(quals), 10)
+ if qround:
+ return round(mq)
+ else:
+ return mq
+ else:
+ return None
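
ave_qual, previously imported from nanomath, is now vendored into nanoget.utils together with the errs_tab lookup table. Note that it averages error probabilities rather than Phred scores: errs_tab precomputes 10**(-q/10) for q = 0..128, the per-base probabilities are averaged, and the mean is converted back to the Phred scale. A worked example:

from nanoget.utils import ave_qual

# Phred 10 -> error probability 0.1, Phred 20 -> 0.01
# mean probability = (0.1 + 0.01) / 2 = 0.055
# -10 * log10(0.055) ~= 12.6, lower than the naive Phred mean of 15
print(ave_qual([10, 20]))                # ~12.597
print(ave_qual([10, 20], qround=True))   # 13
print(ave_qual([]))                      # None for an empty quality list
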
=====================================
nanoget/version.py
=====================================
@@ -1 +1 @@
-__version__ = "1.12.2"
+__version__ = "1.16.1"
=====================================
scripts/create_feather.py
=====================================
@@ -0,0 +1,123 @@
+#! /usr/bin/env python
+# wdecoster
+
+from argparse import ArgumentParser
+from nanoget import get_input
+import os
+
+
+def main():
+ '''
+ Organization function
+ -setups logging
+ -gets inputdata
+ -calls plotting function
+ '''
+ args = get_args()
+
+ sources = {
+ "fastq": args.fastq,
+ "bam": args.bam,
+ "cram": args.cram,
+ "fastq_rich": args.fastq_rich,
+ "fastq_minimal": args.fastq_minimal,
+ "summary": args.summary,
+ "fasta": args.fasta,
+ "ubam": args.ubam,
+ }
+ if os.path.isfile(args.output) and not args.force:
+ print("Output file {} already exists.".format(args.output))
+ else:
+ get_input(
+ source=[n for n, s in sources.items() if s][0],
+ files=[f for f in sources.values() if f][0],
+ threads=args.threads,
+ readtype=args.readtype,
+ combine="simple",
+ barcoded=args.barcoded,
+ huge=args.huge,
+ keep_supp=not(args.no_supplementary)) \
+ .to_feather(args.output)
+
+
+def get_args():
+ parser = ArgumentParser(
+ description="Creates various plots for long read sequencing data.".upper(),
+ add_help=False)
+ general = parser.add_argument_group(
+ title='General options')
+ general.add_argument("-h", "--help",
+ action="help",
+ help="show the help and exit")
+ general.add_argument("-t", "--threads",
+ help="Set the allowed number of threads to be used by the script",
+ default=4,
+ type=int)
+ general.add_argument("--huge",
+ help="Input data is one very large file.",
+ action="store_true")
+ general.add_argument("-o", "--output",
+ help="Specify name of feather file.",
+ default="NanoPlot-data.feather")
+ general.add_argument("--readtype",
+ help="Which read type to extract information about from summary. \
+ Options are 1D, 2D, 1D2",
+ default="1D",
+ choices=['1D', '2D', '1D2'])
+ general.add_argument("--barcoded",
+ help="Use if you want to split the summary file by barcode",
+ action="store_true")
+ general.add_argument("--no_supplementary",
+ help="Use if you want to remove supplementary alignments",
+ action="store_true",
+ default=False)
+ general.add_argument("--force",
+ help="Overwrite existing feather files",
+ action="store_true",
+ default=False)
+ target = parser.add_argument_group(
+ title="Input data sources, one of these is required.")
+ mtarget = target.add_mutually_exclusive_group(
+ required=True)
+ mtarget.add_argument("--fastq",
+ help="Data is in one or more default fastq file(s).",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--fasta",
+ help="Data is in one or more fasta file(s).",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--fastq_rich",
+ help="Data is in one or more fastq file(s) generated by albacore, \
+ MinKNOW or guppy with additional information \
+ concerning channel and time.",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--fastq_minimal",
+ help="Data is in one or more fastq file(s) generated by albacore, \
+ MinKNOW or guppy with additional information concerning channel \
+ and time. Is extracted swiftly without elaborate checks.",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--summary",
+ help="Data is in one or more summary file(s) generated by albacore \
+ or guppy.",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--bam",
+ help="Data is in one or more sorted bam file(s).",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--ubam",
+ help="Data is in one or more unmapped bam file(s).",
+ nargs='+',
+ metavar="file")
+ mtarget.add_argument("--cram",
+ help="Data is in one or more sorted cram file(s).",
+ nargs='+',
+ metavar="file")
+ return parser.parse_args()
+
+
+if __name__ == '__main__':
+ main()
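
The new create_feather.py wraps get_input and writes the resulting dataframe with pandas' DataFrame.to_feather (which requires pyarrow at runtime). For --bam input with --no_supplementary, the script is roughly equivalent to the sketch below; the input and output file names are placeholders:

from nanoget import get_input

# what the script does for "--bam reads.bam -o reads.feather --no_supplementary"
df = get_input(source="bam",
               files=["reads.bam"],
               threads=4,
               readtype="1D",
               combine="simple",
               barcoded=False,
               huge=False,
               keep_supp=False)   # i.e. --no_supplementary
df.to_feather("reads.feather")
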
=====================================
scripts/test.py
=====================================
@@ -4,6 +4,7 @@ import nanoget
def run_tests():
"""Test functions using testdata from the nanotest repo."""
nanoget.get_input("bam", ["nanotest/alignment.bam"])
+ nanoget.get_input("bam", ["nanotest/alignment.bam"], keep_supp=False)
nanoget.get_input("fastq_rich", ["nanotest/reads.fastq.gz"])
nanoget.get_input("summary", ["nanotest/sequencing_summary.txt"], combine="track")
nanoget.get_input("fastq_minimal", ["nanotest/reads.fastq.gz"])
=====================================
setup.py
=====================================
@@ -19,12 +19,12 @@ setup(
url='https://github.com/wdecoster/nanoget',
author='Wouter De Coster',
author_email='decosterwouter at gmail.com',
- license='MIT',
+ license='GPLv3',
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering :: Bio-Informatics',
- 'License :: OSI Approved :: MIT License',
+ 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
@@ -36,7 +36,6 @@ setup(
install_requires=['pandas>=0.22.0',
'numpy',
'biopython',
- 'pysam>0.10.0.0',
- 'nanomath'],
+ 'pysam>0.10.0.0'],
package_dir={'nanoget': 'nanoget'},
data_files=[("", ["LICENSE"])])
View it on GitLab: https://salsa.debian.org/med-team/python-nanoget/-/compare/0afac56a7b789f4d7c9e90474f573be83d86a383...bf5011c7eda06738c1975c8ffcec53cfaac47bd9