[med-svn] [Git][med-team/python-nanoget][master] 3 commits: New upstream version 1.16.1

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Mon Nov 1 21:35:19 GMT 2021



Nilesh Patra pushed to branch master at Debian Med / python-nanoget


Commits:
803c8e5f by Nilesh Patra at 2021-11-02T03:00:52+05:30
New upstream version 1.16.1
- - - - -
ce579a12 by Nilesh Patra at 2021-11-02T03:00:52+05:30
Update upstream source from tag 'upstream/1.16.1'

Update to upstream version '1.16.1'
with Debian dir 873858bf6aa8b921c86ff2b2e2d3c8943baf06c6
- - - - -
bf5011c7 by Nilesh Patra at 2021-11-02T03:01:27+05:30
Upload to unstable

- - - - -


9 changed files:

- − .travis.yml
- debian/changelog
- nanoget/extraction_functions.py
- nanoget/nanoget.py
- nanoget/utils.py
- nanoget/version.py
- + scripts/create_feather.py
- scripts/test.py
- setup.py


Changes:

=====================================
.travis.yml deleted
=====================================
@@ -1,23 +0,0 @@
-language: python
-
-python:
-  - "3.6"
-  - "3.7"
-  - "3.8"
-
-before_install:
-  - cp README.md README.rst
-  - pip install flake8
-
-install:
-  - pip install -e .
-
-script:
-  - bash scripts/test.sh
-  - flake8 nanoget/nanoget.py
-
-notifications:
-  email: false
-  webhooks:
-    urls:
-        - https://webhooks.gitter.im/e/4b1c45cea6826ce475c2


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+python-nanoget (1.16.1-1) unstable; urgency=medium
+
+  * Team upload.
+  * New upstream version 1.16.1
+
+ -- Nilesh Patra <nilesh@debian.org>  Tue, 02 Nov 2021 03:01:08 +0530
+
 python-nanoget (1.12.2-5) unstable; urgency=medium
 
   * Team Upload.


=====================================
nanoget/extraction_functions.py
=====================================
@@ -4,10 +4,10 @@ import nanoget.utils as ut
 import pandas as pd
 import sys
 import pysam
-import nanomath
 import re
 from Bio import SeqIO
 import concurrent.futures as cfutures
+from itertools import repeat
 
 
 def process_summary(summaryfile, **kwargs):
@@ -128,7 +128,7 @@ def process_ubam(bam, **kwargs):
         samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
         logging.info("Nanoget: No index for bam file could be found, created index.")
     datadf = pd.DataFrame(
-        data=[(read.query_name, nanomath.ave_qual(read.query_qualities), read.query_length)
+        data=[(read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
               for read in samfile.fetch(until_eof=True)],
         columns=["readIDs", "quals", "lengths"]) \
         .dropna(axis='columns', how='all') \
@@ -154,20 +154,29 @@ def process_bam(bam, **kwargs):
     logging.info("Nanoget: Starting to collect statistics from bam file {}.".format(bam))
     samfile = check_bam(bam)
     chromosomes = samfile.references
-    if len(chromosomes) > 100:
-        params = [(bam, None)]
-        logging.info("Nanoget: lots of contigs (>100), not running in separate processes")
-    else:
-        params = zip([bam] * len(chromosomes), chromosomes)
-    with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
+    if len(chromosomes) > 100 or kwargs["huge"]:
+        logging.info("Nanoget: lots of contigs (>100) or --huge, not running in separate processes")
         datadf = pd.DataFrame(
-            data=[res for sublist in executor.map(extract_from_bam, params) for res in sublist],
+            data=extract_from_bam(bam, None, kwargs["keep_supp"]),
             columns=["readIDs", "quals", "aligned_quals", "lengths",
                      "aligned_lengths", "mapQ", "percentIdentity"]) \
             .dropna(axis='columns', how='all') \
             .dropna(axis='index', how='any')
-    logging.info("Nanoget: bam {} contains {} primary alignments.".format(
-        bam, datadf["lengths"].size))
+
+    else:
+        unit = chromosomes
+        with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
+            datadf = pd.DataFrame(
+                data=[res for sublist in executor.map(extract_from_bam,
+                                                      repeat(bam),
+                                                      unit,
+                                                      repeat(kwargs["keep_supp"]))
+                      for res in sublist],
+                columns=["readIDs", "quals", "aligned_quals", "lengths",
+                         "aligned_lengths", "mapQ", "percentIdentity"]) \
+                .dropna(axis='columns', how='all') \
+                .dropna(axis='index', how='any')
+    logging.info(f"Nanoget: bam {bam} contains {datadf['lengths'].size} primary alignments.")
     return ut.reduce_memory_usage(datadf)
 
 
@@ -187,20 +196,25 @@ def process_cram(cram, **kwargs):
     logging.info("Nanoget: Starting to collect statistics from cram file {}.".format(cram))
     samfile = check_bam(cram, samtype="cram")
     chromosomes = samfile.references
-    params = zip([cram] * len(chromosomes), chromosomes)
+    if len(chromosomes) > 100:
+        unit = [None]
+        logging.info("Nanoget: lots of contigs (>100), not running in separate processes")
+    else:
+        unit = chromosomes
     with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
         datadf = pd.DataFrame(
-            data=[res for sublist in executor.map(extract_from_bam, params) for res in sublist],
+            data=[res for sublist in executor.map(extract_from_bam,
+                                                  repeat(cram), unit, repeat(kwargs["keep_supp"]))
+                  for res in sublist],
             columns=["readIDs", "quals", "aligned_quals", "lengths",
                      "aligned_lengths", "mapQ", "percentIdentity"]) \
             .dropna(axis='columns', how='all') \
             .dropna(axis='index', how='any')
-    logging.info("Nanoget: cram {} contains {} primary alignments.".format(
-        cram, datadf["lengths"].size))
+    logging.info(f"Nanoget: cram {cram} contains {datadf['lengths'].size} primary alignments.")
     return ut.reduce_memory_usage(datadf)
 
 
-def extract_from_bam(params):
+def extract_from_bam(bam, chromosome, keep_supplementary=True):
     """Extracts metrics from bam.
 
     Worker function per chromosome
@@ -212,18 +226,29 @@ def extract_from_bam(params):
     -mapping qualities
     -edit distances to the reference genome scaled by read length
     """
-    bam, chromosome = params
     samfile = pysam.AlignmentFile(bam, "rb")
-    return [
-        (read.query_name,
-         nanomath.ave_qual(read.query_qualities),
-         nanomath.ave_qual(read.query_alignment_qualities),
-         read.query_length,
-         read.query_alignment_length,
-         read.mapping_quality,
-         get_pID(read))
-        for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
-        if not read.is_secondary and not read.is_unmapped]
+    if keep_supplementary:
+        return [
+            (read.query_name,
+             ut.ave_qual(read.query_qualities),
+             ut.ave_qual(read.query_alignment_qualities),
+             read.query_length,
+             read.query_alignment_length,
+             read.mapping_quality,
+             get_pID(read))
+            for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
+            if not read.is_secondary and not read.is_unmapped]
+    else:
+        return [
+            (read.query_name,
+             ut.ave_qual(read.query_qualities),
+             ut.ave_qual(read.query_alignment_qualities),
+             read.query_length,
+             read.query_alignment_length,
+             read.mapping_quality,
+             get_pID(read))
+            for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
+            if not read.is_secondary and not read.is_unmapped and not read.is_supplementary]
 
 
 def get_pID(read):
@@ -310,7 +335,7 @@ def extract_from_fastq(fq):
     Return average quality and read length
     """
     for rec in SeqIO.parse(fq, "fastq"):
-        yield nanomath.ave_qual(rec.letter_annotations["phred_quality"]), len(rec)
+        yield ut.ave_qual(rec.letter_annotations["phred_quality"]), len(rec)
 
 
 def stream_fastq_full(fastq, threads):
@@ -336,8 +361,8 @@ def extract_all_from_fastq(rec):
     """
     return (rec.id,
             len(rec),
-            nanomath.ave_qual(rec.letter_annotations["phred_quality"]),
-            nanomath.median_qual(rec.letter_annotations["phred_quality"]))
+            ut.ave_qual(rec.letter_annotations["phred_quality"]),
+            None)
 
 
 def info_to_dict(info):
@@ -364,7 +389,7 @@ def process_fastq_rich(fastq, **kwargs):
         try:
             read_info = info_to_dict(record.description)
             res.append(
-                (nanomath.ave_qual(record.letter_annotations["phred_quality"]),
+                (ut.ave_qual(record.letter_annotations["phred_quality"]),
                  len(record),
                  read_info["ch"],
                  read_info["start_time"],

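The refactor above unpacks the former params tuple: with itertools.repeat, ProcessPoolExecutor.map can fan a multi-argument worker out over the chromosomes directly. A minimal sketch of the pattern, with a hypothetical worker standing in for extract_from_bam:

    from concurrent.futures import ProcessPoolExecutor
    from itertools import repeat

    def worker(path, chromosome, keep_supp):
        # stand-in for the per-chromosome extraction work
        return (path, chromosome, keep_supp)

    if __name__ == '__main__':
        chromosomes = ["chr1", "chr2", "chr3"]
        with ProcessPoolExecutor(max_workers=2) as executor:
            # map() zips the iterables per call; repeat() supplies the
            # constant arguments alongside the varying chromosome names
            results = list(executor.map(worker,
                                        repeat("alignment.bam"),
                                        chromosomes,
                                        repeat(True)))
        print(results)

repeat() yields its value indefinitely, so map() stops at the end of the shortest iterable, here the chromosome list.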

=====================================
nanoget/nanoget.py
=====================================
@@ -27,7 +27,7 @@ import nanoget.extraction_functions as ex
 
 
 def get_input(source, files, threads=4, readtype="1D",
-              combine="simple", names=None, barcoded=False, huge=False):
+              combine="simple", names=None, barcoded=False, huge=False, keep_supp=True):
     """Get input and process accordingly.
 
     Data can be:
@@ -79,13 +79,17 @@ def get_input(source, files, threads=4, readtype="1D",
         datadf = proc_functions[source](files[0],
                                         threads=threadsleft,
                                         readtype=readtype,
-                                        barcoded=barcoded)
+                                        barcoded=barcoded,
+                                        keep_supp=keep_supp,
+                                        huge=True)
     else:
         with cfutures.ProcessPoolExecutor(max_workers=filethreads) as executor:
             extraction_function = partial(proc_functions[source],
                                           threads=threadsleft,
                                           readtype=readtype,
-                                          barcoded=barcoded)
+                                          barcoded=barcoded,
+                                          keep_supp=keep_supp,
+                                          huge=False)
             datadf = combine_dfs(
                 dfs=[out for out in executor.map(extraction_function, files)],
                 names=names or files,
@@ -95,13 +99,13 @@ def get_input(source, files, threads=4, readtype="1D",
     datadf = calculate_start_time(datadf)
     logging.info("Nanoget: Gathered all metrics of {} reads".format(len(datadf)))
     if len(datadf) == 0:
-        logging.critical("Nanoget: no reads retrieved.".format(len(datadf)))
+        logging.critical("Nanoget: no reads retrieved.")
         sys.exit("Fatal: No reads found in input.")
     else:
         return datadf
 
 
-def combine_dfs(dfs, names, method):
+def combine_dfs(dfs, names=None, method='simple'):
     """Combine dataframes.
 
     Combination is either done simple by just concatenating the DataFrames

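The new keep_supp flag flows from get_input down into the bam and cram processors. A usage sketch, assuming the nanotest data that scripts/test.py relies on is checked out locally:

    from nanoget import get_input

    # keep_supp=False drops supplementary alignments in
    # extract_from_bam; the default (True) keeps them
    df = get_input(source="bam",
                   files=["nanotest/alignment.bam"],
                   threads=4,
                   keep_supp=False)
    print(df[["lengths", "quals"]].describe())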

=====================================
nanoget/utils.py
=====================================
@@ -2,6 +2,7 @@ import sys
 import logging
 import pandas as pd
 from os import path as opath
+from math import log
 
 
 def reduce_memory_usage(df):
@@ -32,3 +33,26 @@ def check_existance(f):
     if not opath.isfile(f):
         logging.error("Nanoget: File provided doesn't exist or the path is incorrect: {}".format(f))
         sys.exit("File provided doesn't exist or the path is incorrect: {}".format(f))
+
+
+def errs_tab(n):
+    """Generate list of error rates for qualities less than equal than n."""
+    return [10**(q / -10) for q in range(n+1)]
+
+
+def ave_qual(quals, qround=False, tab=errs_tab(128)):
+    """Calculate average basecall quality of a read.
+
+    Receive the integer quality scores of a read and return the average quality for that read
+    First convert Phred scores to probabilities,
+    calculate average error probability
+    convert average back to Phred scale
+    """
+    if quals:
+        mq = -10 * log(sum([tab[q] for q in quals]) / len(quals), 10)
+        if qround:
+            return round(mq)
+        else:
+            return mq
+    else:
+        return None

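The ave_qual helper ported in from nanomath averages in error-probability space rather than over the raw Phred scores, with errs_tab(128) precomputing the 10**(-q/10) lookup for every possible score. A self-contained check of the arithmetic:

    from math import log10

    def phred_to_error(q):
        # Phred score q -> error probability 10 ** (-q / 10)
        return 10 ** (-q / 10)

    quals = [10, 20, 30]           # error probs 0.1, 0.01, 0.001
    mean_err = sum(phred_to_error(q) for q in quals) / len(quals)
    avg_q = -10 * log10(mean_err)  # back to the Phred scale
    print(round(avg_q, 2))         # ~14.32, not the arithmetic mean 20

Averaging the probabilities weighs the worst bases most heavily, which is why the result sits well below the arithmetic mean of the scores.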

=====================================
nanoget/version.py
=====================================
@@ -1 +1 @@
-__version__ = "1.12.2"
+__version__ = "1.16.1"


=====================================
scripts/create_feather.py
=====================================
@@ -0,0 +1,123 @@
+#! /usr/bin/env python
+# wdecoster
+
+from argparse import ArgumentParser
+from nanoget import get_input
+import os
+
+
+def main():
+    '''
+    Organization function:
+    - parses the arguments
+    - gets the input data
+    - writes it to a feather file
+    '''
+    args = get_args()
+
+    sources = {
+        "fastq": args.fastq,
+        "bam": args.bam,
+        "cram": args.cram,
+        "fastq_rich": args.fastq_rich,
+        "fastq_minimal": args.fastq_minimal,
+        "summary": args.summary,
+        "fasta": args.fasta,
+        "ubam": args.ubam,
+    }
+    if os.path.isfile(args.output) and not args.force:
+        print("Output file {} already exists.".format(args.output))
+    else:
+        get_input(
+            source=[n for n, s in sources.items() if s][0],
+            files=[f for f in sources.values() if f][0],
+            threads=args.threads,
+            readtype=args.readtype,
+            combine="simple",
+            barcoded=args.barcoded,
+            huge=args.huge,
+            keep_supp=not(args.no_supplementary)) \
+            .to_feather(args.output)
+
+
+def get_args():
+    parser = ArgumentParser(
+        description="Creates various plots for long read sequencing data.".upper(),
+        add_help=False)
+    general = parser.add_argument_group(
+        title='General options')
+    general.add_argument("-h", "--help",
+                         action="help",
+                         help="show the help and exit")
+    general.add_argument("-t", "--threads",
+                         help="Set the allowed number of threads to be used by the script",
+                         default=4,
+                         type=int)
+    general.add_argument("--huge",
+                         help="Input data is one very large file.",
+                         action="store_true")
+    general.add_argument("-o", "--output",
+                         help="Specify name of feather file.",
+                         default="NanoPlot-data.feather")
+    general.add_argument("--readtype",
+                         help="Which read type to extract information about from summary. \
+                                 Options are 1D, 2D, 1D2",
+                         default="1D",
+                         choices=['1D', '2D', '1D2'])
+    general.add_argument("--barcoded",
+                         help="Use if you want to split the summary file by barcode",
+                         action="store_true")
+    general.add_argument("--no_supplementary",
+                         help="Use if you want to remove supplementary alignments",
+                         action="store_true",
+                         default=False)
+    general.add_argument("--force",
+                         help="Overwrite existing feather files",
+                         action="store_true",
+                         default=False)
+    target = parser.add_argument_group(
+        title="Input data sources, one of these is required.")
+    mtarget = target.add_mutually_exclusive_group(
+        required=True)
+    mtarget.add_argument("--fastq",
+                         help="Data is in one or more default fastq file(s).",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--fasta",
+                         help="Data is in one or more fasta file(s).",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--fastq_rich",
+                         help="Data is in one or more fastq file(s) generated by albacore, \
+                               MinKNOW or guppy with additional information \
+                               concerning channel and time.",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--fastq_minimal",
+                         help="Data is in one or more fastq file(s) generated by albacore, \
+                               MinKNOW or guppy with additional information concerning channel \
+                               and time. Is extracted swiftly without elaborate checks.",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--summary",
+                         help="Data is in one or more summary file(s) generated by albacore \
+                               or guppy.",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--bam",
+                         help="Data is in one or more sorted bam file(s).",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--ubam",
+                         help="Data is in one or more unmapped bam file(s).",
+                         nargs='+',
+                         metavar="file")
+    mtarget.add_argument("--cram",
+                         help="Data is in one or more sorted cram file(s).",
+                         nargs='+',
+                         metavar="file")
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    main()

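The feather file the new script writes can be read back with pandas alone, provided pyarrow is installed; a brief sketch assuming the default output name:

    import pandas as pd

    # NanoPlot-data.feather is the script's default output name
    df = pd.read_feather("NanoPlot-data.feather")
    print(df.head())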

=====================================
scripts/test.py
=====================================
@@ -4,6 +4,7 @@ import nanoget
 def run_tests():
     """Test functions using testdata from the nanotest repo."""
     nanoget.get_input("bam", ["nanotest/alignment.bam"])
+    nanoget.get_input("bam", ["nanotest/alignment.bam"], keep_supp=False)
     nanoget.get_input("fastq_rich", ["nanotest/reads.fastq.gz"])
     nanoget.get_input("summary", ["nanotest/sequencing_summary.txt"], combine="track")
     nanoget.get_input("fastq_minimal", ["nanotest/reads.fastq.gz"])


=====================================
setup.py
=====================================
@@ -19,12 +19,12 @@ setup(
     url='https://github.com/wdecoster/nanoget',
     author='Wouter De Coster',
     author_email='decosterwouter@gmail.com',
-    license='MIT',
+    license='GPLv3',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Science/Research',
         'Topic :: Scientific/Engineering :: Bio-Informatics',
-        'License :: OSI Approved :: MIT License',
+        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.3',
         'Programming Language :: Python :: 3.4',
@@ -36,7 +36,6 @@ setup(
     install_requires=['pandas>=0.22.0',
                       'numpy',
                       'biopython',
-                      'pysam>0.10.0.0',
-                      'nanomath'],
+                      'pysam>0.10.0.0'],
     package_dir={'nanoget': 'nanoget'},
     data_files=[("", ["LICENSE"])])



View it on GitLab: https://salsa.debian.org/med-team/python-nanoget/-/compare/0afac56a7b789f4d7c9e90474f573be83d86a383...bf5011c7eda06738c1975c8ffcec53cfaac47bd9
