[med-svn] [Git][med-team/python-nanoget][upstream] New upstream version 1.19.3

Andreas Tille (@tille) gitlab@salsa.debian.org
Wed Nov 15 10:51:47 GMT 2023



Andreas Tille pushed to branch upstream at Debian Med / python-nanoget


Commits:
a8a632ef by Andreas Tille at 2023-11-15T11:25:46+01:00
New upstream version 1.19.3
- - - - -


9 changed files:

- + .github/workflows/python-package-conda.yml
- + .travis.yml
- README.md
- nanoget/extraction_functions.py
- nanoget/nanoget.py
- nanoget/version.py
- scripts/test.py
- scripts/test.sh
- setup.py


Changes:

=====================================
.github/workflows/python-package-conda.yml
=====================================
@@ -0,0 +1,34 @@
+name: Python Package using Conda
+
+on: [push]
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-20.04
+    strategy:
+      max-parallel: 5
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.10'
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
+    - name: Install dependencies
+      run: |
+        conda install pip
+        pip install .
+    - name: Lint with flake8
+      run: |
+        conda install flake8
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test
+      run: |
+        bash scripts/test.sh


=====================================
.travis.yml
=====================================
@@ -0,0 +1,23 @@
+language: python
+
+python:
+  - "3.6"
+  - "3.7"
+  - "3.8"
+
+before_install:
+  - cp README.md README.rst
+  - pip install flake8
+
+install:
+  - pip install -e .
+
+script:
+  - bash scripts/test.sh
+  - flake8 nanoget/nanoget.py
+
+notifications:
+  email: false
+  webhooks:
+    urls:
+        - https://webhooks.gitter.im/e/4b1c45cea6826ce475c2


=====================================
README.md
=====================================
@@ -3,7 +3,6 @@ This module provides functions to extract useful metrics from Oxford Nanopore se
 
 [![Twitter URL](https://img.shields.io/twitter/url/https/twitter.com/wouter_decoster.svg?style=social&label=Follow%20%40wouter_decoster)](https://twitter.com/wouter_decoster)
 [![install with conda](https://anaconda.org/bioconda/nanoget/badges/installer/conda.svg)](https://anaconda.org/bioconda/nanoget)
-[![Build Status](https://travis-ci.org/wdecoster/nanoget.svg?branch=master)](https://travis-ci.org/wdecoster/nanoget)
 
 
 ## FUNCTIONS


=====================================
nanoget/extraction_functions.py
=====================================
@@ -11,7 +11,7 @@ from itertools import repeat
 
 
 def process_summary(summaryfile, **kwargs):
-    """Extracting information from an albacore summary file.
+    """Extracting information from an albacore/guppy/dorado summary file.
 
     Only reads which have a >0 length are returned.
 
@@ -59,12 +59,18 @@ def process_summary(summaryfile, **kwargs):
     37    kit
     38    variant
     """
-    logging.info("Nanoget: Collecting metrics from summary file {} for {} sequencing".format(
-        summaryfile, kwargs["readtype"]))
+    logging.info(
+        f"Nanoget: Collecting metrics from summary file {summaryfile} for {kwargs['readtype']} sequencing"
+    )
     ut.check_existance(summaryfile)
     if kwargs["readtype"] == "1D":
-        cols = ["channel", "start_time", "duration",
-                "sequence_length_template", "mean_qscore_template"]
+        cols = [
+            "channel",
+            "start_time",
+            "duration",
+            "sequence_length_template",
+            "mean_qscore_template",
+        ]
     elif kwargs["readtype"] in ["2D", "1D2"]:
         cols = ["channel", "start_time", "duration", "sequence_length_2d", "mean_qscore_2d"]
     if kwargs["barcoded"]:
@@ -77,10 +83,16 @@ def process_summary(summaryfile, **kwargs):
             usecols=cols,
         )
     except ValueError:
-        logging.error("Nanoget: did not find expected columns in summary file {}:\n {}".format(
-            summaryfile, ', '.join(cols)))
-        sys.exit("ERROR: expected columns in summary file {} not found:\n {}".format(
-            summaryfile, ', '.join(cols)))
+        logging.error(
+            "Nanoget: did not find expected columns in summary file {}:\n {}".format(
+                summaryfile, ", ".join(cols)
+            )
+        )
+        sys.exit(
+            "ERROR: expected columns in summary file {} not found:\n {}".format(
+                summaryfile, ", ".join(cols)
+            )
+        )
     if kwargs["barcoded"]:
         datadf.columns = ["channelIDs", "time", "duration", "lengths", "quals", "barcode"]
     else:
@@ -104,12 +116,15 @@ def check_bam(bam, samtype="bam"):
         pysam.index(bam)
         samfile = pysam.AlignmentFile(bam, "rb")  # Need to reload the samfile after creating index
         logging.info("Nanoget: No index for bam file could be found, created index.")
-    if not samfile.header['HD']['SO'] == 'coordinate':
+    if not samfile.header["HD"]["SO"] == "coordinate":
         logging.error("Nanoget: Bam file {} not sorted by coordinate!.".format(bam))
         sys.exit("Please use a bam file sorted by coordinate.")
     if samtype == "bam":
-        logging.info("Nanoget: Bam file {} contains {} mapped and {} unmapped reads.".format(
-            bam, samfile.mapped, samfile.unmapped))
+        logging.info(
+            "Nanoget: Bam file {} contains {} mapped and {} unmapped reads.".format(
+                bam, samfile.mapped, samfile.unmapped
+            )
+        )
         if samfile.mapped == 0:
             logging.error("Nanoget: Bam file {} does not contain aligned reads.".format(bam))
             sys.exit("FATAL: not a single read was mapped in bam file {}".format(bam))
@@ -127,14 +142,18 @@ def process_ubam(bam, **kwargs):
         # Need to reload the samfile after creating index
         samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
         logging.info("Nanoget: No index for bam file could be found, created index.")
-    datadf = pd.DataFrame(
-        data=[(read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
-              for read in samfile.fetch(until_eof=True)],
-        columns=["readIDs", "quals", "lengths"]) \
-        .dropna(axis='columns', how='all') \
-        .dropna(axis='index', how='any')
-    logging.info("Nanoget: ubam {} contains {} reads.".format(
-        bam, datadf["lengths"].size))
+    datadf = (
+        pd.DataFrame(
+            data=[
+                (read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
+                for read in samfile.fetch(until_eof=True)
+            ],
+            columns=["readIDs", "quals", "lengths"],
+        )
+        .dropna(axis="columns", how="all")
+        .dropna(axis="index", how="any")
+    )
+    logging.info("Nanoget: ubam {} contains {} reads.".format(bam, datadf["lengths"].size))
     return ut.reduce_memory_usage(datadf)
 
 
@@ -154,28 +173,50 @@ def process_bam(bam, **kwargs):
     logging.info("Nanoget: Starting to collect statistics from bam file {}.".format(bam))
     samfile = check_bam(bam)
     chromosomes = samfile.references
-    if len(chromosomes) > 100 or kwargs["huge"]:
-        logging.info("Nanoget: lots of contigs (>100) or --huge, not running in separate processes")
-        datadf = pd.DataFrame(
-            data=extract_from_bam(bam, None, kwargs["keep_supp"]),
-            columns=["readIDs", "quals", "aligned_quals", "lengths",
-                     "aligned_lengths", "mapQ", "percentIdentity"]) \
-            .dropna(axis='columns', how='all') \
-            .dropna(axis='index', how='any')
+    if len(chromosomes) > 200 or kwargs["huge"]:
+        logging.info("Nanoget: lots of contigs (>200) or --huge, not running in separate processes")
+        datadf = (
+            pd.DataFrame(
+                data=extract_from_bam(bam, None, kwargs["keep_supp"]),
+                columns=[
+                    "readIDs",
+                    "quals",
+                    "aligned_quals",
+                    "lengths",
+                    "aligned_lengths",
+                    "mapQ",
+                    "percentIdentity",
+                ],
+            )
+            .dropna(axis="columns", how="all")
+            .dropna(axis="index", how="any")
+        )
 
     else:
         unit = chromosomes
         with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
-            datadf = pd.DataFrame(
-                data=[res for sublist in executor.map(extract_from_bam,
-                                                      repeat(bam),
-                                                      unit,
-                                                      repeat(kwargs["keep_supp"]))
-                      for res in sublist],
-                columns=["readIDs", "quals", "aligned_quals", "lengths",
-                         "aligned_lengths", "mapQ", "percentIdentity"]) \
-                .dropna(axis='columns', how='all') \
-                .dropna(axis='index', how='any')
+            datadf = (
+                pd.DataFrame(
+                    data=[
+                        res
+                        for sublist in executor.map(
+                            extract_from_bam, repeat(bam), unit, repeat(kwargs["keep_supp"])
+                        )
+                        for res in sublist
+                    ],
+                    columns=[
+                        "readIDs",
+                        "quals",
+                        "aligned_quals",
+                        "lengths",
+                        "aligned_lengths",
+                        "mapQ",
+                        "percentIdentity",
+                    ],
+                )
+                .dropna(axis="columns", how="all")
+                .dropna(axis="index", how="any")
+            )
     logging.info(f"Nanoget: bam {bam} contains {datadf['lengths'].size} primary alignments.")
     return ut.reduce_memory_usage(datadf)
 
@@ -196,20 +237,34 @@ def process_cram(cram, **kwargs):
     logging.info("Nanoget: Starting to collect statistics from cram file {}.".format(cram))
     samfile = check_bam(cram, samtype="cram")
     chromosomes = samfile.references
-    if len(chromosomes) > 100:
+    if len(chromosomes) > 200:
         unit = [None]
         logging.info("Nanoget: lots of contigs (>100), not running in separate processes")
     else:
         unit = chromosomes
     with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
-        datadf = pd.DataFrame(
-            data=[res for sublist in executor.map(extract_from_bam,
-                                                  repeat(cram), unit, repeat(kwargs["keep_supp"]))
-                  for res in sublist],
-            columns=["readIDs", "quals", "aligned_quals", "lengths",
-                     "aligned_lengths", "mapQ", "percentIdentity"]) \
-            .dropna(axis='columns', how='all') \
-            .dropna(axis='index', how='any')
+        datadf = (
+            pd.DataFrame(
+                data=[
+                    res
+                    for sublist in executor.map(
+                        extract_from_bam, repeat(cram), unit, repeat(kwargs["keep_supp"])
+                    )
+                    for res in sublist
+                ],
+                columns=[
+                    "readIDs",
+                    "quals",
+                    "aligned_quals",
+                    "lengths",
+                    "aligned_lengths",
+                    "mapQ",
+                    "percentIdentity",
+                ],
+            )
+            .dropna(axis="columns", how="all")
+            .dropna(axis="index", how="any")
+        )
     logging.info(f"Nanoget: cram {cram} contains {datadf['lengths'].size} primary alignments.")
     return ut.reduce_memory_usage(datadf)
 
@@ -229,26 +284,32 @@ def extract_from_bam(bam, chromosome, keep_supplementary=True):
     samfile = pysam.AlignmentFile(bam, "rb")
     if keep_supplementary:
         return [
-            (read.query_name,
-             ut.ave_qual(read.query_qualities),
-             ut.ave_qual(read.query_alignment_qualities),
-             read.query_length,
-             read.query_alignment_length,
-             read.mapping_quality,
-             get_pID(read))
+            (
+                read.query_name,
+                ut.ave_qual(read.query_qualities),
+                ut.ave_qual(read.query_alignment_qualities),
+                read.query_length,
+                read.query_alignment_length,
+                read.mapping_quality,
+                get_pID(read),
+            )
             for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
-            if not read.is_secondary and not read.is_unmapped]
+            if not read.is_secondary and not read.is_unmapped
+        ]
     else:
         return [
-            (read.query_name,
-             ut.ave_qual(read.query_qualities),
-             ut.ave_qual(read.query_alignment_qualities),
-             read.query_length,
-             read.query_alignment_length,
-             read.mapping_quality,
-             get_pID(read))
+            (
+                read.query_name,
+                ut.ave_qual(read.query_qualities),
+                ut.ave_qual(read.query_alignment_qualities),
+                read.query_length,
+                read.query_alignment_length,
+                read.mapping_quality,
+                get_pID(read),
+            )
             for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
-            if not read.is_secondary and not read.is_unmapped and not read.is_supplementary]
+            if not read.is_secondary and not read.is_unmapped and not read.is_supplementary
+        ]
 
 
 def get_pID(read):
@@ -267,8 +328,10 @@ def get_pID(read):
         return (1 - read.get_tag("NM") / alignment_length) * 100
     except KeyError:
         try:
-            return 100 * (1 - (parse_MD(read.get_tag("MD")) + parse_CIGAR(read.cigartuples)) /
-                          alignment_length)
+            return 100 * (
+                1
+                - (parse_MD(read.get_tag("MD")) + parse_CIGAR(read.cigartuples)) / alignment_length
+            )
         except KeyError:
             return None
     except ZeroDivisionError:
@@ -277,7 +340,7 @@ def get_pID(read):
 
 def parse_MD(MDlist):
     """Parse MD string to get number of mismatches and deletions."""
-    return sum([len(item) for item in re.split('[0-9^]', MDlist)])
+    return sum([len(item) for item in re.split("[0-9^]", MDlist)])
 
 
 def parse_CIGAR(cigartuples):
@@ -293,40 +356,47 @@ def handle_compressed_input(inputfq, file_type="fastq"):
     Relies on file extensions to recognize compression
     """
     ut.check_existance(inputfq)
-    if inputfq.endswith(('.gz', 'bgz')):
+    if inputfq.endswith((".gz", "bgz")):
         import gzip
+
         logging.info("Nanoget: Decompressing gzipped {} {}".format(file_type, inputfq))
-        return gzip.open(inputfq, 'rt')
-    elif inputfq.endswith('.bz2'):
+        return gzip.open(inputfq, "rt")
+    elif inputfq.endswith(".bz2"):
         import bz2
+
         logging.info("Nanoget: Decompressing bz2 compressed {} {}".format(file_type, inputfq))
-        return bz2.open(inputfq, 'rt')
-    elif inputfq.endswith(('.fastq', '.fq', 'fasta', '.fa', '.fas')):
-        return open(inputfq, 'r')
+        return bz2.open(inputfq, "rt")
+    elif inputfq.endswith((".fastq", ".fq", "fasta", ".fa", ".fas")):
+        return open(inputfq, "r")
     else:
         logging.error("INPUT ERROR: Unrecognized file extension {}".format(inputfq))
-        sys.exit('INPUT ERROR:\nUnrecognized file extension in {}\n'
-                 'Supported are gz, bz2, bgz, fastq, fq, fasta, fa and fas'.format(inputfq))
+        sys.exit(
+            "INPUT ERROR:\nUnrecognized file extension in {}\n"
+            "Supported are gz, bz2, bgz, fastq, fq, fasta, fa and fas".format(inputfq)
+        )
 
 
 def process_fasta(fasta, **kwargs):
     """Combine metrics extracted from a fasta file."""
     logging.info("Nanoget: Starting to collect statistics from a fasta file.")
     inputfasta = handle_compressed_input(fasta, file_type="fasta")
-    return ut.reduce_memory_usage(pd.DataFrame(
-        data=[len(rec) for rec in SeqIO.parse(inputfasta, "fasta")],
-        columns=["lengths"]
-    ).dropna())
+    return ut.reduce_memory_usage(
+        pd.DataFrame(
+            data=[len(rec) for rec in SeqIO.parse(inputfasta, "fasta")], columns=["lengths"]
+        ).dropna()
+    )
 
 
 def process_fastq_plain(fastq, **kwargs):
     """Combine metrics extracted from a fastq file."""
     logging.info("Nanoget: Starting to collect statistics from plain fastq file.")
     inputfastq = handle_compressed_input(fastq)
-    return ut.reduce_memory_usage(pd.DataFrame(
-        data=[res for res in extract_from_fastq(inputfastq) if res],
-        columns=["quals", "lengths"]
-    ).dropna())
+    return ut.reduce_memory_usage(
+        pd.DataFrame(
+            data=[res for res in extract_from_fastq(inputfastq) if res],
+            columns=["quals", "lengths"],
+        ).dropna()
+    )
 
 
 def extract_from_fastq(fq):
@@ -359,15 +429,12 @@ def extract_all_from_fastq(rec):
 
     Return identifier, read length, average quality and median quality
     """
-    return (rec.id,
-            len(rec),
-            ut.ave_qual(rec.letter_annotations["phred_quality"]),
-            None)
+    return (rec.id, len(rec), ut.ave_qual(rec.letter_annotations["phred_quality"]), None)
 
 
 def info_to_dict(info):
     """Get the key-value pairs from the albacore/minknow fastq description and return dict"""
-    return {field.split('=')[0]: field.split('=')[1] for field in info.split(' ')[1:]}
+    return {field.split("=")[0]: field.split("=")[1] for field in info.split(" ")[1:]}
 
 
 def process_fastq_rich(fastq, **kwargs):
@@ -389,19 +456,24 @@ def process_fastq_rich(fastq, **kwargs):
         try:
             read_info = info_to_dict(record.description)
             res.append(
-                (ut.ave_qual(record.letter_annotations["phred_quality"]),
-                 len(record),
-                 read_info["ch"],
-                 read_info["start_time"],
-                 read_info["runid"]))
+                (
+                    ut.ave_qual(record.letter_annotations["phred_quality"]),
+                    len(record),
+                    read_info["ch"],
+                    read_info["start_time"],
+                    read_info["runid"],
+                )
+            )
         except KeyError:
-            logging.error("Nanoget: keyerror when processing record {}".format(record.description))
-            sys.exit("Unexpected fastq identifier:\n{}\n\n \
-            missing one or more of expected fields 'ch', 'start_time' or 'runid'".format(
-                record.description))
+            logging.error(f"Nanoget: keyerror when processing record {record.description}")
+            sys.exit(
+                f"Unexpected fastq identifier:\n{record.description}\n\n \
+            missing one or more of expected fields 'ch', 'start_time' or 'runid'"
+            )
     df = pd.DataFrame(
-        data=res,
-        columns=["quals", "lengths", "channelIDs", "timestamp", "runIDs"]).dropna()
+        data=res, columns=["quals", "lengths", "channelIDs", "timestamp", "runIDs"]
+    ).dropna()
+    df["timestamp"] = pd.to_datetime(df["timestamp"], format='mixed', utc=True)
     df["channelIDs"] = df["channelIDs"].astype("int64")
     return ut.reduce_memory_usage(df)
 
@@ -412,29 +484,29 @@ def readfq(fp):
     while True:  # mimic closure; is it a bad idea?
         if not last:  # the first record or a record following a fastq
             for l in fp:  # search for the start of the next record
-                if l[0] in '>@':  # fasta/q header line
+                if l[0] in ">@":  # fasta/q header line
                     last = l[:-1]  # save this line
                     break
         if not last:
             break
         name, seqs, last = last[1:].partition(" ")[0], [], None
         for l in fp:  # read the sequence
-            if l[0] in '@+>':
+            if l[0] in "@+>":
                 last = l[:-1]
                 break
             seqs.append(l[:-1])
-        if not last or last[0] != '+':  # this is a fasta record
-            yield name, ''.join(seqs), None  # yield a fasta record
+        if not last or last[0] != "+":  # this is a fasta record
+            yield name, "".join(seqs), None  # yield a fasta record
             if not last:
                 break
         else:  # this is a fastq record
-            seq, leng, seqs = ''.join(seqs), 0, []
+            seq, leng, seqs = "".join(seqs), 0, []
             for l in fp:  # read the quality
                 seqs.append(l[:-1])
                 leng += len(l) - 1
                 if leng >= len(seq):  # have read enough quality
                     last = None
-                    yield name, seq, ''.join(seqs)  # yield a fastq record
+                    yield name, seq, "".join(seqs)  # yield a fastq record
                     break
             if last:  # reach EOF before reading enough quality
                 yield name, seq, None  # yield a fasta record instead
@@ -464,10 +536,10 @@ def process_fastq_minimal(fastq, **kwargs):
     infastq = handle_compressed_input(fastq)
     try:
         df = pd.DataFrame(
-            data=[rec for rec in fq_minimal(infastq) if rec],
-            columns=["timestamp", "lengths"]
+            data=[rec for rec in fq_minimal(infastq) if rec], columns=["timestamp", "lengths"]
         )
     except IndexError:
         logging.error("Fatal: Incorrect file structure for fastq_minimal")
         sys.exit("Error: file does not match expected structure for fastq_minimal")
+    df["timestamp"] = pd.to_datetime(df["timestamp"])
     return ut.reduce_memory_usage(df)
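
Note on the timestamp handling added above: pd.to_datetime(..., format="mixed", utc=True) is a pandas 2.0+ feature, which is why setup.py (further down) bumps the requirement to pandas>=2.0.0; it lets rich fastq headers with inconsistent timestamp formats (cf. the new nanotest/reads-mixed-timestamp.fastq test input) be parsed in one pass. A minimal sketch with hypothetical timestamps:

    import pandas as pd

    # Two ISO-8601 run timestamps in slightly different formats, as different
    # basecaller versions may write them into fastq description lines.
    timestamps = pd.Series(["2023-11-15T10:51:47Z", "2023-11-15T10:52:03.123456+00:00"])
    parsed = pd.to_datetime(timestamps, format="mixed", utc=True)
    print(parsed.dtype)  # datetime64[ns, UTC]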


=====================================
nanoget/nanoget.py
=====================================
@@ -26,8 +26,17 @@ import concurrent.futures as cfutures
 import nanoget.extraction_functions as ex
 
 
-def get_input(source, files, threads=4, readtype="1D",
-              combine="simple", names=None, barcoded=False, huge=False, keep_supp=True):
+def get_input(
+    source,
+    files,
+    threads=4,
+    readtype="1D",
+    combine="simple",
+    names=None,
+    barcoded=False,
+    huge=False,
+    keep_supp=True,
+):
     """Get input and process accordingly.
 
     Data can be:
@@ -55,14 +64,15 @@ def get_input(source, files, threads=4, readtype="1D",
       files, or None
     """
     proc_functions = {
-        'fastq': ex.process_fastq_plain,
-        'fasta': ex.process_fasta,
-        'bam': ex.process_bam,
-        'summary': ex.process_summary,
-        'fastq_rich': ex.process_fastq_rich,
-        'fastq_minimal': ex.process_fastq_minimal,
-        'cram': ex.process_cram,
-        'ubam': ex.process_ubam, }
+        "fastq": ex.process_fastq_plain,
+        "fasta": ex.process_fasta,
+        "bam": ex.process_bam,
+        "summary": ex.process_summary,
+        "fastq_rich": ex.process_fastq_rich,
+        "fastq_minimal": ex.process_fastq_minimal,
+        "cram": ex.process_cram,
+        "ubam": ex.process_ubam,
+    }
 
     if source not in proc_functions.keys():
         logging.error("nanoget: Unsupported data source: {}".format(source))
@@ -73,29 +83,36 @@ def get_input(source, files, threads=4, readtype="1D",
         logging.info("nanoget: Running with a single huge input file.")
         if not len(files) == 1:
             logging.error("nanoget: Using multiple huge input files is currently not supported.")
-            sys.exit("Using multiple huge input files is currently not supported.\n"
-                     "Please let me know on GitHub if that's of interest for your application.\n")
-
-        datadf = proc_functions[source](files[0],
-                                        threads=threadsleft,
-                                        readtype=readtype,
-                                        barcoded=barcoded,
-                                        keep_supp=keep_supp,
-                                        huge=True)
+            sys.exit(
+                "Using multiple huge input files is currently not supported.\n"
+                "Please let me know on GitHub if that's of interest for your application.\n"
+            )
+
+        datadf = proc_functions[source](
+            files[0],
+            threads=threadsleft,
+            readtype=readtype,
+            barcoded=barcoded,
+            keep_supp=keep_supp,
+            huge=True,
+        )
     else:
         with cfutures.ProcessPoolExecutor(max_workers=filethreads) as executor:
-            extraction_function = partial(proc_functions[source],
-                                          threads=threadsleft,
-                                          readtype=readtype,
-                                          barcoded=barcoded,
-                                          keep_supp=keep_supp,
-                                          huge=False)
+            extraction_function = partial(
+                proc_functions[source],
+                threads=threadsleft,
+                readtype=readtype,
+                barcoded=barcoded,
+                keep_supp=keep_supp,
+                huge=False,
+            )
             datadf = combine_dfs(
                 dfs=[out for out in executor.map(extraction_function, files)],
                 names=names or files,
-                method=combine)
+                method=combine,
+            )
     if "readIDs" in datadf.columns and pd.isna(datadf["readIDs"]).any():
-        datadf.drop("readIDs", axis='columns', inplace=True)
+        datadf.drop("readIDs", axis="columns", inplace=True)
     datadf = calculate_start_time(datadf)
     logging.info("Nanoget: Gathered all metrics of {} reads".format(len(datadf)))
     if len(datadf) == 0:
@@ -105,14 +122,13 @@ def get_input(source, files, threads=4, readtype="1D",
         return datadf
 
 
-def combine_dfs(dfs, names=None, method='simple'):
+def combine_dfs(dfs, names=None, method="simple"):
     """Combine dataframes.
 
     Combination is either done simple by just concatenating the DataFrames
     or performs tracking by adding the name of the dataset as a column."""
     if method == "track":
-        return pd.concat([df.assign(dataset=n) for df, n in zip(dfs, names)],
-                         ignore_index=True)
+        return pd.concat([df.assign(dataset=n) for df, n in zip(dfs, names)], ignore_index=True)
     elif method == "simple":
         return pd.concat(dfs, ignore_index=True)
 
@@ -132,16 +148,17 @@ def calculate_start_time(df):
     subtraction is done per dataset
     """
     if "time" in df.columns:
-        df["time_arr"] = pd.Series(df["time"], dtype='datetime64[s]')
+        df["time_arr"] = pd.Series(df["time"], dtype="datetime64[s]")
     elif "timestamp" in df.columns:
-        df["time_arr"] = pd.Series(df["timestamp"], dtype="datetime64[ns]")
+        df["time_arr"] = df["timestamp"]
     else:
         return df
     if "dataset" in df.columns:
         for dset in df["dataset"].unique():
             time_zero = df.loc[df["dataset"] == dset, "time_arr"].min()
-            df.loc[df["dataset"] == dset, "start_time"] = \
+            df.loc[df["dataset"] == dset, "start_time"] = (
                 df.loc[df["dataset"] == dset, "time_arr"] - time_zero
+            )
     else:
         df["start_time"] = df["time_arr"] - df["time_arr"].min()
     return df.drop(["time", "timestamp", "time_arr"], axis=1, errors="ignore")


=====================================
nanoget/version.py
=====================================
@@ -1 +1 @@
-__version__ = "1.16.1"
+__version__ = "1.19.3"


=====================================
scripts/test.py
=====================================
@@ -6,11 +6,12 @@ def run_tests():
     nanoget.get_input("bam", ["nanotest/alignment.bam"])
     nanoget.get_input("bam", ["nanotest/alignment.bam"], keep_supp=False)
     nanoget.get_input("fastq_rich", ["nanotest/reads.fastq.gz"])
+    nanoget.get_input("fastq_rich", ["nanotest/reads-mixed-timestamp.fastq"])
     nanoget.get_input("summary", ["nanotest/sequencing_summary.txt"], combine="track")
     nanoget.get_input("fastq_minimal", ["nanotest/reads.fastq.gz"])
     nanoget.get_input("fastq", ["nanotest/reads.fastq.gz"])
     nanoget.get_input("fasta", ["nanotest/reads.fa.gz"])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     run_tests()


=====================================
scripts/test.sh
=====================================
@@ -1,5 +1,9 @@
 set -ev
 
-git clone https://github.com/wdecoster/nanotest.git
+if [ -d "nanotest" ]; then
+    echo "nanotest already cloned"
+else
+    git clone https://github.com/wdecoster/nanotest.git
+fi
 
 python scripts/test.py


=====================================
setup.py
=====================================
@@ -12,7 +12,7 @@ exec(open('nanoget/version.py').read())
 
 setup(
     name='nanoget',
-    version=__version__,
+    version=__version__,  # noqa: F821
     description='Functions to extract information from Oxford Nanopore sequencing data and alignments.',
     long_description=open(path.join(here, "README.md")).read(),
     long_description_content_type="text/markdown",
@@ -33,7 +33,7 @@ setup(
     keywords='nanopore sequencing plotting quality control',
     python_requires='>=3',
     packages=find_packages() + ['scripts'],
-    install_requires=['pandas>=0.22.0',
+    install_requires=['pandas>=2.0.0',
                       'numpy',
                       'biopython',
                       'pysam>0.10.0.0'],



View it on GitLab: https://salsa.debian.org/med-team/python-nanoget/-/commit/a8a632efcda09b5cb73ebea148becb28a481751f


