[med-svn] [Git][med-team/python-nanoget][upstream] New upstream version 1.19.3
Andreas Tille (@tille)
gitlab@salsa.debian.org
Wed Nov 15 10:51:47 GMT 2023
Andreas Tille pushed to branch upstream at Debian Med / python-nanoget
Commits:
a8a632ef by Andreas Tille at 2023-11-15T11:25:46+01:00
New upstream version 1.19.3
- - - - -
9 changed files:
- + .github/workflows/python-package-conda.yml
- + .travis.yml
- README.md
- nanoget/extraction_functions.py
- nanoget/nanoget.py
- nanoget/version.py
- scripts/test.py
- scripts/test.sh
- setup.py
Changes:
=====================================
.github/workflows/python-package-conda.yml
=====================================
@@ -0,0 +1,34 @@
+name: Python Package using Conda
+
+on: [push]
+
+jobs:
+ build-linux:
+ runs-on: ubuntu-20.04
+ strategy:
+ max-parallel: 5
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python 3.10
+ uses: actions/setup-python@v3
+ with:
+ python-version: '3.10'
+ - name: Add conda to system path
+ run: |
+ # $CONDA is an environment variable pointing to the root of the miniconda directory
+ echo $CONDA/bin >> $GITHUB_PATH
+ - name: Install dependencies
+ run: |
+ conda install pip
+ pip install .
+ - name: Lint with flake8
+ run: |
+ conda install flake8
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test
+ run: |
+ bash scripts/test.sh
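
(For context: a rough local equivalent of the lint gate in the workflow above, as a sketch only and assuming flake8 is installed in the active environment, can be driven from Python like this — the first invocation fails hard on syntax errors and undefined names, the second is purely advisory:)

    # Sketch: reproduce the CI lint gate locally (assumes flake8 is on PATH).
    import subprocess

    # Hard gate: fail on syntax errors and undefined names only.
    subprocess.run(
        ["flake8", ".", "--count", "--select=E9,F63,F7,F82",
         "--show-source", "--statistics"],
        check=True,
    )
    # Advisory pass: report everything else, but never fail the build.
    subprocess.run(
        ["flake8", ".", "--count", "--exit-zero", "--max-complexity=10",
         "--max-line-length=127", "--statistics"],
        check=False,
    )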
=====================================
.travis.yml
=====================================
@@ -0,0 +1,23 @@
+language: python
+
+python:
+ - "3.6"
+ - "3.7"
+ - "3.8"
+
+before_install:
+ - cp README.md README.rst
+ - pip install flake8
+
+install:
+ - pip install -e .
+
+script:
+ - bash scripts/test.sh
+ - flake8 nanoget/nanoget.py
+
+notifications:
+ email: false
+ webhooks:
+ urls:
+ - https://webhooks.gitter.im/e/4b1c45cea6826ce475c2
=====================================
README.md
=====================================
@@ -3,7 +3,6 @@ This module provides functions to extract useful metrics from Oxford Nanopore se
[![Twitter URL](https://img.shields.io/twitter/url/https/twitter.com/wouter_decoster.svg?style=social&label=Follow%20%40wouter_decoster)](https://twitter.com/wouter_decoster)
[![install with conda](https://anaconda.org/bioconda/nanoget/badges/installer/conda.svg)](https://anaconda.org/bioconda/nanoget)
-[![Build Status](https://travis-ci.org/wdecoster/nanoget.svg?branch=master)](https://travis-ci.org/wdecoster/nanoget)
## FUNCTIONS
=====================================
nanoget/extraction_functions.py
=====================================
@@ -11,7 +11,7 @@ from itertools import repeat
def process_summary(summaryfile, **kwargs):
- """Extracting information from an albacore summary file.
+ """Extracting information from an albacore/guppy/dorado summary file.
Only reads which have a >0 length are returned.
@@ -59,12 +59,18 @@ def process_summary(summaryfile, **kwargs):
37 kit
38 variant
"""
- logging.info("Nanoget: Collecting metrics from summary file {} for {} sequencing".format(
- summaryfile, kwargs["readtype"]))
+ logging.info(
+ f"Nanoget: Collecting metrics from summary file {summaryfile} for {kwargs['readtype']} sequencing"
+ )
ut.check_existance(summaryfile)
if kwargs["readtype"] == "1D":
- cols = ["channel", "start_time", "duration",
- "sequence_length_template", "mean_qscore_template"]
+ cols = [
+ "channel",
+ "start_time",
+ "duration",
+ "sequence_length_template",
+ "mean_qscore_template",
+ ]
elif kwargs["readtype"] in ["2D", "1D2"]:
cols = ["channel", "start_time", "duration", "sequence_length_2d", "mean_qscore_2d"]
if kwargs["barcoded"]:
@@ -77,10 +83,16 @@ def process_summary(summaryfile, **kwargs):
usecols=cols,
)
except ValueError:
- logging.error("Nanoget: did not find expected columns in summary file {}:\n {}".format(
- summaryfile, ', '.join(cols)))
- sys.exit("ERROR: expected columns in summary file {} not found:\n {}".format(
- summaryfile, ', '.join(cols)))
+ logging.error(
+ "Nanoget: did not find expected columns in summary file {}:\n {}".format(
+ summaryfile, ", ".join(cols)
+ )
+ )
+ sys.exit(
+ "ERROR: expected columns in summary file {} not found:\n {}".format(
+ summaryfile, ", ".join(cols)
+ )
+ )
if kwargs["barcoded"]:
datadf.columns = ["channelIDs", "time", "duration", "lengths", "quals", "barcode"]
else:
@@ -104,12 +116,15 @@ def check_bam(bam, samtype="bam"):
pysam.index(bam)
samfile = pysam.AlignmentFile(bam, "rb") # Need to reload the samfile after creating index
logging.info("Nanoget: No index for bam file could be found, created index.")
- if not samfile.header['HD']['SO'] == 'coordinate':
+ if not samfile.header["HD"]["SO"] == "coordinate":
logging.error("Nanoget: Bam file {} not sorted by coordinate!.".format(bam))
sys.exit("Please use a bam file sorted by coordinate.")
if samtype == "bam":
- logging.info("Nanoget: Bam file {} contains {} mapped and {} unmapped reads.".format(
- bam, samfile.mapped, samfile.unmapped))
+ logging.info(
+ "Nanoget: Bam file {} contains {} mapped and {} unmapped reads.".format(
+ bam, samfile.mapped, samfile.unmapped
+ )
+ )
if samfile.mapped == 0:
logging.error("Nanoget: Bam file {} does not contain aligned reads.".format(bam))
sys.exit("FATAL: not a single read was mapped in bam file {}".format(bam))
@@ -127,14 +142,18 @@ def process_ubam(bam, **kwargs):
# Need to reload the samfile after creating index
samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
logging.info("Nanoget: No index for bam file could be found, created index.")
- datadf = pd.DataFrame(
- data=[(read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
- for read in samfile.fetch(until_eof=True)],
- columns=["readIDs", "quals", "lengths"]) \
- .dropna(axis='columns', how='all') \
- .dropna(axis='index', how='any')
- logging.info("Nanoget: ubam {} contains {} reads.".format(
- bam, datadf["lengths"].size))
+ datadf = (
+ pd.DataFrame(
+ data=[
+ (read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
+ for read in samfile.fetch(until_eof=True)
+ ],
+ columns=["readIDs", "quals", "lengths"],
+ )
+ .dropna(axis="columns", how="all")
+ .dropna(axis="index", how="any")
+ )
+ logging.info("Nanoget: ubam {} contains {} reads.".format(bam, datadf["lengths"].size))
return ut.reduce_memory_usage(datadf)
@@ -154,28 +173,50 @@ def process_bam(bam, **kwargs):
logging.info("Nanoget: Starting to collect statistics from bam file {}.".format(bam))
samfile = check_bam(bam)
chromosomes = samfile.references
- if len(chromosomes) > 100 or kwargs["huge"]:
- logging.info("Nanoget: lots of contigs (>100) or --huge, not running in separate processes")
- datadf = pd.DataFrame(
- data=extract_from_bam(bam, None, kwargs["keep_supp"]),
- columns=["readIDs", "quals", "aligned_quals", "lengths",
- "aligned_lengths", "mapQ", "percentIdentity"]) \
- .dropna(axis='columns', how='all') \
- .dropna(axis='index', how='any')
+ if len(chromosomes) > 200 or kwargs["huge"]:
+ logging.info("Nanoget: lots of contigs (>200) or --huge, not running in separate processes")
+ datadf = (
+ pd.DataFrame(
+ data=extract_from_bam(bam, None, kwargs["keep_supp"]),
+ columns=[
+ "readIDs",
+ "quals",
+ "aligned_quals",
+ "lengths",
+ "aligned_lengths",
+ "mapQ",
+ "percentIdentity",
+ ],
+ )
+ .dropna(axis="columns", how="all")
+ .dropna(axis="index", how="any")
+ )
else:
unit = chromosomes
with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
- datadf = pd.DataFrame(
- data=[res for sublist in executor.map(extract_from_bam,
- repeat(bam),
- unit,
- repeat(kwargs["keep_supp"]))
- for res in sublist],
- columns=["readIDs", "quals", "aligned_quals", "lengths",
- "aligned_lengths", "mapQ", "percentIdentity"]) \
- .dropna(axis='columns', how='all') \
- .dropna(axis='index', how='any')
+ datadf = (
+ pd.DataFrame(
+ data=[
+ res
+ for sublist in executor.map(
+ extract_from_bam, repeat(bam), unit, repeat(kwargs["keep_supp"])
+ )
+ for res in sublist
+ ],
+ columns=[
+ "readIDs",
+ "quals",
+ "aligned_quals",
+ "lengths",
+ "aligned_lengths",
+ "mapQ",
+ "percentIdentity",
+ ],
+ )
+ .dropna(axis="columns", how="all")
+ .dropna(axis="index", how="any")
+ )
logging.info(f"Nanoget: bam {bam} contains {datadf['lengths'].size} primary alignments.")
return ut.reduce_memory_usage(datadf)
@@ -196,20 +237,34 @@ def process_cram(cram, **kwargs):
logging.info("Nanoget: Starting to collect statistics from cram file {}.".format(cram))
samfile = check_bam(cram, samtype="cram")
chromosomes = samfile.references
- if len(chromosomes) > 100:
+ if len(chromosomes) > 200:
unit = [None]
logging.info("Nanoget: lots of contigs (>100), not running in separate processes")
else:
unit = chromosomes
with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
- datadf = pd.DataFrame(
- data=[res for sublist in executor.map(extract_from_bam,
- repeat(cram), unit, repeat(kwargs["keep_supp"]))
- for res in sublist],
- columns=["readIDs", "quals", "aligned_quals", "lengths",
- "aligned_lengths", "mapQ", "percentIdentity"]) \
- .dropna(axis='columns', how='all') \
- .dropna(axis='index', how='any')
+ datadf = (
+ pd.DataFrame(
+ data=[
+ res
+ for sublist in executor.map(
+ extract_from_bam, repeat(cram), unit, repeat(kwargs["keep_supp"])
+ )
+ for res in sublist
+ ],
+ columns=[
+ "readIDs",
+ "quals",
+ "aligned_quals",
+ "lengths",
+ "aligned_lengths",
+ "mapQ",
+ "percentIdentity",
+ ],
+ )
+ .dropna(axis="columns", how="all")
+ .dropna(axis="index", how="any")
+ )
logging.info(f"Nanoget: cram {cram} contains {datadf['lengths'].size} primary alignments.")
return ut.reduce_memory_usage(datadf)
@@ -229,26 +284,32 @@ def extract_from_bam(bam, chromosome, keep_supplementary=True):
samfile = pysam.AlignmentFile(bam, "rb")
if keep_supplementary:
return [
- (read.query_name,
- ut.ave_qual(read.query_qualities),
- ut.ave_qual(read.query_alignment_qualities),
- read.query_length,
- read.query_alignment_length,
- read.mapping_quality,
- get_pID(read))
+ (
+ read.query_name,
+ ut.ave_qual(read.query_qualities),
+ ut.ave_qual(read.query_alignment_qualities),
+ read.query_length,
+ read.query_alignment_length,
+ read.mapping_quality,
+ get_pID(read),
+ )
for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
- if not read.is_secondary and not read.is_unmapped]
+ if not read.is_secondary and not read.is_unmapped
+ ]
else:
return [
- (read.query_name,
- ut.ave_qual(read.query_qualities),
- ut.ave_qual(read.query_alignment_qualities),
- read.query_length,
- read.query_alignment_length,
- read.mapping_quality,
- get_pID(read))
+ (
+ read.query_name,
+ ut.ave_qual(read.query_qualities),
+ ut.ave_qual(read.query_alignment_qualities),
+ read.query_length,
+ read.query_alignment_length,
+ read.mapping_quality,
+ get_pID(read),
+ )
for read in samfile.fetch(reference=chromosome, multiple_iterators=True)
- if not read.is_secondary and not read.is_unmapped and not read.is_supplementary]
+ if not read.is_secondary and not read.is_unmapped and not read.is_supplementary
+ ]
def get_pID(read):
@@ -267,8 +328,10 @@ def get_pID(read):
return (1 - read.get_tag("NM") / alignment_length) * 100
except KeyError:
try:
- return 100 * (1 - (parse_MD(read.get_tag("MD")) + parse_CIGAR(read.cigartuples)) /
- alignment_length)
+ return 100 * (
+ 1
+ - (parse_MD(read.get_tag("MD")) + parse_CIGAR(read.cigartuples)) / alignment_length
+ )
except KeyError:
return None
except ZeroDivisionError:
@@ -277,7 +340,7 @@ def get_pID(read):
def parse_MD(MDlist):
"""Parse MD string to get number of mismatches and deletions."""
- return sum([len(item) for item in re.split('[0-9^]', MDlist)])
+ return sum([len(item) for item in re.split("[0-9^]", MDlist)])
def parse_CIGAR(cigartuples):
@@ -293,40 +356,47 @@ def handle_compressed_input(inputfq, file_type="fastq"):
Relies on file extensions to recognize compression
"""
ut.check_existance(inputfq)
- if inputfq.endswith(('.gz', 'bgz')):
+ if inputfq.endswith((".gz", "bgz")):
import gzip
+
logging.info("Nanoget: Decompressing gzipped {} {}".format(file_type, inputfq))
- return gzip.open(inputfq, 'rt')
- elif inputfq.endswith('.bz2'):
+ return gzip.open(inputfq, "rt")
+ elif inputfq.endswith(".bz2"):
import bz2
+
logging.info("Nanoget: Decompressing bz2 compressed {} {}".format(file_type, inputfq))
- return bz2.open(inputfq, 'rt')
- elif inputfq.endswith(('.fastq', '.fq', 'fasta', '.fa', '.fas')):
- return open(inputfq, 'r')
+ return bz2.open(inputfq, "rt")
+ elif inputfq.endswith((".fastq", ".fq", "fasta", ".fa", ".fas")):
+ return open(inputfq, "r")
else:
logging.error("INPUT ERROR: Unrecognized file extension {}".format(inputfq))
- sys.exit('INPUT ERROR:\nUnrecognized file extension in {}\n'
- 'Supported are gz, bz2, bgz, fastq, fq, fasta, fa and fas'.format(inputfq))
+ sys.exit(
+ "INPUT ERROR:\nUnrecognized file extension in {}\n"
+ "Supported are gz, bz2, bgz, fastq, fq, fasta, fa and fas".format(inputfq)
+ )
def process_fasta(fasta, **kwargs):
"""Combine metrics extracted from a fasta file."""
logging.info("Nanoget: Starting to collect statistics from a fasta file.")
inputfasta = handle_compressed_input(fasta, file_type="fasta")
- return ut.reduce_memory_usage(pd.DataFrame(
- data=[len(rec) for rec in SeqIO.parse(inputfasta, "fasta")],
- columns=["lengths"]
- ).dropna())
+ return ut.reduce_memory_usage(
+ pd.DataFrame(
+ data=[len(rec) for rec in SeqIO.parse(inputfasta, "fasta")], columns=["lengths"]
+ ).dropna()
+ )
def process_fastq_plain(fastq, **kwargs):
"""Combine metrics extracted from a fastq file."""
logging.info("Nanoget: Starting to collect statistics from plain fastq file.")
inputfastq = handle_compressed_input(fastq)
- return ut.reduce_memory_usage(pd.DataFrame(
- data=[res for res in extract_from_fastq(inputfastq) if res],
- columns=["quals", "lengths"]
- ).dropna())
+ return ut.reduce_memory_usage(
+ pd.DataFrame(
+ data=[res for res in extract_from_fastq(inputfastq) if res],
+ columns=["quals", "lengths"],
+ ).dropna()
+ )
def extract_from_fastq(fq):
@@ -359,15 +429,12 @@ def extract_all_from_fastq(rec):
Return identifier, read length, average quality and median quality
"""
- return (rec.id,
- len(rec),
- ut.ave_qual(rec.letter_annotations["phred_quality"]),
- None)
+ return (rec.id, len(rec), ut.ave_qual(rec.letter_annotations["phred_quality"]), None)
def info_to_dict(info):
"""Get the key-value pairs from the albacore/minknow fastq description and return dict"""
- return {field.split('=')[0]: field.split('=')[1] for field in info.split(' ')[1:]}
+ return {field.split("=")[0]: field.split("=")[1] for field in info.split(" ")[1:]}
def process_fastq_rich(fastq, **kwargs):
@@ -389,19 +456,24 @@ def process_fastq_rich(fastq, **kwargs):
try:
read_info = info_to_dict(record.description)
res.append(
- (ut.ave_qual(record.letter_annotations["phred_quality"]),
- len(record),
- read_info["ch"],
- read_info["start_time"],
- read_info["runid"]))
+ (
+ ut.ave_qual(record.letter_annotations["phred_quality"]),
+ len(record),
+ read_info["ch"],
+ read_info["start_time"],
+ read_info["runid"],
+ )
+ )
except KeyError:
- logging.error("Nanoget: keyerror when processing record {}".format(record.description))
- sys.exit("Unexpected fastq identifier:\n{}\n\n \
- missing one or more of expected fields 'ch', 'start_time' or 'runid'".format(
- record.description))
+ logging.error(f"Nanoget: keyerror when processing record {record.description}")
+ sys.exit(
+ f"Unexpected fastq identifier:\n{record.description}\n\n \
+ missing one or more of expected fields 'ch', 'start_time' or 'runid'"
+ )
df = pd.DataFrame(
- data=res,
- columns=["quals", "lengths", "channelIDs", "timestamp", "runIDs"]).dropna()
+ data=res, columns=["quals", "lengths", "channelIDs", "timestamp", "runIDs"]
+ ).dropna()
+ df["timestamp"] = pd.to_datetime(df["timestamp"], format='mixed', utc=True)
df["channelIDs"] = df["channelIDs"].astype("int64")
return ut.reduce_memory_usage(df)
@@ -412,29 +484,29 @@ def readfq(fp):
while True: # mimic closure; is it a bad idea?
if not last: # the first record or a record following a fastq
for l in fp: # search for the start of the next record
- if l[0] in '>@': # fasta/q header line
+ if l[0] in ">@": # fasta/q header line
last = l[:-1] # save this line
break
if not last:
break
name, seqs, last = last[1:].partition(" ")[0], [], None
for l in fp: # read the sequence
- if l[0] in '@+>':
+ if l[0] in "@+>":
last = l[:-1]
break
seqs.append(l[:-1])
- if not last or last[0] != '+': # this is a fasta record
- yield name, ''.join(seqs), None # yield a fasta record
+ if not last or last[0] != "+": # this is a fasta record
+ yield name, "".join(seqs), None # yield a fasta record
if not last:
break
else: # this is a fastq record
- seq, leng, seqs = ''.join(seqs), 0, []
+ seq, leng, seqs = "".join(seqs), 0, []
for l in fp: # read the quality
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq): # have read enough quality
last = None
- yield name, seq, ''.join(seqs) # yield a fastq record
+ yield name, seq, "".join(seqs) # yield a fastq record
break
if last: # reach EOF before reading enough quality
yield name, seq, None # yield a fasta record instead
@@ -464,10 +536,10 @@ def process_fastq_minimal(fastq, **kwargs):
infastq = handle_compressed_input(fastq)
try:
df = pd.DataFrame(
- data=[rec for rec in fq_minimal(infastq) if rec],
- columns=["timestamp", "lengths"]
+ data=[rec for rec in fq_minimal(infastq) if rec], columns=["timestamp", "lengths"]
)
except IndexError:
logging.error("Fatal: Incorrect file structure for fastq_minimal")
sys.exit("Error: file does not match expected structure for fastq_minimal")
+ df["timestamp"] = pd.to_datetime(df["timestamp"])
return ut.reduce_memory_usage(df)
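
(The main functional change in this file is the explicit timestamp parsing in process_fastq_rich and process_fastq_minimal: pd.to_datetime(..., format='mixed', utc=True) accepts per-element formats, which is why setup.py further down bumps the requirement to pandas>=2.0.0 — format='mixed' does not exist in older pandas. A minimal sketch of that behaviour, using made-up timestamp strings:)

    # Sketch only: illustrates the format='mixed' parsing used above (requires pandas >= 2.0).
    import pandas as pd

    stamps = pd.Series([
        "2023-11-15T10:51:47Z",        # ISO 8601 with Zulu suffix (example value)
        "2023-11-15 10:51:47+01:00",   # space-separated with explicit offset (example value)
    ])
    parsed = pd.to_datetime(stamps, format="mixed", utc=True)
    print(parsed.dtype)  # datetime64[ns, UTC]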
=====================================
nanoget/nanoget.py
=====================================
@@ -26,8 +26,17 @@ import concurrent.futures as cfutures
import nanoget.extraction_functions as ex
-def get_input(source, files, threads=4, readtype="1D",
- combine="simple", names=None, barcoded=False, huge=False, keep_supp=True):
+def get_input(
+ source,
+ files,
+ threads=4,
+ readtype="1D",
+ combine="simple",
+ names=None,
+ barcoded=False,
+ huge=False,
+ keep_supp=True,
+):
"""Get input and process accordingly.
Data can be:
@@ -55,14 +64,15 @@ def get_input(source, files, threads=4, readtype="1D",
files, or None
"""
proc_functions = {
- 'fastq': ex.process_fastq_plain,
- 'fasta': ex.process_fasta,
- 'bam': ex.process_bam,
- 'summary': ex.process_summary,
- 'fastq_rich': ex.process_fastq_rich,
- 'fastq_minimal': ex.process_fastq_minimal,
- 'cram': ex.process_cram,
- 'ubam': ex.process_ubam, }
+ "fastq": ex.process_fastq_plain,
+ "fasta": ex.process_fasta,
+ "bam": ex.process_bam,
+ "summary": ex.process_summary,
+ "fastq_rich": ex.process_fastq_rich,
+ "fastq_minimal": ex.process_fastq_minimal,
+ "cram": ex.process_cram,
+ "ubam": ex.process_ubam,
+ }
if source not in proc_functions.keys():
logging.error("nanoget: Unsupported data source: {}".format(source))
@@ -73,29 +83,36 @@ def get_input(source, files, threads=4, readtype="1D",
logging.info("nanoget: Running with a single huge input file.")
if not len(files) == 1:
logging.error("nanoget: Using multiple huge input files is currently not supported.")
- sys.exit("Using multiple huge input files is currently not supported.\n"
- "Please let me know on GitHub if that's of interest for your application.\n")
-
- datadf = proc_functions[source](files[0],
- threads=threadsleft,
- readtype=readtype,
- barcoded=barcoded,
- keep_supp=keep_supp,
- huge=True)
+ sys.exit(
+ "Using multiple huge input files is currently not supported.\n"
+ "Please let me know on GitHub if that's of interest for your application.\n"
+ )
+
+ datadf = proc_functions[source](
+ files[0],
+ threads=threadsleft,
+ readtype=readtype,
+ barcoded=barcoded,
+ keep_supp=keep_supp,
+ huge=True,
+ )
else:
with cfutures.ProcessPoolExecutor(max_workers=filethreads) as executor:
- extraction_function = partial(proc_functions[source],
- threads=threadsleft,
- readtype=readtype,
- barcoded=barcoded,
- keep_supp=keep_supp,
- huge=False)
+ extraction_function = partial(
+ proc_functions[source],
+ threads=threadsleft,
+ readtype=readtype,
+ barcoded=barcoded,
+ keep_supp=keep_supp,
+ huge=False,
+ )
datadf = combine_dfs(
dfs=[out for out in executor.map(extraction_function, files)],
names=names or files,
- method=combine)
+ method=combine,
+ )
if "readIDs" in datadf.columns and pd.isna(datadf["readIDs"]).any():
- datadf.drop("readIDs", axis='columns', inplace=True)
+ datadf.drop("readIDs", axis="columns", inplace=True)
datadf = calculate_start_time(datadf)
logging.info("Nanoget: Gathered all metrics of {} reads".format(len(datadf)))
if len(datadf) == 0:
@@ -105,14 +122,13 @@ def get_input(source, files, threads=4, readtype="1D",
return datadf
-def combine_dfs(dfs, names=None, method='simple'):
+def combine_dfs(dfs, names=None, method="simple"):
"""Combine dataframes.
Combination is either done simple by just concatenating the DataFrames
or performs tracking by adding the name of the dataset as a column."""
if method == "track":
- return pd.concat([df.assign(dataset=n) for df, n in zip(dfs, names)],
- ignore_index=True)
+ return pd.concat([df.assign(dataset=n) for df, n in zip(dfs, names)], ignore_index=True)
elif method == "simple":
return pd.concat(dfs, ignore_index=True)
@@ -132,16 +148,17 @@ def calculate_start_time(df):
subtraction is done per dataset
"""
if "time" in df.columns:
- df["time_arr"] = pd.Series(df["time"], dtype='datetime64[s]')
+ df["time_arr"] = pd.Series(df["time"], dtype="datetime64[s]")
elif "timestamp" in df.columns:
- df["time_arr"] = pd.Series(df["timestamp"], dtype="datetime64[ns]")
+ df["time_arr"] = df["timestamp"]
else:
return df
if "dataset" in df.columns:
for dset in df["dataset"].unique():
time_zero = df.loc[df["dataset"] == dset, "time_arr"].min()
- df.loc[df["dataset"] == dset, "start_time"] = \
+ df.loc[df["dataset"] == dset, "start_time"] = (
df.loc[df["dataset"] == dset, "time_arr"] - time_zero
+ )
else:
df["start_time"] = df["time_arr"] - df["time_arr"].min()
return df.drop(["time", "timestamp", "time_arr"], axis=1, errors="ignore")
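
(For reference, the reformatted get_input() above is the library's main entry point; a minimal sketch of calling it, with the path borrowed from scripts/test.py below, so it assumes the nanotest data repository has been cloned into the working directory:)

    # Sketch: typical nanoget entry point, mirroring scripts/test.py (assumes ./nanotest exists).
    import nanoget

    df = nanoget.get_input("summary", ["nanotest/sequencing_summary.txt"], combine="track")
    print(df.columns.tolist())        # e.g. lengths/quals plus start_time added by calculate_start_time
    print(df["lengths"].describe())   # quick look at the read-length distribution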
=====================================
nanoget/version.py
=====================================
@@ -1 +1 @@
-__version__ = "1.16.1"
+__version__ = "1.19.3"
=====================================
scripts/test.py
=====================================
@@ -6,11 +6,12 @@ def run_tests():
nanoget.get_input("bam", ["nanotest/alignment.bam"])
nanoget.get_input("bam", ["nanotest/alignment.bam"], keep_supp=False)
nanoget.get_input("fastq_rich", ["nanotest/reads.fastq.gz"])
+ nanoget.get_input("fastq_rich", ["nanotest/reads-mixed-timestamp.fastq"])
nanoget.get_input("summary", ["nanotest/sequencing_summary.txt"], combine="track")
nanoget.get_input("fastq_minimal", ["nanotest/reads.fastq.gz"])
nanoget.get_input("fastq", ["nanotest/reads.fastq.gz"])
nanoget.get_input("fasta", ["nanotest/reads.fa.gz"])
-if __name__ == '__main__':
+if __name__ == "__main__":
run_tests()
=====================================
scripts/test.sh
=====================================
@@ -1,5 +1,9 @@
set -ev
-git clone https://github.com/wdecoster/nanotest.git
+if [ -d "nanotest" ]; then
+ echo "nanotest already cloned"
+else
+ git clone https://github.com/wdecoster/nanotest.git
+fi
python scripts/test.py
=====================================
setup.py
=====================================
@@ -12,7 +12,7 @@ exec(open('nanoget/version.py').read())
setup(
name='nanoget',
- version=__version__,
+ version=__version__, # noqa: F821
description='Functions to extract information from Oxford Nanopore sequencing data and alignments.',
long_description=open(path.join(here, "README.md")).read(),
long_description_content_type="text/markdown",
@@ -33,7 +33,7 @@ setup(
keywords='nanopore sequencing plotting quality control',
python_requires='>=3',
packages=find_packages() + ['scripts'],
- install_requires=['pandas>=0.22.0',
+ install_requires=['pandas>=2.0.0',
'numpy',
'biopython',
'pysam>0.10.0.0'],
View it on GitLab: https://salsa.debian.org/med-team/python-nanoget/-/commit/a8a632efcda09b5cb73ebea148becb28a481751f