[med-svn] [Git][med-team/pychopper][upstream] New upstream version 2.7.2
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Sat Nov 19 15:47:47 GMT 2022
Nilesh Patra pushed to branch upstream at Debian Med / pychopper
Commits:
4f6485e9 by Nilesh Patra at 2022-11-19T17:09:57+05:30
New upstream version 2.7.2
- - - - -
29 changed files:
- − .circleci/config.yml
- .gitignore
- .gitlab-ci.yml
- + CHANGELOG.md
- Makefile
- + MakefileOld
- README.md
- + conda/meta.yaml
- evaluation/Makefile
- + evaluation/data/SIRV_E0_pcs109_25k.fq.gz
- pychopper/__init__.py
- pychopper/chopper.py
- pychopper/common_structures.py
- pychopper/edlib_backend.py
- + pychopper/primer_data/PCS111_primers.fas
- scripts/__init__.py → pychopper/scripts/__init__.py
- scripts/cdna_classifier.py → pychopper/scripts/pychopper.py
- pychopper/seq_utils.py
- + pychopper/tests/data/PCS111_umi_test_reads.fastq.gz
- + pychopper/tests/data/PCS111_umi_test_reads_expected.fastq
- − pychopper/tests/data/ref.fas
- + pychopper/tests/data/ref.fas.gz
- − pychopper/tests/data/ref.fq
- + pychopper/tests/data/ref.fq.gz
- pychopper/tests/test_regression_simple.py
- pychopper/utils.py
- requirements.txt
- − setup.cfg
- setup.py
Changes:
=====================================
.circleci/config.yml deleted
=====================================
@@ -1,57 +0,0 @@
-# Python CircleCI 2.0 configuration file
-#
-# Check https://circleci.com/docs/2.0/language-python/ for more details
-#
-version: 2
-jobs:
- build:
- docker:
- # specify the version you desire here
- # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers`
- - image: circleci/python:3.6.1
-
- # Specify service dependencies here if necessary
- # CircleCI maintains a library of pre-built images
- # documented at https://circleci.com/docs/2.0/circleci-images/
- # - image: circleci/postgres:9.4
-
- working_directory: ~/nanoporetech/pychopper
-
- steps:
- - checkout
-
- # Download and cache dependencies
- - restore_cache:
- keys:
- - v1-dependencies-{{ checksum "requirements.txt" }}
- # fallback to using the latest cache if no exact match is found
- - v1-dependencies-
-
- - run:
- name: install dependencies
- command: |
- python3 -m venv venv
- . venv/bin/activate
- pip install --upgrade pip setuptools wheel
- pip install -r requirements.txt
- pip install -e .
-
- - save_cache:
- paths:
- - ./venv
- key: v1-dependencies-{{ checksum "requirements.txt" }}
-
- # run tests!
- # this example uses Django's built-in test-runner
- # other common Python testing frameworks include pytest and nose
- # https://pytest.org
- # https://nose.readthedocs.io
- - run:
- name: run tests
- command: |
- . venv/bin/activate
- py.test
-
- - store_artifacts:
- path: test-reports
- destination: test-reports
=====================================
.gitignore
=====================================
@@ -1,3 +1,4 @@
+test/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -63,4 +64,4 @@ target/
.idea/
._.DS_Store
*.DS_Store
-*.pyc
\ No newline at end of file
+*.pyc
=====================================
.gitlab-ci.yml
=====================================
@@ -1,25 +1,74 @@
-image: ubuntu:xenial
+include:
+ - project: "epi2melabs/ci-templates"
+ file: "push-github.yaml"
+ - project: "epi2melabs/ci-templates"
+ file: "push-conda.yaml"
+
+image: ${UBUNTUIMAGE}:20.04
+
+.prep-image: &prep-image |
+ export DEBIAN_FRONTEND=noninteractive
+ apt update -qq
+ apt install -y --no-install-recommends make wget python3-all-dev python3-venv
+
stages:
- test
- - pages
+ - build
+ - prerelease
- release
+ - postrelease
+
+# Insist that the version in __init__.py matches the git tag
+.check-versions: &check-versions |
+ PYVER="v"$(grep "__version__ = " ${CI_PROJECT_NAME}/__init__.py | awk '{gsub("\"","",$3); print $3}')
+ TAGVER=${CI_COMMIT_TAG}
+ if [[ "${PYVER}" != "${TAGVER}" ]]; then
+ echo "Mismatching TAG and PACKAGE versions:"
+ echo " - TAG:'$TAGVER'"
+ echo " - PACKAGE:'$TAGVER'"
+ exit 1
+ else
+ echo "TAG and PACKAGE versions agree: '${PYVER}'"
+ fi
+
+# Insist a CHANGELOG entry has been made for tags
+.check-changelog: &check-changelog |
+ TAGVER=${CI_COMMIT_TAG}
+ MATCHES=$(grep -c "## \[${TAGVER}\]" CHANGELOG.md || exit 0)
+ if [[ "${MATCHES}" != "1" ]]; then
+ echo "Expected one match to '${CI_COMMIT_TAG}' in CHANGELOG, found ${MATCHES}"
+ exit 1
+ else
+ echo "Found CHANGELOG.md entry for tag"
+ fi
+
+test:
+ stage: test
+ script:
+ - *prep-image
+ - make test
+ - make docs
+ - make sdist
+ artifacts:
+ paths:
+ - dist/*.tar.gz
+
+deploy-checks:
+ stage: prerelease
+ script:
+ - *check-versions
+ - *check-changelog
+ rules:
+ - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/'
-before_script:
- - apt-get update
- - apt-get remove python
- - apt-get install -y python3 python3-pip make python3-numpy python3-matplotlib hmmer
- - update-alternatives --install /usr/bin/python python /usr/bin/python3 10
- - alias python=python3; pip3 install --upgrade pip
- - hash -r pip3
- - alias python=python3; pip3 install sphinx sphinx-argparse sphinx_rtd_theme pytest pandas
- - alias python=python3; pip3 install -e ./
-
-
-do_testing:
- stage: test
- script:
- - alias python=python3; make test
- except:
- - tags
-
+conda:
+ tags:
+ - large_ram
+ extends:
+ - .deploy-conda
+ before_script:
+ - *prep-image
+ - export CONDA_PKG=${CI_PROJECT_NAME}
+ - export CONDA_PKG_VERSION=${CI_COMMIT_TAG/v/}
+ - cd conda
=====================================
CHANGELOG.md
=====================================
@@ -0,0 +1,17 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [v2.7.2]
+### Added
+- umi detection and extraction
+
+## [v2.7.1]
+### Changed
+- Install instructions
+
+## [v2.7.0]
+### Added
+- PCS111 cDNA sequencing kit added
=====================================
Makefile
=====================================
@@ -1,72 +1,51 @@
-MODULE=pychopper
-
-.PHONY: clean clean-test clean-pyc clean-build docs com help
-
-.DEFAULT_GOAL := help
-
-define PRINT_HELP_PYSCRIPT
-import re, sys
-
-for line in sys.stdin:
- match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
- if match:
- target, help = match.groups()
- print("%-20s %s" % (target, help))
-endef
-export PRINT_HELP_PYSCRIPT
-
-help:
- @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
-
-clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
-
-
-clean-build: ## remove build artifacts
- rm -fr build/
- rm -fr dist/
- rm -fr .eggs/
- find . -name '*.egg-info' -exec rm -fr {} +
- find . -name '*.egg' -exec rm -f {} +
-
-clean-pyc: ## remove Python file artifacts
- find . -name '*.pyc' -exec rm -f {} +
- find . -name '*.pyo' -exec rm -f {} +
- find . -name '*~' -exec rm -f {} +
- find . -name '__pycache__' -exec rm -fr {} +
-
-clean-test: ## remove test and coverage artifacts
- rm -f .coverage
- rm -fr htmlcov/
-
-lint: ## check style with flake8
- @(flake8 --max-line-length=120 $(MODULE) | grep -v "E501 line too long") || true
- @(flake8 --max-line-length=120 scripts/*.py | grep -v "E501 line too long") || true
-
-test: ## run tests quickly with the default Python
- py.test
-
-coverage: ## check code coverage quickly with the default Python
- coverage run --source $(MODULE) --omit="*/tests/*,*__init__.py" `which py.test`
- coverage report -m --omit="*/tests/*,*__init__.py"
- coverage html
-
-docs: ## generate Sphinx HTML documentation, including API docs
- @cd docs; make clean html
-
-servedocs: docs ## compile the docs watching for changes
- watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
-
-release: clean ## package and upload a release
- python setup.py sdist upload
- python setup.py bdist_wheel upload
-
-dist: clean ## builds source and wheel package
- python setup.py sdist
- python setup.py bdist_wheel
- ls -l dist
-
-install: clean ## install the package to the active Python's site-packages
- python setup.py install
-
-com: ## commit all changes to git
- git commit -a
+.PHONY: develop docs
+
+PYTHON ?= python3
+
+IN_VENV=. ./venv/bin/activate
+
+venv/bin/activate:
+ test -d venv || $(PYTHON) -m venv venv
+ ${IN_VENV} && pip install pip --upgrade
+ ${IN_VENV} && pip install -r requirements.txt
+
+develop: venv/bin/activate
+ ${IN_VENV} && python setup.py develop
+
+test: venv/bin/activate
+# ${IN_VENV} && pip install flake8 flake8-rst-docstrings flake8-docstrings flake8-import-order flake8-forbid-visual-indent
+# ${IN_VENV} && flake8 pychopper \
+# --import-order-style google --application-import-names pychopper \
+# --statistics
+ # demo should run without error
+ ${IN_VENV} && python setup.py install
+
+IN_BUILD=. ./pypi_build/bin/activate
+pypi_build/bin/activate:
+ test -d pypi_build || $(PYTHON) -m venv pypi_build --prompt "(pypi) "
+ ${IN_BUILD} && pip install pip --upgrade
+ ${IN_BUILD} && pip install --upgrade pip setuptools twine wheel readme_renderer[md] keyrings.alt
+
+.PHONY: sdist
+sdist: pypi_build/bin/activate
+ ${IN_BUILD} && python setup.py sdist
+
+.PHONY: clean
+clean:
+ rm -rf __pycache__ dist build venv pychopper.egg-info tmp docs/_build
+
+# Documentation
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+BUILDDIR = _build
+PAPER =
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+DOCSRC = docs
+docs: venv/bin/activate
+ ${IN_VENV} && pip install sphinx sphinx_rtd_theme sphinx-argparse
+ ${IN_VENV} && cd $(DOCSRC) && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(DOCSRC)/$(BUILDDIR)/html."
+ touch $(DOCSRC)/$(BUILDDIR)/html/.nojekyll
=====================================
MakefileOld
=====================================
@@ -0,0 +1,72 @@
+MODULE=pychopper
+
+.PHONY: clean clean-test clean-pyc clean-build docs com help
+
+.DEFAULT_GOAL := help
+
+define PRINT_HELP_PYSCRIPT
+import re, sys
+
+for line in sys.stdin:
+ match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
+ if match:
+ target, help = match.groups()
+ print("%-20s %s" % (target, help))
+endef
+export PRINT_HELP_PYSCRIPT
+
+help:
+ @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
+
+clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
+
+
+clean-build: ## remove build artifacts
+ rm -fr build/
+ rm -fr dist/
+ rm -fr .eggs/
+ find . -name '*.egg-info' -exec rm -fr {} +
+ find . -name '*.egg' -exec rm -f {} +
+
+clean-pyc: ## remove Python file artifacts
+ find . -name '*.pyc' -exec rm -f {} +
+ find . -name '*.pyo' -exec rm -f {} +
+ find . -name '*~' -exec rm -f {} +
+ find . -name '__pycache__' -exec rm -fr {} +
+
+clean-test: ## remove test and coverage artifacts
+ rm -f .coverage
+ rm -fr htmlcov/
+
+lint: ## check style with flake8
+ @(flake8 --max-line-length=120 $(MODULE) | grep -v "E501 line too long") || true
+ @(flake8 --max-line-length=120 scripts/*.py | grep -v "E501 line too long") || true
+
+test: ## run tests quickly with the default Python
+ py.test
+
+coverage: ## check code coverage quickly with the default Python
+ coverage run --source $(MODULE) --omit="*/tests/*,*__init__.py" `which py.test`
+ coverage report -m --omit="*/tests/*,*__init__.py"
+ coverage html
+
+docs: ## generate Sphinx HTML documentation, including API docs
+ @cd docs; make clean html
+
+servedocs: docs ## compile the docs watching for changes
+ watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
+
+release: clean ## package and upload a release
+ python setup.py sdist upload
+ python setup.py bdist_wheel upload
+
+dist: clean ## builds source and wheel package
+ python setup.py sdist
+ python setup.py bdist_wheel
+ ls -l dist
+
+install: clean ## install the package to the active Python's site-packages
+ python setup.py install
+
+com: ## commit all changes to git
+ git commit -a
=====================================
README.md
=====================================
@@ -4,7 +4,6 @@
Pychopper
=========
-[![CircleCI](https://circleci.com/gh/nanoporetech/pychopper.svg?style=svg)](https://circleci.com/gh/nanoporetech/pychopper) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pychopper/README.html)
Pychopper v2 is a tool to identify, orient and trim full-length Nanopore cDNA reads. The tool is also able to rescue fused reads.
@@ -23,41 +22,18 @@ The general approach of Pychopper v2 is the following:
Getting Started
================
-## Dependencies
-
-The required Python packages are installed by either `pip` or `conda`. The profile HMM alignment backend depends on the latest [hmmer](http://hmmer.org/) package.
-This can be easily installed using conda:
-
-```bash
-conda install -c bioconda "hmmer>=3.0"
-```
-
## Installation
-Install via pip:
+Install using conda :
```bash
-pip install git+https://github.com/nanoporetech/pychopper.git
+conda install -c epi2melabs -c conda-forge -c bioconda "epi2melabs::pychopper"
```
-Or install from bioconda:
-
-```bash
-conda install -c bioconda "pychopper>=2.0"
-```
-
-Run the tests:
-
-```bash
-make test
-```
-
-Issue `make help` to get a list of `make` targets.
-
## Usage
```
-usage: cdna_classifier.py [-h] [-b primers] [-g phmm_file] [-c config_file]
+usage: pychopper [-h] [-b primers] [-g phmm_file] [-c config_file]
[-k kit] [-q cutoff] [-Q min_qual] [-z min_len]
[-r report_pdf] [-u unclass_output]
[-l len_fail_output] [-w rescue_output]
@@ -83,7 +59,7 @@ optional arguments:
-q cutoff Cutoff parameter (autotuned).
-Q min_qual Minimum mean base quality (7.0).
-z min_len Minimum segment length (50).
- -r report_pdf Report PDF (cdna_classifier_report.pdf).
+ -r report_pdf Report PDF (pychopper_report.pdf).
-u unclass_output Write unclassified reads to this file.
-l len_fail_output Write fragments failing the length filter in this file.
-w rescue_output Write rescued reads to this file.
@@ -100,6 +76,9 @@ optional arguments:
-t threads Number of threads to use (8).
-B batch_size Maximum number of reads processed in each batch
(1000000).
+ -y fastq_comments Use with minimap2 -y to pass UMI and additional info into BAM file (false).
+ -U umi Detect umis.
+
```
*WARNING: Do not turn on trimming during basecalling as it will remove the primers needed for classifying the reads!*
@@ -109,18 +88,18 @@ optional arguments:
Example usage with default PCS109/DCS109 primers using the default pHMM backend:
```bash
-cdna_classifier.py -r report.pdf -u unclassified.fq -w rescued.fq input.fq full_length_output.fq
+pychopper -r report.pdf -u unclassified.fq -w rescued.fq input.fq full_length_output.fq
```
Example usage with default PCS109/DCS109 primers using the edlib/parasail backend:
```bash
-cdna_classifier.py -m edlib -r report.pdf -u unclassified.fq -w rescued.fq input.fq full_length_output.fq
+pychopper -m edlib -r report.pdf -u unclassified.fq -w rescued.fq input.fq full_length_output.fq
```
Example usage with default PCS109/DCS109 primers using the default pHMM backend:
```bash
-cdna_classifier.py -r report.pdf -A aln_hits.bed -S statistics.tsv -u unclassified.fq -w rescued.fq input.fq full_length_output.fq
+pychopper -r report.pdf -A aln_hits.bed -S statistics.tsv -u unclassified.fq -w rescued.fq input.fq full_length_output.fq
```
### Advanced usage
@@ -128,51 +107,36 @@ cdna_classifier.py -r report.pdf -A aln_hits.bed -S statistics.tsv -u unclassifi
The fasta files with custom primers used by the `edlib/parasail` backend can be specified through `-b`, while the valid primer configurations are specified through `-c`:
```bash
-cdna_classifier.py -m edlib -b custom_pimers.fas -c primer_config.txt input.fq full_length_output.fq
+pychopper -m edlib -b custom_pimers.fas -c primer_config.txt input.fq full_length_output.fq
```
Where the contents of `primer_config.txt` looks like `+:MySSP,-MyVNP|-:MyVNP,-MySSP`.
The `pHMM` alignment backend takes a "compressed" profile HMM trained from a multiple sequence alignment using the [hmmer](http://hmmer.org/) package. Custom profile HMMs can be trained from a fastq of reads and a fasta file with the primer sequences using the [hammerpede](https://github.com/nanoporetech/hammerpede) package. The path to the custom profile HMM can be specified using `-g`:
```bash
-cdna_classifier.py -m phmm -g MySSP_MyVNP.hmm -c primer_config.txt input.fq full_length_output.fq
+pychopper -m phmm -g MySSP_MyVNP.hmm -c primer_config.txt input.fq full_length_output.fq
```
-Evaluation
-==========
-
-## Performance on SIRV E0 mix spike-in data
-
-Evaluation on 50k reads from a [SIRV](https://www.lexogen.com/sirvs) E0 mix dataset produced by the PCS109 protocol indicated good performance using both backends:
-
-- More than 85% of the reads were classified, while the percent of classified and rescued reads was higher than 90%:
-
-![sirv_stats](/evaluation/img/sirv_stats.png)
-
-- The oriented reads came from the + and - strands in a roughly 1:1 proportion as expected:
-
-![sirv_strand](/evaluation/img/sirv_strand.png)
-
-- When mapping the oriented reads to the transcriptome, virtually all of them map in the forward direction as expected:
-
-![sirv_map_strand](/evaluation/img/sirv_map_strand.png)
-
-- Comparing the percent of reads covered by alignment before and after trimming, we observe that trimming removed the adapters and the primers:
-
-![sirv_read_cov](/evaluation/img/sirv_read_cov.png)
-
-- Comparing the percent of reference transcripts covered by alignment before and after trimming, we can observe that trimming did not change its value in most of the cases, hence it only rarely removed valid sequence portions:
-
-![sirv_ref_cov](/evaluation/img/sirv_ref_cov.png)
-
-The evaluation can be re-run by issuing `make` from the `evaluation` directory.
-
-Contributing
-================
+### UMI detection
+Detect umis in input reads using `-U`
+#### FASTQ output example:
+```bash
+pychopper -U -k PCS111 -m edlib pychopper/tests/data/PCS111_umi_test_reads.fastq -
+```
+will add:
+```
+umi=TTTGCCATTGAAATTAGCGTTCGCCTT
+```
+to the FASTQ header in the output file.
-- Please fork the repository and create a merge request to contribute.
-- Use [bumpversion](https://github.com/peritus/bumpversion) to manage package versioning.
-- The code should be [PEP8](https://www.python.org/dev/peps/pep-0008) compliant, which can be tested by `make lint`.
+#### BAM output example:
+```bash
+pychopper -U -y -k PCS111 -m edlib pychopper/tests/data/PCS111_umi_test_reads.fastq - | minimap2 -y ...
+```
+will create a BAM file with the following tags containing the UMI:
+```
+RX:Z:TTTGCCATTGAAATTAGCGTTCGCCTT
+```
Help
====
=====================================
conda/meta.yaml
=====================================
@@ -0,0 +1,49 @@
+
+# this doesn't work
+# {% set data = load_setup_py_data() %}
+
+package:
+ name: {{ environ.get('CONDA_PKG') }}
+ version: {{ environ.get('CONDA_PKG_VERSION') }}
+
+source:
+ path: ../
+
+build:
+ number: 0
+ script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv"
+ noarch: python
+
+requirements:
+ host:
+ - pip
+ - python
+ run:
+ - python
+ - python-edlib
+ - parasail-python
+ - hmmer
+ - tqdm==4.26.0
+ - matplotlib-base
+ - seaborn
+ - six
+ - pandas
+ - numpy
+
+test:
+ commands:
+ - pychopper --help
+
+about:
+ home: "https://github.com/epi2me-labs/pychopper"
+ license: Mozilla Public License 2.0
+ license_family: OTHER
+ license_file: LICENSE.md
+ summary: "A tool to identify, orient and rescue full length cDNA reads from nanopore data."
+ doc_url: https://github.com/epi2me-labs/pychopper
+ dev_url: https://github.com/epi2me-labs/pychopper
+
+extra:
+ recipe-maintainers:
+ - cjw85
+
=====================================
evaluation/Makefile
=====================================
@@ -13,10 +13,10 @@ all:
@echo Reads are at: $(REF)
# seqkit sample -n $(NR_READS) $(READS) > $(IN)
./scripts/process_reads.sh $(REF) $(IN) $(CORES) RAW
- cdna_classifier.py -S phmm_report.tsv -A phmm_hits.bed -w $(RES_PHMM) -t $(CORES) -m phmm -Y $(TUN_SAMPLE) -r phmm_pychopper_report.pdf $(IN) $(IN_PHMM)
+ pychopper -S phmm_report.tsv -A phmm_hits.bed -w $(RES_PHMM) -t $(CORES) -m phmm -Y $(TUN_SAMPLE) -r phmm_pychopper_report.pdf $(IN) $(IN_PHMM)
./scripts/process_reads.sh $(REF) $(IN_PHMM) $(CORES) TRIM_PHMM
./scripts/process_reads.sh $(REF) $(RES_PHMM) $(CORES) RES_PHMM
- cdna_classifier.py -S edlib_report.tsv -A edlib_hits.bed -w $(RES_EDLIB) -t $(CORES) -m edlib -Y $(TUN_SAMPLE) -r edlib_pychopper_report.pdf $(IN) $(IN_EDLIB)
+ pychopper -S edlib_report.tsv -A edlib_hits.bed -w $(RES_EDLIB) -t $(CORES) -m edlib -Y $(TUN_SAMPLE) -r edlib_pychopper_report.pdf $(IN) $(IN_EDLIB)
./scripts/process_reads.sh $(REF) $(IN_EDLIB) $(CORES) TRIM_EDLIB
./scripts/process_reads.sh $(REF) $(RES_EDLIB) $(CORES) RES_EDLIB
=====================================
evaluation/data/SIRV_E0_pcs109_25k.fq.gz
=====================================
Binary files /dev/null and b/evaluation/data/SIRV_E0_pcs109_25k.fq.gz differ
=====================================
pychopper/__init__.py
=====================================
@@ -2,4 +2,4 @@
__author__ = 'ONT Applications Group'
__email__ = 'Apps at nanoporetech.com'
-__version__ = '2.5.0'
+__version__ = "2.7.2"
=====================================
pychopper/chopper.py
=====================================
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
+import sys
import numpy as np
from pychopper import seq_utils as seu
from pychopper import hmmer_backend, edlib_backend
@@ -77,7 +78,7 @@ def analyse_hits(hits, config):
return tuple(valid_segments), hits, tlen
-def segments_to_reads(read, segments, keep_primers):
+def segments_to_reads(read, segments, keep_primers, bam_tags, detect_umis):
"Convert segments to output reads with annotation"
for s in segments:
Start = s.Start
@@ -85,12 +86,44 @@ def segments_to_reads(read, segments, keep_primers):
if keep_primers:
Start = s.Left
End = s.Right
+
+ # Format FASTQ name and comment
sr_id = "{}:{}|".format(Start, End)
- sr_name = sr_id + read.Name + " strand=" + s.Strand
+ if bam_tags:
+ try:
+ name, comment = read.Name.split(" ", 1)
+ except ValueError:
+ name = read.Name
+ comment = ""
+ sr_name = sr_id + name + " CO:Z:" + comment + "\tTS:A:{}".format(s.Strand)
+ else:
+ sr_name = sr_id + read.Name + " strand=" + s.Strand
+
if len(segments) > 1:
sr_name += " rescue=1"
+
+ umi = None
+ if detect_umis:
+ max_umi_ed = 3
+ # Detect UMIs
+ # Get adapters for UMI search
+ padding = 40
+ p1_from = max(0, s.Left - padding)
+ p1_to = min(len(read.Seq), s.Start + padding)
+ p_1 = read.Seq[p1_from:p1_to]
+ p2_from = max(0, s.End - padding)
+ p2_to = min(len(read.Seq), s.Right + padding)
+ p_2 = read.Seq[p2_from:p2_to]
+
+ umi, _ = edlib_backend.find_umi_single(
+ [p_1 + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN' + p_2, max_umi_ed])
+ if bam_tags:
+ sr_name += "\tRX:Z:{}".format(umi)
+ else:
+ sr_name += " umi={}".format(umi)
+
sr_seq = read.Seq[Start:End]
- sr = Seq(sr_id + read.Id, sr_name, sr_seq, read.Qual[Start:End] if read.Qual is not None else None)
+ sr = Seq(sr_id + read.Id, sr_name, sr_seq, read.Qual[Start:End] if read.Qual is not None else None, umi)
if s.Strand == '-':
sr = seu.revcomp_seq(sr)
yield sr
=====================================
pychopper/common_structures.py
=====================================
@@ -3,5 +3,5 @@
from collections import namedtuple
Hit = namedtuple('Hit', 'Ref RefStart RefEnd Query QueryStart QueryEnd Score')
-Seq = namedtuple('Seq', 'Id Name Seq Qual')
+Seq = namedtuple('Seq', 'Id Name Seq Qual Umi')
Segment = namedtuple('Segment', 'Left Start End Right Strand Len')
=====================================
pychopper/edlib_backend.py
=====================================
@@ -2,6 +2,7 @@
import edlib
+from pychopper import seq_utils as seu
from pychopper.common_structures import Hit
from pychopper import utils
from pychopper.parasail_backend import refine_locations
@@ -17,6 +18,76 @@ def find_locations(reads, all_primers, max_ed, pool, min_batch):
return
+def find_umi_single(params):
+ "Find UMI in a single reads using the edlib/parasail backend"
+ read = params[0]
+ max_ed = params[1]
+ normalise = False
+
+ pattern_list = [
+ (
+ "TTTVVVVTTVVVVTTVVVVTTVVVVTTT",
+ "V",
+ [("V", "A"), ("V", "G"), ("V", "C")],
+ True
+ ),
+ (
+ "AAABBBBAABBBBAABBBBAABBBBAAA",
+ "B",
+ [("B", "T"), ("B", "G"), ("B", "C")],
+ False
+ )
+ ]
+
+ best_ed = None
+ best_pattern = None
+ best_result = None
+ for pattern, wildcard, equalities, forward in pattern_list:
+ result = edlib.align(pattern,
+ read,
+ task="path",
+ mode="HW",
+ k=max_ed,
+ additionalEqualities=equalities)
+ if result['editDistance'] == -1:
+ continue
+ if not best_ed or result['editDistance'] < best_ed:
+ best_ed = result['editDistance']
+ best_pattern = (pattern, wildcard, equalities, forward)
+ best_result = result
+
+ if best_ed is None:
+ return None, None
+
+ # Extract and normalise UMI
+ umi = ""
+ pattern, wildcard, equalities, forward = best_pattern
+ ed = best_result["editDistance"]
+ if not normalise:
+ locs = best_result["locations"][0]
+ umi = read[locs[0]:locs[1]+1]
+ if not forward:
+ umi = seu.reverse_complement(umi)
+
+ return umi, ed
+
+ align = edlib.getNiceAlignment(best_result, pattern, read)
+ for q, t in zip(align['query_aligned'], align['target_aligned']):
+ if q != wildcard:
+ continue
+ if t == '-':
+ umi += 'N'
+ else:
+ umi += t
+ if not forward:
+ umi = seu.reverse_complement(umi)
+
+ if len(umi) != 16:
+ raise RuntimeError("UMI length incorrect")
+
+ return umi, best_ed
+
+
def _find_locations_single(params):
"Find alignment hits of all primers in a single reads using the edlib/parasail backend"
read = params[0]
=====================================
pychopper/primer_data/PCS111_primers.fas
=====================================
@@ -0,0 +1,4 @@
+>VNP
+ACTTGCCTGTCGCTCTATCTTCAGAGGAGAGTCCGCCGCCCGCAAGTTTT
+>SSP
+TTTCTGTTGGTGCTGATATTGCTTT
=====================================
scripts/__init__.py → pychopper/scripts/__init__.py
=====================================
=====================================
scripts/cdna_classifier.py → pychopper/scripts/pychopper.py
=====================================
@@ -3,12 +3,14 @@
import argparse
import os
+import io
import sys
import numpy as np
import pandas as pd
from collections import OrderedDict, defaultdict
import concurrent.futures
import tqdm
+import gzip
from pychopper import seq_utils as seu
from pychopper import utils
@@ -17,59 +19,6 @@ import pychopper.phmm_data as phmm_data
import pychopper.primer_data as primer_data
-"""
-Parse command line arguments.
-"""
-parser = argparse.ArgumentParser(
- description='Tool to identify, orient and rescue full-length cDNA reads.')
-parser.add_argument(
- '-b', metavar='primers', type=str, default=None, help="Primers fasta.", required=False)
-parser.add_argument(
- '-g', metavar='phmm_file', type=str, default=None, help="File with custom profile HMMs (None).", required=False)
-parser.add_argument(
- '-c', metavar='config_file', type=str, default=None, help="File to specify primer configurations for each direction (None).")
-parser.add_argument(
- '-k', metavar='kit', type=str, default="PCS109", help="Use primer sequences from this kit (PCS109).")
-parser.add_argument(
- '-q', metavar='cutoff', type=float, default=None, help="Cutoff parameter (autotuned).")
-parser.add_argument(
- '-Q', metavar='min_qual', type=float, default=7.0, help="Minimum mean base quality (7.0).")
-parser.add_argument(
- '-z', metavar='min_len', type=int, default=50, help="Minimum segment length (50).")
-parser.add_argument(
- '-r', metavar='report_pdf', type=str, default="cdna_classifier_report.pdf", help="Report PDF (cdna_classifier_report.pdf).")
-parser.add_argument(
- '-u', metavar='unclass_output', type=str, default=None, help="Write unclassified reads to this file.")
-parser.add_argument(
- '-l', metavar='len_fail_output', type=str, default=None, help="Write fragments failing the length filter in this file.")
-parser.add_argument(
- '-w', metavar='rescue_output', type=str, default=None, help="Write rescued reads to this file.")
-parser.add_argument(
- '-S', metavar='stats_output', type=str, default="cdna_classifier_report.tsv", help="Write statistics to this file.")
-parser.add_argument(
- '-K', metavar='qc_fail_output', type=str, default=None, help="Write reads failing mean quality filter to this file.")
-parser.add_argument(
- '-Y', metavar='autotune_nr', type=float, default=10000, help="Approximate number of reads used for tuning the cutoff parameter (10000).")
-parser.add_argument(
- '-L', metavar='autotune_samples', type=int, default=30, help="Number of samples taken when tuning cutoff parameter (30).")
-parser.add_argument(
- '-A', metavar='scores_output', type=str, default=None, help="Write alignment scores to this BED file.")
-parser.add_argument(
- '-m', metavar='method', type=str, default="phmm", help="Detection method: phmm or edlib (phmm).")
-parser.add_argument(
- '-x', metavar='rescue', type=str, default=None, help="Protocol-specific read rescue: DCS109 (None).")
-parser.add_argument(
- '-p', action='store_true', default=False, help="Keep primers, but trim the rest.")
-parser.add_argument(
- '-t', metavar='threads', type=int, default=8, help="Number of threads to use (8).")
-parser.add_argument(
- '-B', metavar='batch_size', type=int, default=1000000, help="Maximum number of reads processed in each batch (1000000).")
-parser.add_argument(
- '-D', metavar='read stats', type=str, default=None, help="Tab separated file with per-read stats (None).")
-parser.add_argument('input_fastx', metavar='input_fastx', type=str, help="Input file.")
-parser.add_argument('output_fastx', metavar='output_fastx', type=str, help="Output file.")
-
-
def _new_stats():
"Initialize a new statistic dictionary"
st = OrderedDict()
@@ -84,6 +33,8 @@ def _new_stats():
st["LenFail"] = 0
st["QcFail"] = 0
st["PassReads"] = 0
+ st["Umi_detected"] = 0
+ st["Umi_detected_final"] = 0
return st
@@ -160,6 +111,13 @@ def _process_stats(st):
res["Category"] += ["Hits"]
res["Name"] += [k]
res["Value"] += [v]
+
+ res["Category"] += ["ReadStats"]
+ res["Name"] += ["Umi_detected"]
+ res["Value"] += [st["Umi_detected"]]
+ res["Category"] += ["ReadStats"]
+ res["Name"] += ["Umi_detected_final"]
+ res["Value"] += [st["Umi_detected_final"]]
res = pd.DataFrame(res)
return res
@@ -218,18 +176,25 @@ def _plot_pd_line(df, title, report, alpha=0.7, xrot=0, vline=None):
report.save_close()
-def _plot_stats(st, pdf):
+def _plot_stats(st, pdf, q, q_bak, detect_umi):
"Generate plots and save to report PDF"
R = report.Report(pdf)
rs = st.loc[st.Category == "Classification", ]
_plot_pd_bars(rs.copy(), "Classification of output reads", R, ann=True)
found, rescue, unusable = float(rs.loc[rs.Name == "Primers_found", ].Value), float(rs.loc[rs.Name == "Rescue", ].Value), float(rs.loc[rs.Name == "Unusable", ].Value)
+ rs_stats = st.loc[st.Category == "ReadStats", ]
+ umi_detected = float(rs_stats.loc[rs_stats.Name == "Umi_detected", ].Value)
+ umi_detected_final = float(rs_stats.loc[rs_stats.Name == "Umi_detected_final", ].Value)
total = found + rescue + unusable
found = found / total * 100
rescue = rescue / total * 100
unusable = unusable / total * 100
+ umi_detected_final = umi_detected_final / total * 100
sys.stderr.write("-----------------------------------\n")
- sys.stderr.write("Reads with two primers:\t{:.2f}%\nRescued reads:\t\t{:.2f}%\nUnusable reads:\t\t{:.2f}%\n".format(found, rescue, unusable))
+ if detect_umi:
+ sys.stderr.write("Reads with two primers:\t{:.2f}% (with UMI {:.2f}%)\nRescued reads:\t\t{:.2f}%\nUnusable reads:\t\t{:.2f}%\n".format(found, umi_detected_final, rescue, unusable))
+ else:
+ sys.stderr.write("Reads with two primers:\t{:.2f}%\nRescued reads:\t\t{:.2f}%\nUnusable reads:\t\t{:.2f}%\n".format(found, rescue, unusable))
sys.stderr.write("-----------------------------------\n")
_plot_pd_bars(st.loc[st.Category == "Strand", ].copy(), "Strand of oriented reads", R, ann=True)
_plot_pd_bars(st.loc[st.Category == "RescueStrand", ].copy(), "Strand of rescued reads", R, ann=True)
@@ -237,14 +202,110 @@ def _plot_stats(st, pdf):
_plot_pd_bars(st.loc[st.Category == "RescueHitNr", ].copy(), "Number of hits in rescued reads", R)
_plot_pd_bars(st.loc[st.Category == "RescueSegmentNr", ].copy(), "Number of usable segments per rescued read", R)
if q_bak is None:
- _plot_pd_line(st.loc[st.Category == "AutotuneSample", ].copy(), "Usable bases as function of cutoff(q). Best q={:.4g}".format(args.q), R, vline=args.q)
+ _plot_pd_line(st.loc[st.Category == "AutotuneSample", ].copy(), "Usable bases as function of cutoff(q). Best q={:.4g}".format(q), R, vline=q)
udf = st.loc[st.Category == "Unusable", ].copy()
udf.Name = np.log10(1.0 + np.array(udf.Name, dtype=float))
_plot_pd_line(udf, "Log10 length distribution of trimmed away sequences.", R)
R.close()
-if __name__ == '__main__':
+def _opener(filename, mode, encoding='utf8'):
+ if filename == '-':
+ sys.stderr.write("Reading from stdin\n")
+ #return open(sys.stdin.buff, mode, encoding=encoding)
+ return io.TextIOWrapper(sys.stdin.buffer, encoding=encoding)
+ elif filename.endswith('.gz'):
+ return gzip.open(filename, mode + 't', encoding=encoding)
+ else:
+ return open(filename, mode, encoding=encoding)
+
+
+def main():
+ """
+ Parse command line arguments.
+ """
+ parser = argparse.ArgumentParser(
+ description='Tool to identify, orient and rescue full-length cDNA reads.')
+ parser.add_argument(
+ '-b', metavar='primers', type=str, default=None, help="Primers fasta.",
+ required=False)
+ parser.add_argument(
+ '-g', metavar='phmm_file', type=str, default=None,
+ help="File with custom profile HMMs (None).", required=False)
+ parser.add_argument(
+ '-c', metavar='config_file', type=str, default=None,
+ help="File to specify primer configurations for each direction (None).")
+ parser.add_argument(
+ '-k', metavar='kit', type=str, default="PCS109",
+ help="Use primer sequences from this kit (PCS109).")
+ parser.add_argument(
+ '-q', metavar='cutoff', type=float, default=None,
+ help="Cutoff parameter (autotuned).")
+ parser.add_argument(
+ '-Q', metavar='min_qual', type=float, default=7.0,
+ help="Minimum mean base quality (7.0).")
+ parser.add_argument(
+ '-z', metavar='min_len', type=int, default=50,
+ help="Minimum segment length (50).")
+ parser.add_argument(
+ '-r', metavar='report_pdf', type=str,
+ default="pychopper.pdf",
+ help="Report PDF (pychopper.pdf).")
+ parser.add_argument(
+ '-u', metavar='unclass_output', type=str, default=None,
+ help="Write unclassified reads to this file.")
+ parser.add_argument(
+ '-l', metavar='len_fail_output', type=str, default=None,
+ help="Write fragments failing the length filter in this file.")
+ parser.add_argument(
+ '-w', metavar='rescue_output', type=str, default=None,
+ help="Write rescued reads to this file.")
+ parser.add_argument(
+ '-S', metavar='stats_output', type=str,
+ default="pychopper.tsv",
+ help="Write statistics to this file.")
+ parser.add_argument(
+ '-K', metavar='qc_fail_output', type=str, default=None,
+ help="Write reads failing mean quality filter to this file.")
+ parser.add_argument(
+ '-Y', metavar='autotune_nr', type=float, default=10000,
+ help="Approximate number of reads used for tuning the cutoff parameter (10000).")
+ parser.add_argument(
+ '-L', metavar='autotune_samples', type=int, default=30,
+ help="Number of samples taken when tuning cutoff parameter (30).")
+ parser.add_argument(
+ '-A', metavar='scores_output', type=str, default=None,
+ help="Write alignment scores to this BED file.")
+ parser.add_argument(
+ '-m', metavar='method', type=str, default="phmm",
+ help="Detection method: phmm or edlib (phmm).")
+ parser.add_argument(
+ '-x', metavar='rescue', type=str, default=None,
+ help="Protocol-specific read rescue: DCS109 (None).")
+ parser.add_argument(
+ '-p', action='store_true', default=False,
+ help="Keep primers, but trim the rest.")
+ parser.add_argument(
+ '-t', metavar='threads', type=int, default=8,
+ help="Number of threads to use (8).")
+ parser.add_argument(
+ '-B', metavar='batch_size', type=int, default=1000000,
+ help="Maximum number of reads processed in each batch (1000000).")
+ parser.add_argument(
+ '-D', metavar='read stats', type=str, default=None,
+ help="Tab separated file with per-read stats (None).")
+ parser.add_argument(
+ '-y', action='store_true', default=False,
+ help="Output FASTQ comment as BAM tags. Use with minimap2 -y to pass UMI and additional info into BAM file.")
+ parser.add_argument(
+ '-U', action='store_true', default=False,
+ help="Detect UMIs")
+
+ parser.add_argument('input_fastx', metavar='input_fastx', type=str,
+ help="Input file.")
+ parser.add_argument('output_fastx', metavar='output_fastx', nargs="?",
+ type=str, default="-", help="Output file.")
+
args = parser.parse_args()
if args.m == "phmm":
@@ -255,14 +316,39 @@ if __name__ == '__main__':
if args.c is not None:
CONFIG = open(args.c, "r").readline().strip()
- kits = {"PCS109": {"HMM": os.path.join(os.path.dirname(phmm_data.__file__), "cDNA_SSP_VNP.hmm"), "FAS": os.path.join(os.path.dirname(primer_data.__file__), "cDNA_SSP_VNP.fas"), }, "PCS110": {
- "HMM": os.path.join(os.path.dirname(phmm_data.__file__), "PCS110_primers.hmm"), "FAS": os.path.join(os.path.dirname(primer_data.__file__), "PCS110_primers.fas")}}
+ kits = {
+ "PCS109": {
+ "HMM": os.path.join(
+ os.path.dirname(phmm_data.__file__), "cDNA_SSP_VNP.hmm"),
+ "FAS": os.path.join(
+ os.path.dirname(primer_data.__file__), "cDNA_SSP_VNP.fas"),
+ },
+ "PCS110": {
+ "HMM": os.path.join(
+ os.path.dirname(phmm_data.__file__), "PCS110_primers.hmm"),
+ "FAS": os.path.join(
+ os.path.dirname(primer_data.__file__), "PCS110_primers.fas")
+ },
+ "PCS111": {
+ "HMM": os.path.join(
+ os.path.dirname(phmm_data.__file__), "PCS110_primers.hmm"),
+ "FAS": os.path.join(
+ os.path.dirname(primer_data.__file__), "PCS111_primers.fas")}
+ }
if args.g is None:
args.g = kits[args.k]["HMM"]
+ elif args.m != 'phmm':
+ sys.exit(
+ 'if using -g option, phmm backend should be used (-m phmm)'
+ )
if args.b is None:
args.b = kits[args.k]["FAS"]
+ elif args.m != 'edlib':
+ sys.exit(
+ 'if using -b option, edlib backend should be used (-m edlib)'
+ )
if args.x is not None and args.x in ('DCS109'):
if args.x == "DCS109":
@@ -274,7 +360,7 @@ if __name__ == '__main__':
in_fh = sys.stdin
if args.input_fastx != '-':
- in_fh = open(args.input_fastx, "r")
+ in_fh = _opener(args.input_fastx, "r")
out_fh = sys.stdout
if args.output_fastx != '-':
@@ -317,23 +403,27 @@ if __name__ == '__main__':
return chopper.chopper_phmm(x, args.g, config, q, args.t, pool, mb)
elif args.m == "edlib":
def backend(x, pool, q=None, mb=None):
- return chopper.chopper_edlib(x, all_primers, config, q * 1.2, q, pool, mb)
+ return chopper.chopper_edlib(x, all_primers, config, q * 1.2, q,
+ pool, mb)
else:
raise Exception("Invalid backend!")
# Pick the -q maximizing the number of classified reads using grid search:
+ #nr_records = None
nr_records = None
tune_df = None
q_bak = args.q
+
if args.q is None:
nr_cutoffs = args.L
cutoffs = np.linspace(0.0, 1.0, num=nr_cutoffs)
cutoffs = cutoffs / cutoffs[-1]
if args.m == "phmm":
- cutoffs = np.linspace(10**-5, 5.0, num=nr_cutoffs)
+ cutoffs = np.linspace(10 ** -5, 5.0, num=nr_cutoffs)
class_reads = []
class_readLens = []
- nr_records = utils.count_fastq_records(args.input_fastx)
+ nr_records = utils.count_fastq_records(args.input_fastx,
+ opener=_opener)
opt_batch = int(nr_records / args.t)
if opt_batch < args.B:
args.B = opt_batch
@@ -342,17 +432,29 @@ if __name__ == '__main__':
target_prop = args.Y / float(nr_records)
if target_prop > 1.0:
target_prop = 1.0
- sys.stderr.write("Counting fastq records in input file: {}\n".format(args.input_fastx))
- sys.stderr.write("Total fastq records in input file: {}\n".format(nr_records))
- read_sample = list(seu.readfq(open(args.input_fastx, "r"), sample=target_prop, min_qual=args.Q))
- sys.stderr.write("Tuning the cutoff parameter (q) on {} sampled reads ({:.1f}%) passing quality filters (Q >= {}).\n".format(len(read_sample), target_prop * 100.0, args.Q))
+ sys.stderr.write("Counting fastq records in input file: {}\n".format(
+ args.input_fastx))
+ sys.stderr.write(
+ "Total fastq records in input file: {}\n".format(nr_records))
+ read_sample = list(
+ seu.readfq(_opener(args.input_fastx, "r"), sample=target_prop,
+ min_qual=args.Q))
+ sys.stderr.write(
+ "Tuning the cutoff parameter (q) on {} sampled reads ({:.1f}%) passing quality filters (Q >= {}).\n".format(
+ len(read_sample), target_prop * 100.0, args.Q))
sys.stderr.write("Optimizing over {} cutoff values.\n".format(args.L))
for qv in tqdm.tqdm(cutoffs):
clsLen = 0
cls = 0
- with concurrent.futures.ProcessPoolExecutor(max_workers=args.t) as executor:
+ with concurrent.futures.ProcessPoolExecutor(
+ max_workers=args.t) as executor:
for batch in utils.batch(read_sample, int((len(read_sample)))):
- for read, (segments, hits, usable_len) in backend(batch, executor, qv, max(1000, int((len(read_sample)) / args.t))):
+ for read, (segments, hits, usable_len) in backend(batch,
+ executor,
+ qv,
+ max(1000,
+ int((
+ len(read_sample)) / args.t))):
flt = list([x.Len for x in segments if x.Len > 0])
if len(flt) == 1:
clsLen += sum(flt)
@@ -371,8 +473,11 @@ if __name__ == '__main__':
tune_df["Name"] += ["Cutoff(q)"]
tune_df["Value"] += [args.q]
if best_qi == (len(class_reads) - 1):
- sys.stderr.write("Best cuttoff value is at the edge of the search interval! Using tuned value is not safe! Please pick a q value manually and QC your data!\n")
- sys.stderr.write("Best cutoff (q) value is {:.4g} with {:.0f}% of the reads classified.\n".format(args.q, class_reads[best_qi] * 100 / len(read_sample)))
+ sys.stderr.write(
+ "Best cuttoff value is at the edge of the search interval! Using tuned value is not safe! Please pick a q value manually and QC your data!\n")
+ sys.stderr.write(
+ "Best cutoff (q) value is {:.4g} with {:.0f}% of the reads classified.\n".format(
+ args.q, class_reads[best_qi] * 100 / len(read_sample)))
if nr_records is not None:
input_size = nr_records
@@ -380,39 +485,54 @@ if __name__ == '__main__':
args.B = nr_records
if args.B == 0:
args.B = 1
- sys.stderr.write("Processing the whole dataset using a batch size of {}:\n".format(args.B))
+ sys.stderr.write(
+ "Processing the whole dataset using a batch size of {}:\n".format(
+ args.B))
pbar = tqdm.tqdm(total=input_size)
min_batch_size = max(int(args.B / args.t), 1)
rfq_sup = {"out_fq": args.K, "pass": 0, "total": 0}
- with concurrent.futures.ProcessPoolExecutor(max_workers=args.t) as executor:
- for batch in utils.batch(seu.readfq(in_fh, min_qual=args.Q, rfq_sup=rfq_sup), args.B):
- for read, (segments, hits, usable_len) in backend(batch, executor, q=args.q, mb=min_batch_size):
+ with concurrent.futures.ProcessPoolExecutor(
+ max_workers=args.t) as executor:
+ for batch in utils.batch(
+ seu.readfq(in_fh, min_qual=args.Q, rfq_sup=rfq_sup), args.B):
+ for read, (segments, hits, usable_len) in backend(batch, executor,
+ q=args.q,
+ mb=min_batch_size):
if args.A is not None:
for h in hits:
a_fh.write(utils.hit2bed(h, read) + "\n")
_update_stats(st, d_fh, segments, hits, usable_len, read)
if args.u is not None and len(segments) == 0:
seu.writefq(read, u_fh)
- for trim_read in chopper.segments_to_reads(read, segments, args.p):
+ for trim_read in chopper.segments_to_reads(read, segments,
+ args.p, args.y, args.U):
+ if trim_read.Umi:
+ st["Umi_detected"] += 1
if args.l is not None and len(trim_read.Seq) < args.z:
st["LenFail"] += 1
seu.writefq(trim_read, l_fh)
continue
if len(segments) == 1:
+ if trim_read.Umi:
+ st["Umi_detected_final"] += 1
seu.writefq(trim_read, out_fh)
if args.w is not None and len(segments) > 1:
seu.writefq(trim_read, w_fh)
- if nr_records is None:
- pbar.update(seu.record_size(read, 'fastq'))
- else:
- pbar.update(1)
+ #if nr_records is None:
+ # pbar.update(seu.record_size(read, 'fastq'))
+ #else:
+ pbar.update(1)
pbar.close()
sys.stderr.write("Finished processing file: {}\n".format(args.input_fastx))
fail_nr = rfq_sup["total"] - rfq_sup["pass"]
fail_pc = (fail_nr * 100 / rfq_sup["total"])
st["QcFail"] = fail_nr
- sys.stderr.write("Input reads failing mean quality filter (Q < {}): {} ({:.2f}%)\n".format(args.Q, fail_nr, fail_pc))
- sys.stderr.write("Output fragments failing length filter (length < {}): {}\n".format(args.z, st["LenFail"]))
+ sys.stderr.write(
+ "Input reads failing mean quality filter (Q < {}): {} ({:.2f}%)\n".format(
+ args.Q, fail_nr, fail_pc))
+ sys.stderr.write(
+ "Output fragments failing length filter (length < {}): {}\n".format(
+ args.z, st["LenFail"]))
# Save stats as TSV:
stdf = None
@@ -433,4 +553,9 @@ if __name__ == '__main__':
fh.close()
if args.r is not None:
- _plot_stats(stdf, args.r)
+ _plot_stats(stdf, args.r, args.q, q_bak, args.U)
+
+
+if __name__ == '__main__':
+ main()
+
=====================================
pychopper/seq_utils.py
=====================================
@@ -77,7 +77,7 @@ def readfq(fp, sample=None, min_qual=None, rfq_sup={}): # this is a generator f
if sample is None or (random() < sample):
if tsup:
rfq_sup["total"] += 1
- yield Seq(name.split(" ", 1)[0], name, ''.join(seqs), None) # yield a fasta record
+ yield Seq(name.split(" ", 1)[0], name, ''.join(seqs), None, None) # yield a fasta record
if not last:
break
else: # this is a fastq record
@@ -89,10 +89,10 @@ def readfq(fp, sample=None, min_qual=None, rfq_sup={}): # this is a generator f
last = None
if sample is None or (random() < sample):
quals = "".join(seqs)
- oseq = Seq(Id=name.split(" ", 1)[0], Name=name, Seq=seq, Qual=quals)
+ oseq = Seq(Id=name.split(" ", 1)[0], Name=name, Seq=seq, Qual=quals, Umi=None)
if tsup:
rfq_sup["total"] += 1
- if not (min_qual is not None and mean_qual(quals) < min_qual):
+ if not (min_qual is not None and min_qual > 0 and mean_qual(quals) < min_qual):
if tsup:
rfq_sup["pass"] += 1
yield oseq
@@ -123,7 +123,7 @@ def revcomp_seq(seq):
qual = seq.Qual
if qual is not None:
qual = qual[::-1]
- return Seq(seq.Id, seq.Name, reverse_complement(seq.Seq), qual)
+ return Seq(seq.Id, seq.Name, reverse_complement(seq.Seq), qual, seq.Umi)
def get_runid(desc):
=====================================
pychopper/tests/data/PCS111_umi_test_reads.fastq.gz
=====================================
Binary files /dev/null and b/pychopper/tests/data/PCS111_umi_test_reads.fastq.gz differ
=====================================
pychopper/tests/data/PCS111_umi_test_reads_expected.fastq
=====================================
@@ -0,0 +1,16 @@
+ at 110:1086|0c0a6a78-5e30-42d5-b8b9-58d20068ae0f runid=6e6c14dbcf8460aa18e3f20fad57285f682bb0bb read=108016 ch=2834 start_time=2022-07-02T19:52:34.859104+01:00 flow_cell_id=PAM17289 protocol_group_id=20220701_IV_AJ_cDNA-R1041 sample_id=1_RT_42C_90min_non-size-selected parent_read_id=0c0a6a78-5e30-42d5-b8b9-58d20068ae0f basecall_model_version_id=dna_r10.4.1_e8.2_hac at v3.5.1 strand=+ umi=TTTCCGCTTACGGTTAGGATTGCACTTT
+CCGCTTACGGTTAGGATTGCACTTTGGGAGCTGCCATCTTGCGTCCCCGCGTGTGTGCGCCTAATCTCAGGTGGTCGCCAAGACCCCTTGAGCACCAACCCTAGTCCCCCGCGCGGCCCCTTATTCGCTCCGACAAGATGAAAGAAACAATCATGAACCAGGAAAAACTGGCCAAACTGCAGGCACAAGTGCGCATTGGTGGGAAAGGAACTGCTCGCAGAAGAAGAAGGTGGTTCATAGAACAGCCACAGCAGATGACAAAAAACTTCAGTTCTCCTTAAAGAAGTTAGGGTAAACAATATCTCTGGTATTGAAGAGGTGAATATGTTTACAAACCAAGGACGGTCATCACTTTAACAACCCTAGAAGTTCAGGCATCTCTGGCAGCGAACACTTTCACCATTACAGGCCATGCTCAAGACAAAGCAGCTGACAGAAATGCTACCCAGCATCTTAAACCAGCTTGGTGCGGATAGTCTGACTAGTTTAAGGAGACTGGGCCAAACTATCCCAAACAATCTGTGGATGGAAAAGCACCACTTGCTACTGGAGAGGATGATGATGATGAAGTTCCAGATCTTGTGGAGAATTTTAATGAGGCTTCCAAGAATGAGGCAAACTGAATTGAGTCAACTTCTGAAGATAAAACCTGAAGAAGTTACTGGGAGCTGCTATTTTATATTATGACTGCTTTTTAAGAAATTTTTGTTTATGGATCTGATAAAATCTAGATCTCTAATATTTTTAAGCCCAAGCCCCTTGGACACTGCAGCTCTTTTCAGTTTTTGCTTATACACAATTCATTCTTTGCAGCTAATTAAGCCGAAGAAGCCTGGGAATCAGATTTTAAACAAAGATTAATAAAGTTCTTTGCCTAGTAAAAAATCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
++
+<5005:B?9777<BA at A:BBDC?>6/=;<=>>@ABCCCEC?<6566CB=999;7<{{73226:<CAE/--,../-*%&&)*'((,/037553,,,@>=<;==/...,-.27>?{>:;78:86,6667<=<:9:?J945@??DF@<>C=5243KGGJIDCCB?@CAEC58+**+9;?CB>9998;:<ABCCB9867=C87779,+.82,%)%&%%((((((+157849AA@??:874008>@BBEC@>><<:;;;>=<?BA@<HPIA279::AGDFE=>A7>@@LL>;;6768BCEIHEFGGJEC@>@CDDBHA at 9IHCBBADGHJIEGKHHMIHJBA<95..*+))(&%%$%&)3***+((+;92,)*),,***+5ABBCDFCABDE?;::?CAEHFAFCDA at A=51,,.<98.+*('('*+,-,-///456<9<?BCE;1,,*/44/;:::;@6-.F:600/083349999:B>BB@?>?@A?B763,::9<;753+('((-/0((&&%%&',5A?AC;EDCA??A>EB8ABC?>=>><AA@@@==@?=9:@><.++++***,.3:=>=87331=;87::7700.&$$$&*2(((%)()(()(,1/11>:9;::78ACAA at AD>>9667/68:;4691397<;8:?D>6922<CFECA@>GE;<9983336669GEGHEJ{F@@>@AB?BA=<<<969DEECGK{NHDDIHGC>>><?=>A at EEI64>44?=<AAAA>@@BB>;C at CE@@>>A?;;;<<A=?DC===<=BABB<77AA@@BAB{{LF@@AGID>>H=BBCB:@B?=>G@>;4433?AC?<=<:6679>>9--.:904/(,('(-++.%%&/65543--,235768::8886.---879888{KED11--,,,--,+**,,,--++,020//033330,*/360+++/..--.///./012001320012422334643455566568999799=;
+ at 117:1441|e8cdbabd-e5a4-4d03-b709-8520b42fbdcb runid=6e6c14dbcf8460aa18e3f20fad57285f682bb0bb read=183478 ch=1061 start_time=2022-07-02T19:52:35.859104+01:00 flow_cell_id=PAM17289 protocol_group_id=20220701_IV_AJ_cDNA-R1041 sample_id=1_RT_42C_90min_non-size-selected parent_read_id=e8cdbabd-e5a4-4d03-b709-8520b42fbdcb basecall_model_version_id=dna_r10.4.1_e8.2_hac at v3.5.1 strand=+ umi=TTTCGACTTGCGCTTAGGCTTCCAGTTT
+CGACTTGCGCTTAGGCTTCCAGTTTGGGGGAGAAGCCGGGTGGGGCGGGCTGGAAGGAAGCGAACCTACGAAGCAGAAGATGTGAAGACAGCATGCTCACGGCCGAGATTCCCCAGAGTGGCTCTCCATTCCCAGGCTCCGTGCAGGATCCAGGCCTGCATGTGTGGCGGGTGGAGAAGCTGAAGCCGGTGCCTGTGGCGCAAGAGAACCCGGCGTCTCTTCTGGGGGACTCCTTCCTGGTGCCTCGCGATGGCCCAGAAGAGGTTTCCCATCTGCACCTGTCGATGGCCAGCAGTCATCCGGGATGAGCAGGGGGCCTGTGCCGTGCTGGCTGTGCACCTCAACACGCTGCTGGGAGAGCAGGCCTGTGCAGCACCGCGAGGTGCAGGGCAATGAGTCTGACCTCTTCATGAGCTACTCCCACGGGGCCTGGGGTACCCCCAGGAAGGTGGTGTGGAGTCAGCATTTCACAAGACCTCCACAGGAGCCCCAGCTGCCATCAAGAAACTCTACCAGGTGAAGGGGAAGAAGAACGTCCGTGCCACCGAGCGGGCACTGAACTGGGACAGCTTCAACACTCGGGGACTGCTTCATCCCTGGACCTGGCCAGAACATCTTCGCCTGGTGTGGTGGAAAGTCCAACATCCTGGAACGCAACAAGGCGAGGGACCTGGCCCTGGCCATCCGGGACAGTGAGCGACAGGGCAAGGCCCAGGTGGAGATTGTCACTGATGGGGAGGAGCCTGCTGAGATGATCCAGGTCCTGGGCCCCAAGCCTGCTCTGAAGGAGGGCAACCCTGAGGAAGACCTTCACAGCTGACAAGGCAAATGCCCAGGCCGCAGCTCTGTATAAGGTCTCTGATGCCACTGGACAGATGAACCTGACCAAGGTGGCTGACTCCCAGCCCATTTGCCCTTGAACTGCTGATATCTGATGACTGCTTTGTGCTGGACAACGGGCTCTGTGGGCAAGATCTATCTGGAAGGGGCAAAAAGCGAATGAGAAGGAGCAGGCAGGCAGCCCTGCAGGTGGCCGAGGGCTTCATATTCGCATGCAGTTCGCCCCGAACACTCAGGTGGAGATTCTGCCTCAGGGCCGTGAGAGTCCCATCTTCAAGCAATTTTTCAAGGACTGGAAATGAGGGTGGGCGTCTTCCTGCCCCATGCTCCCCTGCCCCCCACCACCTGCCTGCTTGCTTCTCTGGCTGCCTGGTCAGTGCAGGGGTGCCCCCTCCAGTGTTTGATAAAGGAGACATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAA
++
+4677CNHC;::;===>?D?<323/0(/8::;32-+((-.1165652('*(<<4.-74172'''00781+)()*,,**,*-.*,,.7<;9779775,))*.+++/2+*))+00.,1668 at BCAD?{>:3;;3374672=6+&&&&%%&$$$&&&,),4+)'''')B422/...5/()34C>===@<<;987;?>?<:,,,+,--2//,/-2-)()..+,,+1*2&&'10-,,,,2.+-.*))),,((,***-(**,1*;<=BFJCCCB;<3*447667899@@****63221018BD?<;64./0<=222296741@@AB@@FC;:::?AAACE?FHDBA at BB@ABC=<<=@DF75392549/))*:<?=@@AGDA>>;=>?DEFHHDEDF at WEDBA;;:@?===;>>B at CECCD?2134++93B999ED at B1/.,'''(+,35AHCF>?=<=3<58766;3*))'&&&*++,-:99=?==6:>?>??>;>EDDBABA@>>2-+))**)-,,-.8??>?<=<;9557736/+,7325598>:<<=@@?@@A?>>?ABABCCBD>>@<:,**+1+()).83('')16../20($$&%%(04475?+3427221*&*++.1)*.0;CCBDGEB@;:/30//1****+2+*++1CH at 54349:=<<=;<;<995588:8BCCA>A<@@=86678ABBCD>=:878<777?>>>>BEEA at B?@A>7665249;<<ABC<<<EFHB>=43783232/.0>=?720151/54DDCEA<64(()9;;;6222;?@==<@@?BCBBAF?3324654.,++'(7=@GEECACED?=222=;:9>==C?8667;BCCBA@@?@ADGBAADA??><<;;<CA===?@@@??<>==9565655787803/+**++,(<=@=9:985:@>?>9:HEAAACBCBBFBAAADBCCACBEEMDHFBA?=>5444445 at A8>=<32*(6765430&%%&''24:654/)(&'**2@?@BJEEF@>8;97111-13368@?>=>?@>??88;+=----CD@@>67&&&%%&),***+*&%&+25;5459=@@<6//08638(''&%$%))1,)))*;21147A@@8;<6AAAB=?A?<<<@DCFIDEHB=99<:=>AGA1001=:<976-,+*+'.-555798555>;<;<@320=DDDB@@ABABAA??@>AA?877459456;ACD@@5543???@A at A>>>>>>==?BA,)(+%%..)&&%%,.02+++++%$&,47;FJO{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{PMJEA@?><;94(%$0256654358
+ at 117:1312|1241d0c7-a348-4cbc-8e29-f3f4cb4b902e runid=6e6c14dbcf8460aa18e3f20fad57285f682bb0bb read=122321 ch=1898 start_time=2022-07-02T19:52:37.859104+01:00 flow_cell_id=PAM17289 protocol_group_id=20220701_IV_AJ_cDNA-R1041 sample_id=1_RT_42C_90min_non-size-selected parent_read_id=1241d0c7-a348-4cbc-8e29-f3f4cb4b902e basecall_model_version_id=dna_r10.4.1_e8.2_hac at v3.5.1 strand=- umi=TTTCAGGTTAAACTTGGCATTAAGGTTT
+CAGGTTAAACTTGGCATTAAGGTTTGGGGCTTCCGGGTGGGGCCCCGGGCCAGGCGATGGCGCCCTGGGCGCTCCTCAGCCCTGGGTCCTGGTGCGGACCGGGCACACCGTGCTGACCTGGGGAATCACGCTGGTGCTCTTCCTGCACGATACCGAGCTGCGGCAATGGGAGGAGCAGGGGAGCTGCTCCTGCCCCTCACCTTCCTGCTCCTGGTGCTGGGCTCCCTGCTGCTCTACCTCGCTGTGTCACTCATGGACCCTGGCTACGTGAATGTGCAGCCCCAGCCTCAGGAGGAGCTCAAAGAGGAGCAGACAGCCATGGTTCCTCCAGCCATCCCTCTTCGGCGCTGCAGATACTGCCTGGTGCTGCAGCCCCTGAGGGCTCGGCACTGCCGTGAGTGCCGCCGTTGCGTCCGCCGCTACGACCACCACTGCCCCTGGATGGAGAACTCTGTGTGGAGAGCGCAACCACCCGCTCTTTGTGGTCTACCTGGCGCTGCAGCTGGTGGTGCTTCTGTGGGGCCTGTACCTGGCATGGTCAGGCCTCCGGTTCTTCCAGCCCTGGGGTCTGTGGTTGCGGTCCAGCGGGCTCCTGTTCGCCACCTTCCTGCTGCTGTCCCTTCTTGTTGGTGGCCAGCCTGCTCCTCGTCTCGCACCTCTACCTGGTGGCCAGCAACACCACCACCTGGGGATTCATCTCCTCACACCGCATCGCCTATCTCCGCCAGCGCCCCAGCAACCCCTTCGACCGAGGCCTTACCTGCAACCTGGCCCACTTCTTCTGTGGATGGCCCTCAGGGTCCTGGGAGACCCTCTGGGCTGAGGAGGAGGAAGAGGCAGCAGCCCAGCTGTTTAGGGTTGCTGGAGGCCGGGCTACCGTCTGTGCCCTGAAAACCACGGGGCCTGTCCCCAGCTGGGGTGAGCGCTCAGAGGGCTGGGGCCCTCACCTGCCTAACGCCTCCCAGACCCCAGAACGGAGCTTCAAGTCAGACAGATCCCTGCCTTGGTGGGCAGTTCTGCCTTCCAAGGAAGAAGGGGGAGAAAAGGACCTGTGGGTGGCTCAGGCCCAAGCAGACCCCGGGCTCCACCCCAGCCCCGCCCAGGCTGCTGCCAGTGCACACTTTTACAAATTTAATATAAAGCAAGTCCAGTCTTAAAAAGACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
++
+CDDEIIKGIIEDDDFEGDA@@AP@@?=@AEB??;<978;;89>>BAC?(&&,('&+1/,**,,,-+((&&+;<>?<BA4868=?;945-,2-,-,40*((///4{3311310/1339;>?=334832215559?@>?65=BB;;:4335@<<<?>?A73336EEGF39HDBACBAB at 7>=<=;888;;;@=DFEHDC?=AC@==7779=B>@BB at AFCCDECAAEFCDABCADD?@@:::<GD{@9:9<=?CCCCB595:/1//199ABGDA@>===B?=;<>8;>;:>BA@<?>>DAA2@@?@A>;;:<;=>@>=>BBBA@?>>@>:>;?>@?BB>:963668:;@>>>>??==<=>?DBBB at BB?;:;:B at 41(&'@@<///2@<2222<8788 at A=;6666=<<6544688989:<A>>?AACAAA=<<;<<;=AA?=:632224)),-{{11)-3{45564388;<;:99<>AA6,,-.222235542688<<<BDDHEEIB>??ADBGC====?=?@AC@???DAAA at A??==:898:<;::897=?A@?:=;=BBBB=:33447=7<877<22232-+,,?CENEBEC1111AFDDEFFFFGHJFEEEC?@B<=<+)*3788:FDCABBADC?>=D:6,*)))(''**2/;;99?FDCDBEABCCDDDCAB@@AACB at CAAACCDEDD@@CDDBEFGC9778=A:999=??>??=<<=>;:;<@D@?>?ABA at AAFG65554337;62(&&))''';@GCC>>:<>9@?HG=:8933:83+*).9<;>@?=<=>BD at 9=>==@BBEDH@@A=?FAAA at DGDB:?;@@A..0<>@A@>??FFDEFB662>>10118:8AAB<9::;<;;:5547,+454.-++/5C@@>/)()78::;;:<=<<;:5 at BBEB95456445:7568?A0+,,//.021018655,*)''*2348ADBFD@?@98;;<85438:==>?=>ACEB>>>;;;;@?@?A>ACBDDD@:;;;<FDEEEBADGDDDE???B?>::984.))448=>=>=>>>AA=>=<=>?;:9;899/.278;;;<72346=?????>>445:79789;??B=<<=@><989>9889JB@?A@=////27..--/50545499;A:8769::9763301../09>GEC@>>@<;;<;:::977542/.-------,
+ at 100:753|dab8eb3f-9aec-4e24-972f-0cb217b801fa runid=6e6c14dbcf8460aa18e3f20fad57285f682bb0bb read=167248 ch=2128 start_time=2022-07-02T19:52:37.859104+01:00 flow_cell_id=PAM17289 protocol_group_id=20220701_IV_AJ_cDNA-R1041 sample_id=1_RT_42C_90min_non-size-selected parent_read_id=dab8eb3f-9aec-4e24-972f-0cb217b801fa basecall_model_version_id=dna_r10.4.1_e8.2_hac at v3.5.1 strand=+ umi=TTTGCCATTGAAATTAGCGTTCGCCTT
+GCCATTGAAATTAGCGTTCGCCTTGGGACGCCTAGAAGACAGCGGAACTAAGAAAAGAAGAGGCCTGTGGACAAGAACAATCATGTCTGACTCCTGGTGGTGTGCGAGGTAGACCCAGAGCTAACAGAAAAGCTGAGGAAATTCCGCTTCCGAAAAGAGACAGACAATGCAGCCATCATAATGAAGGTGGACAAAGACCGGCAGATGGTGGTGTTGGAGGAAGAAGTTTCAGAACGTTCCCAGAGGAGCTCAAAATGGAGTTGCCGGAGAGACAGCCCAGGTTCGTGGTTTACAGCTACAAGTACGTGCATGACGATGGCCGAGTGTCCTACCCTTTGTGTGTTTCATCTTCTCCAGCCCTGTGGGCTGCAAGCCGGAACAACAGATGATGTATGCAGGGGATTGAACAGGGCTGGTGCAGACAGCAGAGCTCACGAAGGTCTTCGAGAAATCCGCACCACTGATGACCTCACTGAGGCCTGGCTCCAAGAAAAGTTGTCTTTCTTTCGTTGATCTCTGGGCTGGGGACTGAATTCCTGATGTCTGAGTCCTCAAGGTGACTGGGGACTTGGAACCCCTAGGACCTGAACAACCAAGACTTTAAATAAATTTTAAAATGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
++
+GEFEEF<719554441121,+,-1(:696/../2/...3/.10/012310++.765675688:6588:888;8.,-526>@ACBBC>>??<8+7)>?BE at A@{{6556AA;;;:;=8C@>==;??1../4443.**+../--,+/)))*++*+,-9<7;60))())-)////--*)**+68;<GHB?=<=<<=>739:002:?AA==??5444))(*10*')-485//1,+**-/0-.+/3.777ABBBA:...AA796?860277>?C{{>;::=<>B at A@?:88:=BBEJEECCFCDEDFK;98:<<=>IM><<<@EEFCBBCCB?;<7767<CE@?;{{{:800-**+-07789>><=>=8897A?>5...118<>@<9;3/.//./,-)((*)&%&,/*)''((,++-/=EEFEDD><9(((>EGCDC:98)(((-,+((')&&(15<=<7778@>:99:AAC>==;CEDBCA?;99-,,+.+)*/345:;??<<=>?>778<366;>ED<<<=''97:??@=B57*++)))*5434.''(')0<<;;<:<=<<85-(&%%*-&&/%%%&/(),=**+ at A??@BEEFAA:<>?D358??FKHA>>ACCKC{KZCBB<91,-/-,,++-0144578;<<::=?@@?=>@B
=====================================
pychopper/tests/data/ref.fas deleted
=====================================
@@ -1,81 +0,0 @@
->seq_0_rev
-TACCCGACTGTTGTTGAAGCTCCTGACGGCTTGCAATTTATATTTACCTTTGCTGCAGTT
-GACTATAGGTAGTTTAATAAAACTCGGGTTGGTCATTGTCGGCCTAACGACACGTACACT
-AGTGTGCTCCTGAACAAAAGCGCGTTAGCTCGGGGATGGCGGGAGATCCGTCTACTCTCA
-TCGACACTTCTTACAGTAGTATTCATCTTTCCCCGTCGGGTGATCATCGATACCGCGTTA
-GATCGCAACTTTCGCAGTATCCTTACAACCGAACAATTTGTTAATTTGAAAGGTTCAATT
-TGGTCTTCTTCCCCGCCGCGTCGAAGACCGAACTATGTTATTAACCGAAATGTCGCGCTA
-GTGCTGGGCGGTATCACTTCCAGTATTGCTTACGTGGGCTATGCGCATGTCGATTAGTCT
-CTGGAGGTCTTAGGTGGCTCCAATTACTCCGAGAACCAGTTTTATGTCACGCCGAACGGG
-CATTTTCCTGGGTGTTGATGTGTTTACTCTCGAACTCAACATGAAAAATTATCTCGTATC
-TGTACGGCCATCGGTTAGTAACGAAGTTTAACTACAACGAACTGCACGACTCGCAACCGA
-TGGAATTACCAGGGGTCATACAGAGTTTAACCATCGCCGTCTCCCCAGGGAGGAACTAGG
-TTTATCGTCCCGGAGTATGCCCGCGTGTACTCTAACTGAGGTAGCAACACTATGCTTCTT
-CCATCGCGTGCCGCTAGTACCCGCACGGCGGTCACCCCCTCTCTAGGACTCCCTGGTAAT
-CACATTGTTGGGGATTGCAA
->seq_0
-TTGCAATCCCCAACAATGTGATTACCAGGGAGTCCTAGAGAGGGGGTGACCGCCGTGCGG
-GTACTAGCGGCACGCGATGGAAGAAGCATAGTGTTGCTACCTCAGTTAGAGTACACGCGG
-GCATACTCCGGGACGATAAACCTAGTTCCTCCCTGGGGAGACGGCGATGGTTAAACTCTG
-TATGACCCCTGGTAATTCCATCGGTTGCGAGTCGTGCAGTTCGTTGTAGTTAAACTTCGT
-TACTAACCGATGGCCGTACAGATACGAGATAATTTTTCATGTTGAGTTCGAGAGTAAACA
-CATCAACACCCAGGAAAATGCCCGTTCGGCGTGACATAAAACTGGTTCTCGGAGTAATTG
-GAGCCACCTAAGACCTCCAGAGACTAATCGACATGCGCATAGCCCACGTAAGCAATACTG
-GAAGTGATACCGCCCAGCACTAGCGCGACATTTCGGTTAATAACATAGTTCGGTCTTCGA
-CGCGGCGGGGAAGAAGACCAAATTGAACCTTTCAAATTAACAAATTGTTCGGTTGTAAGG
-ATACTGCGAAAGTTGCGATCTAACGCGGTATCGATGATCACCCGACGGGGAAAGATGAAT
-ACTACTGTAAGAAGTGTCGATGAGAGTAGACGGATCTCCCGCCATCCCCGAGCTAACGCG
-CTTTTGTTCAGGAGCACACTAGTGTACGTGTCGTTAGGCCGACAATGACCAACCCGAGTT
-TTATTAAACTACCTATAGTCAACTGCAGCAAAGGTAAATATAAATTGCAAGCCGTCAGGA
-GCTTCAACAACAGTCGGGTA
->seq_1
-CTCCTGAAGTGTGTGATTGCAATGTGTCCCCTCCCTTACCGTCAATCTGAATTTAATGAA
-GCCACGGAGGTTGAATCACGCATAAGAAGGTGACTAAGATCAAGCTTCTGAATGCAGCTA
-CAAGAATGCTCACCGTCCCAGAGTATTCGGGCAATAGAATGAGATCTAGTCACATCATTA
-AAAGAATACCGAATTGAATTTACCGTTTGCTTCTTTCTTCAGGGAAAGACCGCACTATCG
-CGGTTAGCGTGCTCGCAAGACCTATCCATAGATCGTCGTCCGAGCGGCATCATAAATTTC
-CCTGAACTATCGCTTGGAGATGGATCGACATTAAAAAATTACCTGTACTTGAATGGACGT
-GAAGTCCCTCCTAGCTAATGGCCTGTCACAAGCCTTATCCAAGGATCGCTAGATCCCAGT
-CACAACCGGTACACAGTCTACGGTAGCTTGAAAACATCTCGCGGAGTTACAAAGTTCGTG
-GGCACTTATTTAGCCATCGAGGATGGTGAGTGATGGCTGAGGTCTAGGGGGCTGATTAGA
-ACATTTTTTCGCTGTGTCCCTACGCCCCAGCTGCGCTTAATGGTAGCTTTTATTGTCGCC
-CTACTAATTGTGCAGTGGAACTACTATTTAGATAATCACCTTCCAGACTATTGGTCGTTA
-TTCTCTTTGACATTGGGACAATCAGTTGTATGCCTCATTCCTCGCTCACACAGCGACGCA
-GTCTTATTTACGGACAATCTCGTGCGTCGATGGGCAGTCGGACGGCTACCAGCCGAACTA
-ATTAGTGACGACCGGCAATTAGCGAACCAGAACAAGTCCGACAAAAATCCAGGACGTGCT
-ACCGGGTGGCAGGGTGAAGAGGGTACTCCTAAGCTACCGACTGCTATGCTTTGGGTATCA
-TCCATCGGTTGAACTCGCCTTGGGCTCCTGACGCCATTCTAAATAGAGCAAATGGCTGGT
-TCATTCTATTTGAAGGAGGCTCCGATGGTCTAAATCGATGTAGTCTGACCTGTTTTACCG
-TTACGGTGTGCCAGTTAGCTGGAGTTTGGGCTTCTTCCATCATGGTGTGGCCACTGGGAC
-CTTTGTTCTTACAGGATTCGCACCAATAAGCACACGAGTTGAGCCCGCTAAACCACTCGC
-CATCGGTCACATGCCGATAATACGCAACGTCACTAAAGATTCCGCTATGGCCTCAGTCGC
-GTGACGCGACTTATATGTGACAAGACGGTAACCGTGCCTCCACATTACATATCAAATTAG
-GTTGAGCTTAAACACAGGTCCGCCAGCCTTCGCGCATTCCTCTGCAGAGCGGGAAATCAT
-GACTTCTGCACGTTCTAATACAAGCGGCAGCAGGCTGCAACGGACCTTTGTTTCGACTAA
-GACCGTTGACCTGAATTCACGACGTCTTACGCGGGGGGAATATCAGTTTATCGGTACTGG
-TAGACTCACGCGAACAGTGAAAACCGGCGGAGTGCTTAACAGCACATTATTTGGAGAGTA
-GGAGGGAAGGTGAGTAAAGAGGGATATCAGGGCTACACTGTACGGATGCTGTCGGTAAAT
-GTACATTCGAAATAACCATTTTATAACACCTTCTTCGTAAGTGGGCAGCTAGTTATCCAG
-CTCGACTTTGCACGAATCACCCTCCGGATAACTGAAGCCCCCCAGACAGGAAAGCGCCGC
-GGACGGTGAATTGGCCGAGAGGATGGCCATAGGCCTAACGGTCTTTATGACAGTTGGGAG
-TATATTGAACCATCGCGCGGGAGTAAAATGTATTCTGTACCGCTGGCCGTCAAGAATTGT
-GCAACTGGTGTGAGGAGTGACTAACTGGTGATTGGCGTGTCGATCCAGGTCCAAATGAAT
-ATACGACGCAGTGTGGAACTTGGTGATTTATGAAAGTTTTGGGACCACACAACTCATTAG
-GTCCGTATGGCCGTTACTGCAATCGCATCTAGCTCCGGTCGAGGACGGTATAGGCAGACC
-TCTCCAACACATGATGCTAGAACAACTTGGCCTGACTTGTTGCTCCCTAATGAGGGATCT
-ATATCCCTCTGTGGGTCATTAGCTCTTCCTCTCTATTATTATGGTATATCAAAGGGGCAA
-TGCCCGGCCGGCCAAACCCGCCAGCGACCACCGCTAATTCCAGGTCTCGCAACTATCGAC
-AGTCTTAAGCACGCTATAGGAGCATACGGATATCTAGTCGACACTCTGCGACCAATCGGG
-GCTGAAATATACACAAGTAGGTATGCACTAGCTCACCAGTCTGTCCCAGTTTTGTTCTTC
-ACGATCTAATGCTTCGGGGGGTCGGTTGGCCCGACAGGCGTATGGTCTCTGCTATGGCAT
-TTCCCGGCAAAGGGTGGGTACGTAACATCATCGGATCGAAGGTACATGGGATGATTTCAC
-TGTATTAGACCTTCATTGCGTCTCCATTTGGATGTACTAACATTGACTGGCGATGCAGCC
-ACTAACTATATAGACGCCCATTATCCCAATGCAGTAGCTCCCGGTTAAATTTTCCGGCTC
-TGCATCGTGTCGCAGTGGTGGGAGAGAGTATGTTCAGAAATGGAGGTCATTACGGGCAAT
-GCGTTTAATGGTTGTACTCTGAAAGCGCACGGGTCCTACGATTAGTGGGCCCTTGAATCT
-TGAACGATCGGAGTCGGCACTTCACAAAACGCGTAAGCATTTAAAGCCGTCGCCCACGTT
-CTGAACTAAATGGACGTCTGGAAACAACTTCTGCATGCATCTGCTATGCTATGAACATCG
-CTTGAACATAGGAAAAGCGTGGCTCTCGCACTAATGATTGCGTTACCAACCCACACTGGG
-CCAAGAGTGACGGCATAAGACAAGAGTTATTTGACGCTATCCCAATCTTCGTTTCAGCCT
-CGTTCCACTACACATAACTCTTATTTTTGTCTTACGGAACTGACAGGCCGGCTCTATGAT
-GGATGCGAAATCGGCATACGAACTACCATCTCATCTCGAGTTGAGAGACGGGACGAATCC
=====================================
pychopper/tests/data/ref.fas.gz
=====================================
Binary files /dev/null and b/pychopper/tests/data/ref.fas.gz differ
=====================================
pychopper/tests/data/ref.fq deleted
=====================================
@@ -1,12 +0,0 @@
- at seq_0_rev
-TACCCGACTGTTGTTGAAGCTCCTGACGGCTTGCAATTTATATTTACCTTTGCTGCAGTTGACTATAGGTAGTTTAATAAAACTCGGGTTGGTCATTGTCGGCCTAACGACACGTACACTAGTGTGCTCCTGAACAAAAGCGCGTTAGCTCGGGGATGGCGGGAGATCCGTCTACTCTCATCGACACTTCTTACAGTAGTATTCATCTTTCCCCGTCGGGTGATCATCGATACCGCGTTAGATCGCAACTTTCGCAGTATCCTTACAACCGAACAATTTGTTAATTTGAAAGGTTCAATTTGGTCTTCTTCCCCGCCGCGTCGAAGACCGAACTATGTTATTAACCGAAATGTCGCGCTAGTGCTGGGCGGTATCACTTCCAGTATTGCTTACGTGGGCTATGCGCATGTCGATTAGTCTCTGGAGGTCTTAGGTGGCTCCAATTACTCCGAGAACCAGTTTTATGTCACGCCGAACGGGCATTTTCCTGGGTGTTGATGTGTTTACTCTCGAACTCAACATGAAAAATTATCTCGTATCTGTACGGCCATCGGTTAGTAACGAAGTTTAACTACAACGAACTGCACGACTCGCAACCGATGGAATTACCAGGGGTCATACAGAGTTTAACCATCGCCGTCTCCCCAGGGAGGAACTAGGTTTATCGTCCCGGAGTATGCCCGCGTGTACTCTAACTGAGGTAGCAACACTATGCTTCTTCCATCGCGTGCCGCTAGTACCCGCACGGCGGTCACCCCCTCTCTAGGACTCCCTGGTAATCACATTGTTGGGGATTGCAA
-+
-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- at seq_0
-TTGCAATCCCCAACAATGTGATTACCAGGGAGTCCTAGAGAGGGGGTGACCGCCGTGCGGGTACTAGCGGCACGCGATGGAAGAAGCATAGTGTTGCTACCTCAGTTAGAGTACACGCGGGCATACTCCGGGACGATAAACCTAGTTCCTCCCTGGGGAGACGGCGATGGTTAAACTCTGTATGACCCCTGGTAATTCCATCGGTTGCGAGTCGTGCAGTTCGTTGTAGTTAAACTTCGTTACTAACCGATGGCCGTACAGATACGAGATAATTTTTCATGTTGAGTTCGAGAGTAAACACATCAACACCCAGGAAAATGCCCGTTCGGCGTGACATAAAACTGGTTCTCGGAGTAATTGGAGCCACCTAAGACCTCCAGAGACTAATCGACATGCGCATAGCCCACGTAAGCAATACTGGAAGTGATACCGCCCAGCACTAGCGCGACATTTCGGTTAATAACATAGTTCGGTCTTCGACGCGGCGGGGAAGAAGACCAAATTGAACCTTTCAAATTAACAAATTGTTCGGTTGTAAGGATACTGCGAAAGTTGCGATCTAACGCGGTATCGATGATCACCCGACGGGGAAAGATGAATACTACTGTAAGAAGTGTCGATGAGAGTAGACGGATCTCCCGCCATCCCCGAGCTAACGCGCTTTTGTTCAGGAGCACACTAGTGTACGTGTCGTTAGGCCGACAATGACCAACCCGAGTTTTATTAAACTACCTATAGTCAACTGCAGCAAAGGTAAATATAAATTGCAAGCCGTCAGGAGCTTCAACAACAGTCGGGTA
-+
-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- at seq_1
-CTCCTGAAGTGTGTGATTGCAATGTGTCCCCTCCCTTACCGTCAATCTGAATTTAATGAAGCCACGGAGGTTGAATCACGCATAAGAAGGTGACTAAGATCAAGCTTCTGAATGCAGCTACAAGAATGCTCACCGTCCCAGAGTATTCGGGCAATAGAATGAGATCTAGTCACATCATTAAAAGAATACCGAATTGAATTTACCGTTTGCTTCTTTCTTCAGGGAAAGACCGCACTATCGCGGTTAGCGTGCTCGCAAGACCTATCCATAGATCGTCGTCCGAGCGGCATCATAAATTTCCCTGAACTATCGCTTGGAGATGGATCGACATTAAAAAATTACCTGTACTTGAATGGACGTGAAGTCCCTCCTAGCTAATGGCCTGTCACAAGCCTTATCCAAGGATCGCTAGATCCCAGTCACAACCGGTACACAGTCTACGGTAGCTTGAAAACATCTCGCGGAGTTACAAAGTTCGTGGGCACTTATTTAGCCATCGAGGATGGTGAGTGATGGCTGAGGTCTAGGGGGCTGATTAGAACATTTTTTCGCTGTGTCCCTACGCCCCAGCTGCGCTTAATGGTAGCTTTTATTGTCGCCCTACTAATTGTGCAGTGGAACTACTATTTAGATAATCACCTTCCAGACTATTGGTCGTTATTCTCTTTGACATTGGGACAATCAGTTGTATGCCTCATTCCTCGCTCACACAGCGACGCAGTCTTATTTACGGACAATCTCGTGCGTCGATGGGCAGTCGGACGGCTACCAGCCGAACTAATTAGTGACGACCGGCAATTAGCGAACCAGAACAAGTCCGACAAAAATCCAGGACGTGCTACCGGGTGGCAGGGTGAAGAGGGTACTCCTAAGCTACCGACTGCTATGCTTTGGGTATCATCCATCGGTTGAACTCGCCTTGGGCTCCTGACGCCATTCTAAATAGAGCAAATGGCTGGTTCATTCTATTTGAAGGAGGCTCCGATGGTCTAAATCGATGTAGTCTGACCTGTTTTACCGTTACGGTGTGCCAGTTAGCTGGAGTTTGGGCTTCTTCCATCATGGTGTGGCCACTGGGACCTTTGTTCTTACAGGATTCGCACCAATAAGCACACGAGTTGAGCCCGCTAAACCACTCGCCATCGGTCACATGCCGATAATACGCAACGTCACTAAAGATTCCGCTATGGCCTCAGTCGCGTGACGCGACTTATATGTGACAAGACGGTAACCGTGCCTCCACATTACATATCAAATTAGGTTGAGCTTAAACACAGGTCCGCCAGCCTTCGCGCATTCCTCTGCAGAGCGGGAAATCATGACTTCTGCACGTTCTAATACAAGCGGCAGCAGGCTGCAACGGACCTTTGTTTCGACTAAGACCGTTGACCTGAATTCACGACGTCTTACGCGGGGGGAATATCAGTTTATCGGTACTGGTAGACTCACGCGAACAGTGAAAACCGGCGGAGTGCTTAACAGCACATTATTTGGAGAGTAGGAGGGAAGGTGAGTAAAGAGGGATATCAGGGCTACACTGTACGGATGCTGTCGGTAAATGTACATTCGAAATAACCATTTTATAACACCTTCTTCGTAAGTGGGCAGCTAGTTATCCAGCTCGACTTTGCACGAATCACCCTCCGGATAACTGAAGCCCCCCAGACAGGAAAGCGCCGCGGACGGTGAATTGGCCGAGAGGATGGCCATAGGCCTAACGGTCTTTATGACAGTTGGGAGTATATTGAACCATCGCGCGGGAGTAAAATGTATTCTGTACCGCTGGCCGTCAAGAATTGTGCAACTGGTGTGAGGAGTGACTAACTGGTGATTGGCGTGTCGATCCAGGTCCAAATGAATATACGACGCAGTGTGGAACTTGGTGATTTATGAAAGTTTTGGGACCACACAACTCATTAGGTCCGTATGGCCGTTACTGCAATCGCATCTAGCTCCGGTCGAGGACGGTATAGGCAGACCTCTCCAACACATGATGCTAGAACAACTTGGCCTGACTTGTTGCTCCCTAATGAGGGATCTATATCCCTCTGTGGGTCATTAGCTCTTCCTCTCTATTATTATGGTATATCAAAGGGGCAATGCCCGGCCGGCCAAACCCGCCAGCGACCACCGCTAATTCCAGGTCTCGCAACTATCGACAGTCTTAAGCACGCTATAGGAGCATACGGATATCTAGTCGACACTCTGCGACCAATCGGGGCTGAAATATACACAAGTAGGTATGCACTAGCTCACCAGTCTGTCCCAGTTTTGTTCTTCACGATCTAATGCTTCGGGGGGTCGGTTGGCCCGACAGGCGTATGGTCTCTGCTATGGCATTTCCCGGCAAAGGGTGGGTACGTAACATCATCGGATCGAAGGTACATGGGATGATTTCACTGTATTAGACCTTCATTGCGTCTCCATTTGGATGTACTAACATTGACTGGCGATGCAGCCACTAACTATATAGACGCCCATTATCCCAATGCAGTAGCTCCCGGTTAAATTTTCCGGCTCTGCATCGTGTCGCAGTGGTGGGAGAGAGTATGTTCAGAAATGGAGGTCATTACGGGCAATGCGTTTAATGGTTGTACTCTGAAAGCGCACGGGTCCTACGATTAGTGGGCCCTTGAATCTTGAACGATCGGAGTCGGCACTTCACAAAACGCGTAAGCATTTAAAGCCGTCGCCCACGTTCTGAACTAAATGGACGTCTGGAAACAACTTCTGCATGCATCTGCTATGCTATGAACATCGCTTGAACATAGGAAAAGCGTGGCTCTCGCACTAATGATTGCGTTACCAACCCACACTGGGCCAAGAGTGACGGCATAAGACAAGAGTTATTTGACGCTATCCCAATCTTCGTTTCAGCCTCGTTCCACTACACATAACTCTTATTTTTGTCTTACGGAACTGACAGGCCGGCTCTATGATGGATGCGAAATCGGCATACGAACTACCATCTCATCTCGAGTTGAGAGACGGGACGAATCC
-+
-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
=====================================
pychopper/tests/data/ref.fq.gz
=====================================
Binary files /dev/null and b/pychopper/tests/data/ref.fq.gz differ
=====================================
pychopper/tests/test_regression_simple.py
=====================================
@@ -9,16 +9,29 @@ class TestIntegration(unittest.TestCase):
def testIntegration(self):
""" Integration test. """
base = path.dirname(__file__)
- pkg_base = path.dirname(path.dirname(base))
test_base = path.join(base, 'data')
- cdna_classifier = path.join(pkg_base, 'scripts', 'cdna_classifier.py')
barcodes = path.join(test_base, 'barcodes.fas')
- input_fasta = path.join(test_base, 'ref.fq')
- output_fasta = path.join(test_base, 'test_output.fq')
+ input_fasta = path.join(test_base, 'ref.fq.gz')
+ output_fasta = path.join(test_base, 'test_output.fq.gz')
expected_output = path.join(test_base, 'expected_output.fas')
- subprocess.call("{} {} {} {} {}".format(cdna_classifier, "-Y 0 -B 3 -q 0.5 -m edlib -b", barcodes, input_fasta, output_fasta), shell=True)
+ subprocess.call("{} {} {} {} {}".format('pychopper', "-Y 0 -B 3 -q 0.5 -m edlib -b", barcodes, input_fasta, output_fasta), shell=True)
retval = subprocess.call(['cmp', output_fasta, expected_output])
self.assertEqual(retval, 0)
os.remove(output_fasta)
+
+ def testIntegration_umi(self):
+ """ Integration test. """
+ base = path.dirname(__file__)
+ test_base = path.join(base, 'data')
+
+ input_fasta = path.join(test_base, 'PCS111_umi_test_reads.fastq.gz')
+ output_fasta = path.join(test_base, 'test_output_umi.fq.gz')
+ expected_output = path.join(test_base, 'PCS111_umi_test_reads_expected.fastq')
+
+ subprocess.call("{} {} {} {}".format('pychopper', "-U -m edlib -k PCS111", input_fasta, output_fasta), shell=True)
+ retval = subprocess.call(['cmp', output_fasta, expected_output])
+ self.assertEqual(retval, 0)
+ os.remove(output_fasta)
+
=====================================
pychopper/utils.py
=====================================
@@ -50,8 +50,8 @@ def hit2bed(hit, read):
return bed_line
-def count_fastq_records(fname, size=128000000):
- fh = open(fname, "r")
+def count_fastq_records(fname, size=128000000, opener=open):
+ fh = opener(fname, "r")
count = 0
while True:
b = fh.read(size)
=====================================
requirements.txt
=====================================
@@ -2,6 +2,7 @@ parasail
biopython
matplotlib
six
-tqdm
+tqdm==4.26.0
pandas
+numpy
pytest
=====================================
setup.cfg deleted
=====================================
@@ -1,22 +0,0 @@
-[bumpversion]
-current_version = 2.5.0
-commit = True
-tag = True
-
-[bumpversion:file:setup.py]
-search = version='{current_version}'
-replace = version='{new_version}'
-
-[bumpversion:file:pychopper/__init__.py]
-search = __version__ = '{current_version}'
-replace = __version__ = '{new_version}'
-
-[bdist_wheel]
-universal = 1
-
-[flake8]
-exclude = docs
-
-[aliases]
-test = pytest
-
=====================================
setup.py
=====================================
@@ -1,55 +1,74 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from setuptools import setup
+import os
from glob import glob
+import sys
+import re
+from setuptools import setup, find_packages
+import pkg_resources
+
+__pkg_name__ = 'pychopper'
+__author__ = 'cwright'
+__description__ = 'Identify, orient, and trim full-length ONT cDNA reads.'
+
+# Use readme as long description and say its github-flavour markdown
+from os import path
+this_directory = path.abspath(path.dirname(__file__))
+kwargs = {'encoding':'utf-8'} if sys.version_info.major == 3 else {}
+with open(path.join(this_directory, 'README.md'), **kwargs) as f:
+ __long_description__ = f.read()
+__long_description_content_type__ = 'text/markdown'
+
+__path__ = os.path.dirname(__file__)
+__pkg_path__ = os.path.join(os.path.join(__path__, __pkg_name__))
+
+# Get the version number from __init__.py
+verstrline = open(os.path.join(__pkg_name__, '__init__.py'), 'r').read()
+vsre = r"^__version__ = ['\"]([^'\"]*)['\"]"
+mo = re.search(vsre, verstrline, re.M)
+if mo:
+ __version__ = mo.group(1)
+else:
+ raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__))
+
+dir_path = os.path.dirname(__file__)
+with open(os.path.join(dir_path, 'requirements.txt')) as fh:
+ install_requires = [
+ str(requirement) for requirement in
+ pkg_resources.parse_requirements(fh)]
+
+data_files = []
+extra_requires = {}
+extensions = []
-with open('README.md') as readme_file:
- readme = readme_file.read()
-
-requirements = [
- 'edlib',
- 'parasail',
- 'matplotlib',
- 'seaborn',
- 'tqdm',
- 'six',
- 'pandas',
- 'pytest',
- 'sphinx',
- 'sphinx_rtd_theme',
-]
-
-test_requirements = [
- # TODO: put package test requirements here
-]
setup(
- name='pychopper',
- version='2.5.0',
- description="A tool to identify full length cDNA reads.",
- long_description=readme,
- author="ONT Applications Group",
- author_email='Apps at nanoporetech.com',
- url='',
+ name=__pkg_name__,
+ version=__version__,
+ url='https://github.com/epi2me-labs/{}'.format(__pkg_name__),
+ author=__author__,
+ author_email='{}@nanoporetech.com'.format(__author__),
+ description=__description__,
+ long_description=__long_description__,
+ long_description_content_type=__long_description_content_type__,
+ dependency_links=[],
+ ext_modules=extensions,
+ install_requires=install_requires,
+ tests_require=[].extend(install_requires),
+ extras_require=extra_requires,
packages=[
'pychopper',
+ 'pychopper.scripts',
'pychopper.phmm_data',
'pychopper.primer_data'
],
- package_dir={'pychopper':
- 'pychopper'},
+ package_dir={'pychopper': 'pychopper'},
package_data={'pychopper': ['primer_data/*.fas', 'phmm_data/*.*']},
- include_package_data=True,
- install_requires=requirements,
zip_safe=False,
- keywords='pychopper',
- classifiers=[
- 'Development Status :: 2 - Pre-Alpha',
- 'Intended Audience :: Developers',
- 'Natural Language :: English',
- "Programming Language :: Python :: 3",
- ],
- tests_require=test_requirements,
- scripts=[x for x in glob('scripts/*.py') if x != 'scripts/__init__.py']
+ data_files=data_files,
+ # scripts=[x for x in glob('scripts/*.py') if x != 'scripts/__init__.py']
+ entry_points ={
+ 'console_scripts': [
+ 'pychopper=pychopper.scripts.pychopper:main'
+ ]
+ }
)
+
View it on GitLab: https://salsa.debian.org/med-team/pychopper/-/commit/4f6485e9be34596093f579f8c672872c91580fc2
--
View it on GitLab: https://salsa.debian.org/med-team/pychopper/-/commit/4f6485e9be34596093f579f8c672872c91580fc2
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221119/361a47f3/attachment-0001.htm>
More information about the debian-med-commit
mailing list