[med-svn] [Git][med-team/catfishq][upstream] New upstream version 1.2.1+ds

Tony Mancill (@tmancill) gitlab at salsa.debian.org
Tue Nov 9 17:12:44 GMT 2021



Tony Mancill pushed to branch upstream at Debian Med / catfishq


Commits:
68f924e2 by tony mancill at 2021-11-09T09:06:23-08:00
New upstream version 1.2.1+ds
- - - - -


6 changed files:

- PKG-INFO
- README.md
- catfishq.egg-info/PKG-INFO
- catfishq/__init__.py
- catfishq/cat_fastq.py
- setup.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,10 +1,10 @@
 Metadata-Version: 1.0
 Name: catfishq
-Version: 1.1.5
+Version: 1.2.1
 Summary: Cat FASTQ files
 Home-page: UNKNOWN
 Author: philres
 Author-email: UNKNOWN
-License: UNKNOWN
+License: MIT
 Description: UNKNOWN
 Platform: UNKNOWN


=====================================
README.md
=====================================
@@ -3,14 +3,55 @@ Takes paths to an arbitrary number of zipped and unzipped FASTQ files and/or fol
 
 Supported file extensions are: `'*.fastq', '*.fastq.gz', '*.fasta', '*.fasta.gz', '*.fa', '*.fa.gz', '*.fq', '*.fq.gz'`
 
+May also be used to filter FQ reads by read ID, read length, q-score, and min/max sequencing time.
+
+
 # Install
 ``` bash
 pip install catfishq
 ```
 
-# Example
+# Examples
+
+Check full command list:
+
 ```bash
-$ catfishq -r test/
+$ catfishq --help;
 ```
 
+Merge all FQ files within a target directory:
+
+```bash
+$ catfishq test/ > test.fastq;
+```
 
+Merge all FQ files within a target directory and its sub-directories (recursive):
+
+```bash
+$ catfishq -r test/ > test.fastq;
+```
+
+Merge the first 1000 reads:
+
+```bash
+$ catfishq -n 1000 test/ > test_1st_1000.fastq;
+```
+
+Merge reads with a length >=50bp and a q-score >=10:
+```bash
+$ catfishq -l 50 -q 10 test/ > test_filt.fastq;
+```
+
+Merge reads collected <60mins from sequencing start:
+
+```bash
+catfishq --min-sequencing-time 0 --max-sequencing-time 60 test/ > test_60_min.fastq;  #merge reads
+```
+
+Note that when looping catfishq over multiple folders from the same run, it is quicker to grab the start time via `--print-start-time` and provide it to catfishq via `--start-time "$timestamp"`.
+
+```bash
+$ t0="$(catfishq --print-start-time test1)";
+catfishq --max-sequencing-time 60 --start-time "$t0" test1/ > test1_60_min.fastq;
+catfishq --max-sequencing-time 60 --start-time "$t0" test2/ > test2_60_min.fastq;
+```


=====================================
catfishq.egg-info/PKG-INFO
=====================================
@@ -1,10 +1,10 @@
 Metadata-Version: 1.0
 Name: catfishq
-Version: 1.1.5
+Version: 1.2.1
 Summary: Cat FASTQ files
 Home-page: UNKNOWN
 Author: philres
 Author-email: UNKNOWN
-License: UNKNOWN
+License: MIT
 Description: UNKNOWN
 Platform: UNKNOWN


=====================================
catfishq/__init__.py
=====================================
@@ -1 +1 @@
-__version__ = "1.1.5"
+__version__ = "1.2.1"


=====================================
catfishq/cat_fastq.py
=====================================
@@ -122,6 +122,14 @@ def parse_args(argv):
         help="Remove duplicated reads.",
     )
 
+    parser.add_argument(
+        "--comments",
+        choices=['forward', 'skip', 'wrap'],
+        default='wrap',
+        help="How to treat FASTQ header comments. "
+             "`forward` them 'as is', `wrap` them into 'CO:Z:xxx' tag, `skip` them in the output.",
+    )
+
     parser.add_argument(
         "FASTQ",
         nargs="+",
@@ -174,6 +182,14 @@ def find_file_in_folder(
     return files
 
 
+def parse_timestamp(time_str):
+    try:
+        time_obj = datetime.strptime(time_str,'%Y-%m-%dT%H:%M:%SZ')
+    except ValueError:
+        time_obj = datetime.strptime(time_str,'%Y-%m-%dT%H:%M:%S.%f%z').replace(tzinfo=None)
+    return time_obj
+
+
 def check_seq_time(comment, max_start_time,min_start_time):
     #This tests if the start time of the respective read is between
     #max_sequencing_time and min_sequencing_time
@@ -183,7 +199,7 @@ def check_seq_time(comment, max_start_time,min_start_time):
     else:
         matchObj = re.search( r'start_time=([^ ]+)', comment, re.M|re.I)
         start_str = matchObj.group(1)
-        start = datetime.strptime(start_str,'%Y-%m-%dT%H:%M:%SZ')
+        start = parse_timestamp(start_str)
 
         bool_min=0
         bool_max=0
@@ -203,7 +219,7 @@ def compare_start_time(comment,min_start_time):
     #The smaller time is returned
     matchObj = re.search( r'start_time=([^ ]+)', comment, re.M|re.I)
     start_time_str = matchObj.group(1)
-    start_time = datetime.strptime(start_time_str,'%Y-%m-%dT%H:%M:%SZ')
+    start_time = parse_timestamp(start_time_str)
 
     if(min_start_time==0):
         return start_time
@@ -213,7 +229,7 @@ def compare_start_time(comment,min_start_time):
         return start_time
 
 
-def parse_fastqs(filename, min_len=0, min_qscore=0, max_start_time=None, min_start_time=None):
+def parse_fastqs(filename, min_len=0, min_qscore=0, max_start_time=None, min_start_time=None, comments='wrap'):
     with pysam.FastxFile(filename) as fh:
         for entry in fh:
             if min_len and len(entry.sequence) < min_len:
@@ -225,9 +241,10 @@ def parse_fastqs(filename, min_len=0, min_qscore=0, max_start_time=None, min_sta
                 continue
             if not check_seq_time(entry.comment, max_start_time, min_start_time):
                 continue
-            if entry.comment:
+            if entry.comment and comments == 'wrap':
                 entry.comment = "CO:Z:{}".format(entry.comment)
-
+            elif comments == 'skip':
+                entry.comment = None
             yield entry
 
 
@@ -265,7 +282,7 @@ def get_start_time(paths,recursive=False):
     return min_start_time
 
 
-def format_fq(paths, out_filename, min_len=0, min_qscore=0, max_n=0, max_bp=0, recursive=False, dedup=False, max_seq_time=0, min_seq_time=0, start_time=0, filter_read_ids_file=None):
+def format_fq(paths, out_filename, min_len=0, min_qscore=0, max_n=0, max_bp=0, recursive=False, dedup=False, max_seq_time=0, min_seq_time=0, start_time=0, filter_read_ids_file=None, comments='wrap'):
     """
     Concatenate FASTQ files
 
@@ -289,7 +306,7 @@ def format_fq(paths, out_filename, min_len=0, min_qscore=0, max_n=0, max_bp=0, r
 
     if start_time:
         if not start_time=="min":
-            start = datetime.strptime(start_time,'%Y-%m-%dT%H:%M:%SZ')
+            start = parse_timestamp(start_time)
 
             if(max_seq_time):
                 max_start_time = start + timedelta(minutes=max_seq_time)
@@ -298,7 +315,7 @@ def format_fq(paths, out_filename, min_len=0, min_qscore=0, max_n=0, max_bp=0, r
         else:
             #This option allows to automatically use the minmal start_time of
             #all the given fastq files as input for --start-time
-            auto_start_time=get_start_time(paths,recursive)
+            start=get_start_time(paths,recursive)
 
             if(max_seq_time):
                 max_start_time = start + timedelta(minutes=max_seq_time)
@@ -314,7 +331,7 @@ def format_fq(paths, out_filename, min_len=0, min_qscore=0, max_n=0, max_bp=0, r
             logging.debug("Found {} files".format(len(filenames)))
             for filename in filenames:
                 for entry in parse_fastqs(
-                    filename, min_len=min_len, min_qscore=min_qscore, max_start_time=max_start_time, min_start_time=min_start_time
+                    filename, min_len=min_len, min_qscore=min_qscore, max_start_time=max_start_time, min_start_time=min_start_time, comments=comments
                 ):
                     if dedup and entry.name in read_ids:
                         continue
@@ -363,7 +380,8 @@ def main(argv=sys.argv[1:]):
             max_seq_time=args.MAX_SEQ_TIME,
             min_seq_time=args.MIN_SEQ_TIME,
             start_time=args.START_TIME,
-            filter_read_ids_file=args.FILTER_ID
+            filter_read_ids_file=args.FILTER_ID,
+            comments=args.comments,
         )
 
 


=====================================
setup.py
=====================================
@@ -14,6 +14,7 @@ setup(
     version=__version__,
     author='philres',
     description='Cat FASTQ files',
+    license='MIT',
     zip_safe=False,
     install_requires=[
         'pysam'



View it on GitLab: https://salsa.debian.org/med-team/catfishq/-/commit/68f924e27d8c5cc4b32343dc88f1167dd99adc23

-- 
View it on GitLab: https://salsa.debian.org/med-team/catfishq/-/commit/68f924e27d8c5cc4b32343dc88f1167dd99adc23
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211109/f5220b4b/attachment-0001.htm>


More information about the debian-med-commit mailing list