[med-svn] [Git][med-team/python-cgecore][upstream] New upstream version 1.5.2

Mon Jan 6 16:44:04 GMT 2020


Steffen Möller pushed to branch upstream at Debian Med / python-cgecore


Commits:
422e7b5e by Steffen Moeller at 2020-01-06T17:33:00+01:00
New upstream version 1.5.2
- - - - -


5 changed files:

- PKG-INFO
- cgecore.egg-info/PKG-INFO
- cgecore/__init__.py
- cgecore/blaster/blaster.py
- cgecore/cgefinder.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: cgecore
-Version: 1.5.0
+Version: 1.5.2
 Summary: Center for Genomic Epidemiology Core Module
 Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
 Author: Center for Genomic Epidemiology


=====================================
cgecore.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: cgecore
-Version: 1.5.0
+Version: 1.5.2
 Summary: Center for Genomic Epidemiology Core Module
 Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
 Author: Center for Genomic Epidemiology


=====================================
cgecore/__init__.py
=====================================
@@ -13,7 +13,7 @@ from .argumentparsing import (check_file_type, get_arguments, get_string,
                               )
 
 #####################
-__version__ = "1.5.0"
+__version__ = "1.5.2"
 __all__ = [
     "argumentparsing",
     "cmdline",


=====================================
cgecore/blaster/blaster.py
=====================================
@@ -98,7 +98,6 @@ class Blaster():
 
             # Parsing over the hits and only keeping the best
             for blast_record in blast_records:
-
                 # OLD CODE TO BE REMOVED
                 #  query = blast_record.query
                 #  blast_record.alignments.sort(key=lambda align: (
@@ -113,7 +112,6 @@ class Blaster():
                     key=lambda align: (max((int(hsp.identities)
                                            for hsp in align.hsps))),
                     reverse=True)
-
                 query = blast_record.query
 
                 for alignment in blast_record.alignments:
@@ -121,13 +119,13 @@ class Blaster():
                     # HSP fragment
                     best_e_value = 1
                     best_bit = 0
+                    start_hsp = 0
+                    end_hsp = 0
 
                     for hsp in alignment.hsps:
-
                         if hsp.expect < best_e_value or hsp.bits > best_bit:
-                            best_e_value = hsp.expect
-                            best_bit = hsp.bits
-
+                            # best_e_value = hsp.expect
+                            # best_bit = hsp.bits
                             tmp = alignment.title.split(" ")
                             sbjct_header = tmp[1]
                             # DEBUG
@@ -158,7 +156,6 @@ class Blaster():
                             hit_id = "%s:%s..%s:%s:%f" % (
                                 contig_name, query_start, query_end,
                                 sbjct_header, cal_score)
-
                             # If the hit is on the other strand
                             if sbjct_start > sbjct_end:
                                 tmp = sbjct_start
@@ -198,29 +195,30 @@ class Blaster():
                                             'perc_coverage': perc_coverage
                                             }
 
-                    # Saving the result if any
-                    if best_hsp:
-                        save = 1
-
-                        # If there are other gene alignments they are compared
-                        if gene_results:
-                            tmp_gene_split = gene_split
-                            tmp_results = gene_results
-
-                            # Compare the hit results
-                            save, gene_split, gene_results = (
-                                self.compare_results(save, best_hsp,
-                                                     tmp_results,
-                                                     tmp_gene_split,
-                                                     allowed_overlap)
-                            )
-
-                        # If the hit is not overlapping with other hit
-                        # seqeunces it is kept
-                        if save == 1:
-                            # DEBUG
-                            print("Saving: {}".format(hit_id))
-                            gene_results[hit_id] = best_hsp
+
+
+                        # Saving the result if any
+                        if best_hsp:
+                            save = 1
+
+                            # If there are other gene alignments they are compared
+                            if gene_results:
+                                tmp_gene_split = gene_split
+                                tmp_results = gene_results
+                                # Compare the hit results
+                                save, gene_split, gene_results = (
+                                    self.compare_results(save, best_hsp,
+                                                         tmp_results,
+                                                         tmp_gene_split,
+                                                         allowed_overlap)
+                                )
+
+                            # If the hit is not overlapping with other hit
+                            # seqeunces it is kept
+                            if save == 1:
+                                # DEBUG
+                                print("Saving: {}".format(hit_id))
+                                gene_results[hit_id] = best_hsp
 
             result_handle.close()
 
@@ -326,15 +324,23 @@ class Blaster():
             old_HSP = hit_data['HSP_length']
 
             remove_old = 0
-
             # If they align to the same gene in the database they are
             # compared
             if new_db_hit == old_db_hit:
 
-                # If the hit provids additional sequence it is kept and
+                #If the hit comes from different contig
+                if old_contig != new_contig:
+
+                    # Save a split if the new hit still creats one
+                    if(new_db_hit in tmp_gene_split
+                       and hit_id not in tmp_gene_split[new_db_hit]):
+
+                        tmp_gene_split[new_db_hit][hit_id] = 1
+
+                # If the hit provides additional sequence it is kept and
                 # the new coverage is saved otherwise the one with the
                 # highest score is kept
-                if(new_start_sbjct < old_start_sbjct
+                elif(new_start_sbjct < old_start_sbjct
                    or new_end_sbjct > old_end_sbjct):
 
                     # Save the hits as split
@@ -342,28 +348,29 @@ class Blaster():
                     if hit not in tmp_gene_split[old_db_hit]:
                         tmp_gene_split[old_db_hit][hit] = 1
 
-                else:
-                    if new_score > old_score:
+
+#                else:
+#                    if new_score > old_score:
                         # Set to remove old hit
-                        remove_old = 1
+#                        remove_old = 1
 
                         # Save a split if the new hit still creats one
-                        if(new_db_hit in tmp_gene_split
-                           and hit_id not in tmp_gene_split[new_db_hit]):
+#                        if(new_db_hit in tmp_gene_split
+#                           and hit_id not in tmp_gene_split[new_db_hit]):
 
-                            tmp_gene_split[new_db_hit][hit_id] = 1
-                    else:
-                        save = 0
+#                            tmp_gene_split[new_db_hit][hit_id] = 1
+#                    else:
+#                        save = 0
 
                         # If the old and new hit is not identical the
                         # possible saved gene split for the new hit is
                         # removed
-                        if hit_id != hit:
-                            if(new_db_hit in tmp_gene_split
-                               and hit_id in tmp_gene_split[new_db_hit]):
+#                        if hit_id != hit:
+#                            if(new_db_hit in tmp_gene_split
+#                               and hit_id in tmp_gene_split[new_db_hit]):
 
-                                del tmp_gene_split[new_db_hit][hit_id]
-                        break
+#                                del tmp_gene_split[new_db_hit][hit_id]
+#                        break
 
             # If the hits comes form the same part of the contig
             # sequnce but match different genes only the best hit is


=====================================
cgecore/cgefinder.py
=====================================
@@ -36,9 +36,28 @@ class CGEFinder():
             kma_apm=None, kma_memmode=False, kma_nanopore=False, debug=False,
             kma_add_args=None, kma_cge=False, kma_1t1=False):
         """
-           I expect that there will only be one hit pr gene, but if there are
+           TODO: Result storage - Too complex. Not effective.
+           Before changing code: Check downstream dependencies of results
+                                 dicts.
+           Currently the code stores results in four different dicts. This can
+           be reduced to just one.
+           The main dict that stores all results currently stores one result pr
+           hit, and distiguishes hits to the same gene by adding an increasing
+           integer (obtained by getting the length of the internal gene dict).
+           The main dict also stores an internal dict for each 'gene'
+           containing an empty dict for each hit.
+           Solution: Create a main<dict> -> gene<list> -> hit<dict> design, and
+           remove all the references main -> hit. This reduces redundancy and
+           the need for manipulating gene names with an incremental integer.
+
+           TODO: Method too many responsibilities
+           Solution: Create KMA class, create additional functions.
+
+           Original comment:
+           "I expect that there will only be one hit pr gene, but if there are
            more, I assume that the sequence of the hits are the same in the res
-           file and the aln file.
+           file and the aln file."
+           Not sure if this holds for the current code.
         """
         threshold = threshold * 100
         min_cov = min_cov * 100
@@ -117,16 +136,24 @@ class CGEFinder():
                          "\n{}\n{}".format(out.decode("utf-8"),
                                            err.decode("utf-8")))
 
+            gene_res_count = {}
+
             for line in res_file:
 
                 if kma_results[db] == 'No hit found':
                     kma_results[db] = dict()
-                    # kma_results[db]["excluded"] = dict()
-                    # continue
 
                 data = [data.strip() for data in line.split("\t")]
                 gene = data[0]
 
+                gene_count = gene_res_count.get(gene, 0)
+                gene_count = gene_count + 1
+                gene_res_count[gene] = gene_count
+                if(gene_count == 1):
+                    hit = gene
+                else:
+                    hit = "{}_{}".format(gene, gene_count)
+
                 sbjct_len = int(data[3])
                 sbjct_ident = float(data[4])
                 coverage = float(data[5])
@@ -134,11 +161,6 @@ class CGEFinder():
                 q_value = float(data[-2])
                 p_value = float(data[-1])
 
-                if gene not in kma_results[db]:
-                    hit = gene
-                else:
-                    hit = gene + "_" + str(len(kma_results[db][gene]) + 1)
-
                 exclude_reasons = []
 
                 if(coverage < min_cov or sbjct_ident < threshold):
@@ -146,7 +168,6 @@ class CGEFinder():
                     exclude_reasons.append(sbjct_ident)
 
                 if(exclude_reasons):
-                    # kma_results[db]["excluded"][hit] = exclude_reasons
                     kma_results["excluded"][hit] = exclude_reasons
 
                 kma_results[db][hit] = dict()
@@ -171,7 +192,7 @@ class CGEFinder():
 
             # Open align file
             with open(align_filename, "r") as align_file:
-                hit_no = dict()
+                gene_aln_count = {}
                 gene = ""
                 # Parse through alignments
                 for line in align_file:
@@ -183,17 +204,13 @@ class CGEFinder():
                     if line.startswith("#"):
                         gene = line[1:].strip()
 
-                        if gene not in hit_no:
-                            hit_no[gene] = str(1)
-                        else:
-                            hit_no[gene] += str(int(hit_no[gene]) + 1)
-
+                        gene_count = gene_aln_count.get(gene, 0)
+                        gene_aln_count[gene] = gene_count + 1
                     else:
-                        # Check if gene one of the user specified genes
-                        if hit_no[gene] == '1':
+                        if(gene_aln_count[gene] == 1):
                             hit = gene
                         else:
-                            hit = gene + "_" + hit_no[gene]
+                            hit = "{}_{}".format(gene, gene_aln_count[gene])
 
                         if hit in kma_results[db]:
                             line_data = line.split("\t")[-1].strip()
@@ -209,7 +226,7 @@ class CGEFinder():
                         else:
                             print(hit + " not in results: ", kma_results)
 
-            # concatinate all sequences lists and find subject start
+            # concatinate all sequence lists and find subject start
             # and subject end
 
             gene_align_sbjct[db] = {}
@@ -217,8 +234,6 @@ class CGEFinder():
             gene_align_homo[db] = {}
 
             for hit in kma_results[db]:
-                # if(hit == "excluded"):
-                # continue
                 align_sbjct = "".join(kma_results[db][hit]['sbjct_string'])
                 align_query = "".join(kma_results[db][hit]['query_string'])
                 align_homo = "".join(kma_results[db][hit]['homo_string'])



View it on GitLab: https://salsa.debian.org/med-team/python-cgecore/commit/422e7b5eb5e98a546ffa7d7c6c19f145a39b3ac8

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-cgecore/commit/422e7b5eb5e98a546ffa7d7c6c19f145a39b3ac8
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200106/00f72c8f/attachment-0001.html>