[med-svn] [Git][med-team/python-cgecore][upstream] New upstream version 1.5.2
Steffen Möller
gitlab at salsa.debian.org
Mon Jan 6 16:44:04 GMT 2020
Steffen Möller pushed to branch upstream at Debian Med / python-cgecore
Commits:
422e7b5e by Steffen Moeller at 2020-01-06T17:33:00+01:00
New upstream version 1.5.2
- - - - -
5 changed files:
- PKG-INFO
- cgecore.egg-info/PKG-INFO
- cgecore/__init__.py
- cgecore/blaster/blaster.py
- cgecore/cgefinder.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 1.0
Name: cgecore
-Version: 1.5.0
+Version: 1.5.2
Summary: Center for Genomic Epidemiology Core Module
Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
Author: Center for Genomic Epidemiology
=====================================
cgecore.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 1.0
Name: cgecore
-Version: 1.5.0
+Version: 1.5.2
Summary: Center for Genomic Epidemiology Core Module
Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
Author: Center for Genomic Epidemiology
=====================================
cgecore/__init__.py
=====================================
@@ -13,7 +13,7 @@ from .argumentparsing import (check_file_type, get_arguments, get_string,
)
#####################
-__version__ = "1.5.0"
+__version__ = "1.5.2"
__all__ = [
"argumentparsing",
"cmdline",
=====================================
cgecore/blaster/blaster.py
=====================================
@@ -98,7 +98,6 @@ class Blaster():
# Parsing over the hits and only keeping the best
for blast_record in blast_records:
-
# OLD CODE TO BE REMOVED
# query = blast_record.query
# blast_record.alignments.sort(key=lambda align: (
@@ -113,7 +112,6 @@ class Blaster():
key=lambda align: (max((int(hsp.identities)
for hsp in align.hsps))),
reverse=True)
-
query = blast_record.query
for alignment in blast_record.alignments:
@@ -121,13 +119,13 @@ class Blaster():
# HSP fragment
best_e_value = 1
best_bit = 0
+ start_hsp = 0
+ end_hsp = 0
for hsp in alignment.hsps:
-
if hsp.expect < best_e_value or hsp.bits > best_bit:
- best_e_value = hsp.expect
- best_bit = hsp.bits
-
+ # best_e_value = hsp.expect
+ # best_bit = hsp.bits
tmp = alignment.title.split(" ")
sbjct_header = tmp[1]
# DEBUG
@@ -158,7 +156,6 @@ class Blaster():
hit_id = "%s:%s..%s:%s:%f" % (
contig_name, query_start, query_end,
sbjct_header, cal_score)
-
# If the hit is on the other strand
if sbjct_start > sbjct_end:
tmp = sbjct_start
@@ -198,29 +195,30 @@ class Blaster():
'perc_coverage': perc_coverage
}
- # Saving the result if any
- if best_hsp:
- save = 1
-
- # If there are other gene alignments they are compared
- if gene_results:
- tmp_gene_split = gene_split
- tmp_results = gene_results
-
- # Compare the hit results
- save, gene_split, gene_results = (
- self.compare_results(save, best_hsp,
- tmp_results,
- tmp_gene_split,
- allowed_overlap)
- )
-
- # If the hit is not overlapping with other hit
- # seqeunces it is kept
- if save == 1:
- # DEBUG
- print("Saving: {}".format(hit_id))
- gene_results[hit_id] = best_hsp
+
+
+ # Saving the result if any
+ if best_hsp:
+ save = 1
+
+ # If there are other gene alignments they are compared
+ if gene_results:
+ tmp_gene_split = gene_split
+ tmp_results = gene_results
+ # Compare the hit results
+ save, gene_split, gene_results = (
+ self.compare_results(save, best_hsp,
+ tmp_results,
+ tmp_gene_split,
+ allowed_overlap)
+ )
+
+ # If the hit is not overlapping with other hit
+ # seqeunces it is kept
+ if save == 1:
+ # DEBUG
+ print("Saving: {}".format(hit_id))
+ gene_results[hit_id] = best_hsp
result_handle.close()
@@ -326,15 +324,23 @@ class Blaster():
old_HSP = hit_data['HSP_length']
remove_old = 0
-
# If they align to the same gene in the database they are
# compared
if new_db_hit == old_db_hit:
- # If the hit provids additional sequence it is kept and
+ #If the hit comes from different contig
+ if old_contig != new_contig:
+
+ # Save a split if the new hit still creats one
+ if(new_db_hit in tmp_gene_split
+ and hit_id not in tmp_gene_split[new_db_hit]):
+
+ tmp_gene_split[new_db_hit][hit_id] = 1
+
+ # If the hit provides additional sequence it is kept and
# the new coverage is saved otherwise the one with the
# highest score is kept
- if(new_start_sbjct < old_start_sbjct
+ elif(new_start_sbjct < old_start_sbjct
or new_end_sbjct > old_end_sbjct):
# Save the hits as split
@@ -342,28 +348,29 @@ class Blaster():
if hit not in tmp_gene_split[old_db_hit]:
tmp_gene_split[old_db_hit][hit] = 1
- else:
- if new_score > old_score:
+
+# else:
+# if new_score > old_score:
# Set to remove old hit
- remove_old = 1
+# remove_old = 1
# Save a split if the new hit still creats one
- if(new_db_hit in tmp_gene_split
- and hit_id not in tmp_gene_split[new_db_hit]):
+# if(new_db_hit in tmp_gene_split
+# and hit_id not in tmp_gene_split[new_db_hit]):
- tmp_gene_split[new_db_hit][hit_id] = 1
- else:
- save = 0
+# tmp_gene_split[new_db_hit][hit_id] = 1
+# else:
+# save = 0
# If the old and new hit is not identical the
# possible saved gene split for the new hit is
# removed
- if hit_id != hit:
- if(new_db_hit in tmp_gene_split
- and hit_id in tmp_gene_split[new_db_hit]):
+# if hit_id != hit:
+# if(new_db_hit in tmp_gene_split
+# and hit_id in tmp_gene_split[new_db_hit]):
- del tmp_gene_split[new_db_hit][hit_id]
- break
+# del tmp_gene_split[new_db_hit][hit_id]
+# break
# If the hits comes form the same part of the contig
# sequnce but match different genes only the best hit is
=====================================
cgecore/cgefinder.py
=====================================
@@ -36,9 +36,28 @@ class CGEFinder():
kma_apm=None, kma_memmode=False, kma_nanopore=False, debug=False,
kma_add_args=None, kma_cge=False, kma_1t1=False):
"""
- I expect that there will only be one hit pr gene, but if there are
+ TODO: Result storage - Too complex. Not effective.
+ Before changing code: Check downstream dependencies of results
+ dicts.
+ Currently the code stores results in four different dicts. This can
+ be reduced to just one.
+ The main dict that stores all results currently stores one result pr
+ hit, and distiguishes hits to the same gene by adding an increasing
+ integer (obtained by getting the length of the internal gene dict).
+ The main dict also stores an internal dict for each 'gene'
+ containing an empty dict for each hit.
+ Solution: Create a main<dict> -> gene<list> -> hit<dict> design, and
+ remove all the references main -> hit. This reduces redundancy and
+ the need for manipulating gene names with an incremental integer.
+
+ TODO: Method too many responsibilities
+ Solution: Create KMA class, create additional functions.
+
+ Original comment:
+ "I expect that there will only be one hit pr gene, but if there are
more, I assume that the sequence of the hits are the same in the res
- file and the aln file.
+ file and the aln file."
+ Not sure if this holds for the current code.
"""
threshold = threshold * 100
min_cov = min_cov * 100
@@ -117,16 +136,24 @@ class CGEFinder():
"\n{}\n{}".format(out.decode("utf-8"),
err.decode("utf-8")))
+ gene_res_count = {}
+
for line in res_file:
if kma_results[db] == 'No hit found':
kma_results[db] = dict()
- # kma_results[db]["excluded"] = dict()
- # continue
data = [data.strip() for data in line.split("\t")]
gene = data[0]
+ gene_count = gene_res_count.get(gene, 0)
+ gene_count = gene_count + 1
+ gene_res_count[gene] = gene_count
+ if(gene_count == 1):
+ hit = gene
+ else:
+ hit = "{}_{}".format(gene, gene_count)
+
sbjct_len = int(data[3])
sbjct_ident = float(data[4])
coverage = float(data[5])
@@ -134,11 +161,6 @@ class CGEFinder():
q_value = float(data[-2])
p_value = float(data[-1])
- if gene not in kma_results[db]:
- hit = gene
- else:
- hit = gene + "_" + str(len(kma_results[db][gene]) + 1)
-
exclude_reasons = []
if(coverage < min_cov or sbjct_ident < threshold):
@@ -146,7 +168,6 @@ class CGEFinder():
exclude_reasons.append(sbjct_ident)
if(exclude_reasons):
- # kma_results[db]["excluded"][hit] = exclude_reasons
kma_results["excluded"][hit] = exclude_reasons
kma_results[db][hit] = dict()
@@ -171,7 +192,7 @@ class CGEFinder():
# Open align file
with open(align_filename, "r") as align_file:
- hit_no = dict()
+ gene_aln_count = {}
gene = ""
# Parse through alignments
for line in align_file:
@@ -183,17 +204,13 @@ class CGEFinder():
if line.startswith("#"):
gene = line[1:].strip()
- if gene not in hit_no:
- hit_no[gene] = str(1)
- else:
- hit_no[gene] += str(int(hit_no[gene]) + 1)
-
+ gene_count = gene_aln_count.get(gene, 0)
+ gene_aln_count[gene] = gene_count + 1
else:
- # Check if gene one of the user specified genes
- if hit_no[gene] == '1':
+ if(gene_aln_count[gene] == 1):
hit = gene
else:
- hit = gene + "_" + hit_no[gene]
+ hit = "{}_{}".format(gene, gene_aln_count[gene])
if hit in kma_results[db]:
line_data = line.split("\t")[-1].strip()
@@ -209,7 +226,7 @@ class CGEFinder():
else:
print(hit + " not in results: ", kma_results)
- # concatinate all sequences lists and find subject start
+ # concatinate all sequence lists and find subject start
# and subject end
gene_align_sbjct[db] = {}
@@ -217,8 +234,6 @@ class CGEFinder():
gene_align_homo[db] = {}
for hit in kma_results[db]:
- # if(hit == "excluded"):
- # continue
align_sbjct = "".join(kma_results[db][hit]['sbjct_string'])
align_query = "".join(kma_results[db][hit]['query_string'])
align_homo = "".join(kma_results[db][hit]['homo_string'])
View it on GitLab: https://salsa.debian.org/med-team/python-cgecore/commit/422e7b5eb5e98a546ffa7d7c6c19f145a39b3ac8
--
View it on GitLab: https://salsa.debian.org/med-team/python-cgecore/commit/422e7b5eb5e98a546ffa7d7c6c19f145a39b3ac8
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200106/00f72c8f/attachment-0001.html>
More information about the debian-med-commit
mailing list