[med-svn] [Git][med-team/resfinder][upstream] New upstream version 3.2
Andreas Tille
gitlab at salsa.debian.org
Thu Nov 7 11:47:43 GMT 2019
Andreas Tille pushed to branch upstream at Debian Med / resfinder
Commits:
ef10b474 by Andreas Tille at 2019-11-07T11:41:12Z
New upstream version 3.2
- - - - -
2 changed files:
- README.md
- resfinder.py
Changes:
=====================================
README.md
=====================================
@@ -66,7 +66,7 @@ You can run resfinder command line using python3
# Example of running resfinder
python3 resfinder.py -i test.fsa -o . -p /path/to/resfinder_db \
--b /path/to/blastn -d aminoglycoside -k 90.00 -l 0.60
+-b /path/to/blastn -d aminoglycoside -t 0.90 -l 0.60
# The program can be invoked with the -h option
Usage: resfinder.py [-h] [-i INPUTFILE] [-1 FASTQ1] [-2 FASTQ2] [-o OUT_PATH]
@@ -103,6 +103,10 @@ optional arguments:
-t THRESHOLD, --threshold THRESHOLD
Blast threshold for identity
default minimum 0.9
+ -matrix, --matrix
+ If used, gives the counts all all called bases at each position
+ in each mapped template. Columns are: reference base,
+ A count, C count, G count, T count, N count, - count.
```
### Web-server
=====================================
resfinder.py
=====================================
@@ -9,7 +9,10 @@ import subprocess
from argparse import ArgumentParser
from tabulate import tabulate
import collections
+import pprint
+import json
+from distutils.spawn import find_executable
from cgecore.blaster import Blaster
from cgecore.cgefinder import CGEFinder
@@ -28,6 +31,7 @@ class ResFinder(CGEFinder):
self.db_path_kma = db_path_kma
self.configured_dbs = dict()
+ self.description_dbs = dict()
self.kma_db_files = None
self.load_db_config(db_conf_file=db_conf_file)
@@ -36,288 +40,311 @@ class ResFinder(CGEFinder):
self.phenos = dict()
self.load_notes(notes=notes)
-
self.blast_results = None
# self.kma_results = None
# self.results = None
- def write_results(self, out_path, result, res_type):
- """
- """
- if(res_type == ResFinder.TYPE_BLAST):
- result_str = self.results_to_str(res_type=res_type,
- results=result.results,
- query_align=result.gene_align_query,
- homo_align=result.gene_align_homo,
- sbjct_align=result.gene_align_sbjct)
- elif(res_type == ResFinder.TYPE_KMA):
- result_str = self.results_to_str(res_type=res_type,
- results=result)
-
- with open(out_path + "/results_tab.txt", "w") as fh:
- fh.write(result_str[0])
- with open(out_path + "/results_table.txt", "w") as fh:
- fh.write(result_str[1])
- with open(out_path + "/results.txt", "w") as fh:
- fh.write(result_str[2])
- with open(out_path + "/Resistance_gene_seq.fsa", "w") as fh:
- fh.write(result_str[3])
- with open(out_path + "/Hit_in_genome_seq.fsa", "w") as fh:
- fh.write(result_str[4])
def blast(self, inputfile, out_path, min_cov=0.9, threshold=0.6,
- blast="blastn"):
+ blast="blastn", allowed_overlap=0):
"""
"""
blast_run = Blaster(inputfile=inputfile, databases=self.databases,
db_path=self.db_path, out_path=out_path,
- min_cov=min_cov, threshold=threshold, blast=blast)
+ min_cov=min_cov, threshold=threshold, blast=blast,
+ allowed_overlap=allowed_overlap)
self.blast_results = blast_run.results
return blast_run
- def results_to_str(self, res_type, results, query_align=None,
- homo_align=None, sbjct_align=None):
+ def create_results(self,results_method,outdir):
- if(res_type != ResFinder.TYPE_BLAST and res_type != ResFinder.TYPE_KMA):
- sys.exit("resfinder.py error: result method was not provided or not "
- "recognized.")
+ results = results_method.results
+ query_aligns = results_method.gene_align_query
+ homo_aligns = results_method.gene_align_homo
+ sbjct_aligns = results_method.gene_align_sbjct
+ json_results = dict()
- # Write the header for the tab file
- tab_str = ("Resistance gene\tIdentity\tAlignment Length/Gene Length\t"
- "Coverage\tPosition in reference\tContig\t"
- "Position in contig\tPhenotype\tAccession no.\n")
-
- table_str = ""
- txt_str = ""
- ref_str = ""
- hit_str = ""
-
- # Getting and writing out the results
titles = dict()
rows = dict()
headers = dict()
txt_file_seq_text = dict()
split_print = collections.defaultdict(list)
+ hits = []
for db in results:
- if(db == "excluded"):
- continue
-
- # Clean up dbs with only excluded hits
- no_hits = True
- for hit in results[db]:
- if(hit not in results["excluded"]):
- print("hit not excluded: " + str(hit))
- print("\tIn DB: " + str(db))
- no_hits = False
- break
- if(no_hits):
- print("No hits in: " + str(db))
- results[db] = "No hit found"
-
- profile = str(self.configured_dbs[db][0])
- if results[db] == "No hit found":
- table_str += ("%s\n%s\n\n" % (profile, results[db]))
- else:
- titles[db] = "%s" % (profile)
- headers[db] = ["Resistance gene", "Identity",
- "Alignment Length/Gene Length", "Coverage",
- "Position in reference", "Contig",
- "Position in contig", "Phenotype",
- "Accession no."]
- table_str += ("%s\n" % (profile))
- table_str += ("Resistance gene\tIdentity\t"
- "Alignment Length/Gene Length\tCoverage\t"
- "Position in reference\tContig\tPosition in contig\t"
- "Phenotype\tAccession no.\n")
-
- rows[db] = list()
- txt_file_seq_text[db] = list()
-
- for hit in results[db]:
- if(hit in results["excluded"]):
+ contig_res = {}
+ if db == 'excluded':
+ continue
+ db_name = str(db).capitalize()
+ if db_name not in json_results:
+ json_results[db_name] = {}
+ if db not in json_results[db_name]:
+ json_results[db_name][db] = {}
+ if results[db] == "No hit found":
+ json_results[db_name][db] = "No hit found"
+ else:
+ for contig_id, hit in results[db].items():
+ identity = float(hit["perc_ident"])
+ coverage = float(hit["perc_coverage"])
+
+ # Skip hits below coverage
+ if coverage < (min_cov * 100) or identity < (threshold * 100):
+ continue
+
+ bit_score = identity * coverage
+
+ if contig_id not in contig_res:
+ contig_res[contig_id] = []
+ contig_res[contig_id].append([hit["query_start"], hit["query_end"],
+ bit_score, hit])
+
+ # Check for overlapping hits, only report the best
+ for contig_id, hit_lsts in contig_res.items():
+
+ hit_lsts.sort(key=lambda x: x[0])
+ hits = [hit[3] for hit in hit_lsts]
+
+ for hit in hits:
+
+ header = hit["sbjct_header"]
+ tmp = header.split("_")
+ try:
+ gene = tmp[0]
+ note = tmp[1]
+ acc = "_".join(tmp[2:])
+ except IndexError:
+ gene = ":".join(tmp)
+ note = ""
+ acc = ""
+ try:
+ variant = tmp[3]
+ except IndexError:
+ variant = ""
+
+ identity = hit["perc_ident"]
+ coverage = hit["perc_coverage"]
+ sbj_length = hit["sbjct_length"]
+ HSP = hit["HSP_length"]
+ positions_contig = "%s..%s" % (hit["query_start"],
+ hit["query_end"])
+ positions_ref = "%s..%s" % (hit["sbjct_start"], hit["sbjct_end"])
+ contig_name = hit["contig_name"]
+ pheno = self.phenos.get(gene, "Warning: gene is missing from "
+ "Notes file. Please inform curator.")
+ pheno = pheno.strip()
+
+ # Write JSON results dict
+ json_results[db_name][db].update({header: {}})
+ json_results[db_name][db][header] = {
+ "resistance_gene": gene,
+ "identity": round(identity, 2),
+ "HSP_length": HSP,
+ "template_length": sbj_length,
+ "position_in_ref": positions_ref,
+ "contig_name": contig_name,
+ "positions_in_contig": positions_contig,
+ "note": note,
+ "accession": acc,
+ "predicted_phenotype": pheno,
+ "coverage": round(coverage, 2),
+ "hit_id": contig_id}
+
+ # Get run info for JSON file
+ service = os.path.basename(__file__).replace(".py", "")
+ date = time.strftime("%d.%m.%Y")
+ time_ = time.strftime("%H:%M:%S")
+
+ # Make JSON output file
+ data = {service: {}}
+
+ userinput = {"filename(s)": args.inputfile,
+ "method": method,
+ "file_format": file_format}
+ run_info = {"date": date, "time": time_}
+
+ data[service]["user_input"] = userinput
+ data[service]["run_info"] = run_info
+ data[service]["results"] = json_results
+
+ pprint.pprint(data)
+
+ # Save json output
+ result_file = "{}/data_resfinder.json".format(tmp_dir)
+ with open(result_file, "w") as outfile:
+ json.dump(data, outfile)
+
+ # Getting and writing out the results
+ header = ["Resistance gene", "Identity", "Query / Template length", "Contig",
+ "Position in contig", "Predicted phenotype", "Accession number"]
+
+ if args.extented_output:
+ # Define extented output
+ table_filename = "{}/results_tab.tsv".format(outdir)
+ query_filename = "{}/Hit_in_genome_seq.fsa".format(outdir)
+ sbjct_filename = "{}/Resistance_genes.fsa".format(outdir)
+ result_filename = "{}/results.txt".format(outdir)
+ table_file = open(table_filename, "w")
+ query_file = open(query_filename, "w")
+ sbjct_file = open(sbjct_filename, "w")
+ result_file = open(result_filename, "w")
+ # Make results file
+ result_file.write("{} Results\n\nAntibiotic(s): {}\n\n"
+ .format(service, ",".join(self.configured_dbs)))
+ # Write tsv table
+ ## TODO##
+ rows = [["Database"] + header]
+ for species, dbs_info in json_results.items():
+ for db_name, db_hits in dbs_info.items():
+ result_file.write("*" * len("\t".join(header)) + "\n")
+ result_file.write(db_name.capitalize() + "\n")
+ db_rows = []
+
+ # Check it hits are found
+ if isinstance(db_hits, str):
+ content = [''] * len(header)
+ content[int(len(header) / 2)] = db_hits
+ print([content])
+ exit()
+ result_file.write(ResFinder.text_table(header, [content]) + "\n")
+ continue
+
+ for gene_id, gene_info in sorted(
+ db_hits.items(),
+ key=lambda x: (x[1]['resistance_gene'],
+ x[1]['accession'])):
+
+ res_gene = gene_info["resistance_gene"]
+ identity = str(gene_info["identity"])
+ coverage = str(gene_info["coverage"])
+
+ template_HSP = (
+ "{hsp_len} / {template_len}".
+ format(hsp_len=gene_info["HSP_length"],
+ template_len=gene_info["template_length"]))
+
+ position_in_ref = gene_info["position_in_ref"]
+ position_in_contig = gene_info["positions_in_contig"]
+ predicted_phenotype = gene_info["predicted_phenotype"]
+ acc = gene_info["accession"]
+ contig_name = gene_info["contig_name"]
+ print(contig_name)
+ exit()
+
+ # Add rows to result tables
+ db_rows.append([res_gene, identity, template_HSP, contig_name,
+ position_in_contig, predicted_phenotype, acc])
+ rows.append([db_name, res_gene, identity, template_HSP,
+ contig_name, position_in_contig, predicted_phenotype,
+ acc])
+
+ # Write query fasta output
+ hit_name = gene_info["hit_id"]
+ query_seq = query_aligns[db_name][hit_name]
+ sbjct_seq = sbjct_aligns[db_name][hit_name]
+
+ if coverage == 100 and identity == 100:
+ match = "PERFECT MATCH"
+ else:
+ match = "WARNING"
+ qry_header = (">{}:{} ID:{}% COV:{}% Best_match:{}\n"
+ .format(res_gene, match, identity, coverage,
+ gene_id))
+ query_file.write(qry_header)
+ for i in range(0, len(query_seq), 60):
+ query_file.write(query_seq[i:i + 60] + "\n")
+
+ # Write template fasta output
+ sbj_header = ">{}\n".format(gene_id)
+ sbjct_file.write(sbj_header)
+ for i in range(0, len(sbjct_seq), 60):
+ sbjct_file.write(sbjct_seq[i:i + 60] + "\n")
+
+ # Write db results tables in results file and table file
+ print(db_rows)
+ exit()
+ result_file.write(ResFinder.text_table(header, db_rows) + "\n")
+
+ result_file.write("\n")
+
+ for row in rows:
+ table_file.write("\t".join(row) + "\n")
+
+ # Write allignment output
+ result_file.write("\n\nExtended Output:\n\n")
+ self.make_aln(result_file, json_results, query_aligns, homo_aligns,
+ sbjct_aligns)
+
+ # Close all files
+
+ query_file.close()
+ sbjct_file.close()
+ table_file.close()
+ result_file.close()
+ if args.quiet:
+ f.close()
+
+ def make_aln(self,file_handle, json_data, query_aligns, homol_aligns, sbjct_aligns):
+ for dbs_info in json_data.values():
+ for db_name, db_info in dbs_info.items():
+ if isinstance(db_info, str):
continue
- res_header = results[db][hit]["sbjct_header"]
- tmp = res_header.split("_")
- gene = tmp[0]
- acc = "_".join(tmp[2:])
- ID = results[db][hit]["perc_ident"]
- coverage = results[db][hit]["perc_coverage"]
- sbjt_length = results[db][hit]["sbjct_length"]
- HSP = results[db][hit]["HSP_length"]
- positions_contig = "%s..%s" % (results[db][hit]["query_start"],
- results[db][hit]["query_end"])
- positions_ref = "%s..%s" % (results[db][hit]["sbjct_start"],
- results[db][hit]["sbjct_end"])
- contig_name = results[db][hit]["contig_name"]
- pheno = self.phenos.get(gene, "Warning: gene is missing from "
- "Notes file. Please inform curator.")
+ for gene_id, gene_info in sorted(
+ db_info.items(),
+ key=lambda x: (x[1]['resistance_gene'],
+ x[1]['accession'])):
+
+ seq_name = ("{gene}_{acc}"
+ .format(gene=gene_info["resistance_gene"],
+ acc=gene_info["accession"]))
+ hit_name = gene_info["hit_id"]
+
+ seqs = ["", "", ""]
+ seqs[0] = sbjct_aligns[db_name][hit_name]
+ seqs[1] = homol_aligns[db_name][hit_name]
+ seqs[2] = query_aligns[db_name][hit_name]
+
+ self.write_align(seqs, seq_name, file_handle)
+
+ def write_align(self,seq, seq_name, file_handle):
+ file_handle.write("# {}".format(seq_name) + "\n")
+ sbjct_seq = seq[0]
+ homol_seq = seq[1]
+ query_seq = seq[2]
+ for i in range(0, len(sbjct_seq), 60):
+ file_handle.write("%-10s\t%s\n" % ("template:", sbjct_seq[i:i + 60]))
+ file_handle.write("%-10s\t%s\n" % ("", homol_seq[i:i + 60]))
+ file_handle.write("%-10s\t%s\n\n" % ("query:", query_seq[i:i + 60]))
- pheno = pheno.strip()
-
- if "split_length" in results[db][hit]:
- total_HSP = results[db][hit]["split_length"]
- split_print[res_header].append([gene, ID, total_HSP,
- sbjt_length, coverage,
- positions_ref, contig_name,
- positions_contig, pheno,
- acc])
- tab_str += ("%s\t%s\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
- % (gene, ID, HSP, sbjt_length, coverage,
- positions_ref, contig_name, positions_contig,
- pheno, acc)
- )
- else:
- # Write tabels
- table_str += ("%s\t%.2f\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
- % (gene, ID, HSP, sbjt_length, coverage,
- positions_ref, contig_name,
- positions_contig, pheno, acc)
- )
- tab_str += ("%s\t%.2f\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
- % (gene, ID, HSP, sbjt_length, coverage,
- positions_ref, contig_name, positions_contig,
- pheno, acc)
- )
-
- # Saving the output to write the txt result table
- hsp_length = "%s/%s" % (HSP, sbjt_length)
- rows[db].append([gene, ID, hsp_length, coverage,
- positions_ref, contig_name, positions_contig,
- pheno, acc])
-
- # Writing subjet/ref sequence
- if(res_type == ResFinder.TYPE_BLAST):
- ref_seq = sbjct_align[db][hit]
- elif(res_type == ResFinder.TYPE_KMA):
- ref_seq = results[db][hit]["sbjct_string"]
-
- ref_str += (">%s_%s\n" % (gene, acc))
- for i in range(0, len(ref_seq), 60):
- ref_str += ("%s\n" % (ref_seq[i:i + 60]))
-
- # Getting the header and text for the txt file output
- sbjct_start = results[db][hit]["sbjct_start"]
- sbjct_end = results[db][hit]["sbjct_end"]
- text = ("%s, ID: %.2f %%, Alignment Length/Gene Length: %s/%s, "
- "Coverage: %s, "
- "Positions in reference: %s..%s, Contig name: %s, "
- "Position: %s" % (gene, ID, HSP, sbjt_length, coverage,
- sbjct_start, sbjct_end, contig_name,
- positions_contig))
- hit_str += (">%s\n" % text)
-
- # Writing query/hit sequence
- if(res_type == ResFinder.TYPE_BLAST):
- hit_seq = query_align[db][hit]
- elif(res_type == ResFinder.TYPE_KMA):
- hit_seq = results[db][hit]["query_string"]
-
- for i in range(0, len(hit_seq), 60):
- hit_str += ("%s\n" % (hit_seq[i:i + 60]))
-
- # Saving the output to print the txt result file allignemts
- if(res_type == ResFinder.TYPE_BLAST):
- txt_file_seq_text[db].append((text, ref_seq,
- homo_align[db][hit], hit_seq))
- elif(res_type == ResFinder.TYPE_KMA):
- txt_file_seq_text[db].append((text, ref_seq,
- results[db][hit]["homo_string"],
- hit_seq))
-
- for res in split_print:
- gene = split_print[res][0][0]
- ID = split_print[res][0][1]
- HSP = split_print[res][0][2]
- sbjt_length = split_print[res][0][3]
- coverage = split_print[res][0][4]
- positions_ref = split_print[res][0][5]
- contig_name = split_print[res][0][6]
- positions_contig = split_print[res][0][7]
- pheno = split_print[res][0][8]
- acc = split_print[res][0][9]
-
- total_coverage = 0
-
- for i in range(1, len(split_print[res])):
- ID = "%s, %.2f" % (ID, split_print[res][i][1])
- total_coverage += split_print[res][i][4]
- positions_ref = positions_ref + ", " + split_print[res][i][5]
- contig_name = contig_name + ", " + split_print[res][i][6]
- positions_contig = (positions_contig + ", "
- + split_print[res][i][7])
-
- table_str += ("%s\t%s\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
- % (gene, ID, HSP, sbjt_length, coverage,
- positions_ref, contig_name, positions_contig,
- pheno, acc)
- )
-
- hsp_length = "%s/%s" % (HSP, sbjt_length)
-
- rows[db].append([gene, ID, hsp_length, coverage, positions_ref,
- contig_name, positions_contig, pheno, acc])
-
- table_str += ("\n")
-
- # Writing table txt for all hits
- for db in titles:
- # Txt file table
- table = ResFinder.text_table(titles[db], headers[db], rows[db])
- txt_str += table
-
- # Writing alignment txt for all hits
- for db in titles:
- # Txt file alignments
- txt_str += ("##################### %s #####################\n"
- % (db))
- for text in txt_file_seq_text[db]:
- txt_str += ("%s\n\n" % (text[0]))
- for i in range(0, len(text[1]), 60):
- txt_str += ("%s\n" % (text[1][i:i + 60]))
- txt_str += ("%s\n" % (text[2][i:i + 60]))
- txt_str += ("%s\n\n" % (text[3][i:i + 60]))
- txt_str += ("\n")
-
- return (tab_str, table_str, txt_str, ref_str, hit_str)
@staticmethod
- def text_table(title, headers, rows, table_format='psql'):
- ''' Create text table
-
- USAGE:
- >>> from tabulate import tabulate
- >>> title = 'My Title'
- >>> headers = ['A','B']
- >>> rows = [[1,2],[3,4]]
- >>> print(text_table(title, headers, rows))
- +-----------+
- | My Title |
- +-----+-----+
- | A | B |
- +=====+=====+
- | 1 | 2 |
- | 3 | 4 |
- +-----+-----+
- '''
- # Create table
- table = tabulate(rows, headers, tablefmt=table_format)
- # Prepare title injection
- width = len(table.split('\n')[0])
- tlen = len(title)
- if tlen + 4 > width:
- # Truncate oversized titles
- tlen = width - 4
- title = title[:tlen]
- spaces = width - 2 - tlen
- left_spacer = ' ' * int(spaces / 2)
- right_spacer = ' ' * (spaces - len(left_spacer))
- # Update table with title
- table = '\n'.join(['+%s+' % ('-' * (width - 2)),
- '|%s%s%s|' % (left_spacer, title, right_spacer),
- table, '\n'])
- return table
+ def text_table(headers, rows, empty_replace='-'):
+ ''' Create text table
+
+ USAGE:
+ >>> from tabulate import tabulate
+ >>> headers = ['A','B']
+ >>> rows = [[1,2],[3,4]]
+ >>> print(text_table(headers, rows))
+ **********
+ A B
+ **********
+ 1 2
+ 3 4
+ ==========
+ '''
+ # Replace empty cells with placeholder
+ rows = map(lambda row: map(lambda x: x if x else empty_replace, row), rows)
+ # Create table
+ table = tabulate(rows, headers, tablefmt='simple').split('\n')
+ # Prepare title injection
+ width = len(table[0])
+ # Switch horisontal line
+ table[1] = '*' * (width + 2)
+ # Update table with title
+ table = (("%s\n" * 3)
+ % ('*' * (width + 2), '\n'.join(table), '=' * (width + 2)))
+ return table
def load_notes(self, notes):
with open(notes, 'r') as f:
@@ -390,6 +417,7 @@ class ResFinder(CGEFinder):
if db_prefix not in self.configured_dbs:
self.configured_dbs[db_prefix] = []
self.configured_dbs[db_prefix].append(name)
+ self.description_dbs[db_prefix] = description
if len(self.configured_dbs) == 0:
sys.exit("Input Error: No databases were found in the "
@@ -401,6 +429,52 @@ class ResFinder(CGEFinder):
self.kma_db_files = [kma_db + ".b", kma_db + ".length.b",
kma_db + ".name.b", kma_db + ".align.b"]
+def is_gzipped(file_path):
+ ''' Returns True if file is gzipped and False otherwise.
+ The result is inferred from the first two bits in the file read
+ from the input path.
+ On unix systems this should be: 1f 8b
+ Theoretically there could be exceptions to this test but it is
+ unlikely and impossible if the input files are otherwise expected
+ to be encoded in utf-8.
+ '''
+ with open(file_path, mode='rb') as fh:
+ bit_start = fh.read(2)
+ if(bit_start == b'\x1f\x8b'):
+ return True
+ else:
+ return False
+
+def get_file_format(input_files):
+ """
+ Takes all input files and checks their first character to assess
+ the file format. Returns one of the following strings; fasta, fastq,
+ other or mixed. fasta and fastq indicates that all input files are
+ of the same format, either fasta or fastq. other indiates that all
+ files are not fasta nor fastq files. mixed indicates that the inputfiles
+ are a mix of different file formats.
+ """
+ # Open all input files and get the first character
+ file_format = []
+ invalid_files = []
+ for infile in input_files:
+ if is_gzipped(infile):
+ f = gzip.open(infile, "rb")
+ fst_char = f.read(1)
+ else:
+ f = open(infile, "rb")
+ fst_char = f.read(1)
+ f.close()
+ # Assess the first character
+ if fst_char == b"@":
+ file_format.append("fastq")
+ elif fst_char == b">":
+ file_format.append("fasta")
+ else:
+ invalid_files.append("other")
+ if len(set(file_format)) != 1:
+ return "mixed"
+ return ",".join(set(file_format))
if __name__ == '__main__':
@@ -411,41 +485,26 @@ if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument("-i", "--inputfile",
dest="inputfile",
- help="Input file",
- default=None)
- parser.add_argument("-1", "--fastq1",
- help="Raw read data file 1.",
- default=None)
- parser.add_argument("-2", "--fastq2",
- help="Raw read data file 2 (only required if data is \
- paired-end).",
- default=None)
+ help="FASTA or FASTQ input files.",
+ nargs="+",
+ required=True)
parser.add_argument("-o", "--outputPath",
dest="out_path",
help="Path to blast output",
- default='')
+ default='.')
+ parser.add_argument("-tmp", "--tmp_dir",
+ help=("Temporary directory for storage of the results "
+ "from the external software."))
+
+ parser.add_argument("-mp", "--methodPath",
+ dest="method_path",
+ help="Path to method to use (kma or blastn)")
- parser.add_argument("-b", "--blastPath",
- dest="blast_path",
- help="Path to blast",
- default='blastn')
parser.add_argument("-p", "--databasePath",
dest="db_path",
help="Path to the databases",
default='')
- parser.add_argument("-k", "--kmaPath",
- dest="kma_path",
- help="Path to KMA",
- default=None)
- parser.add_argument("-q", "--databasePathKMA",
- dest="db_path_kma",
- help="Path to the directories containing the KMA \
- indexed databases. Defaults to the directory \
- 'kma_indexing' inside the databasePath \
- directory.",
- default=None)
-
parser.add_argument("-d", "--databases",
dest="databases",
help="Databases chosen to search in - if none are \
@@ -459,16 +518,45 @@ if __name__ == '__main__':
dest="threshold",
help="Blast threshold for identity",
default=0.90)
+ parser.add_argument("-ao", "--acq_overlap",
+ help="Genes are allowed to overlap this number of\
+ nucleotides. Default: 30.",
+ type=int,
+ default=30)
+ parser.add_argument("-matrix", "--matrix",
+ help="Gives the counts all all called bases at each\
+ position in each mapped template. Columns are: reference\
+ base, A count, C count, G count, T count, N count,\
+ - count.",
+ dest="kma_matrix",
+ action='store_true',
+ default=False)
+ parser.add_argument("-x", "--extented_output",
+ help=("Give extented output with allignment files, "
+ "template and query hits in fasta and a tab "
+ "seperated file with gene profile results"),
+ action="store_true")
+ parser.add_argument("-q", "--quiet",
+ action="store_true",
+ default=False)
+
args = parser.parse_args()
##########################################################################
# MAIN
##########################################################################
+ if args.quiet:
+ f = open('/dev/null', 'w')
+ sys.stdout = f
+
# Defining varibales
- min_cov = args.min_cov
- threshold = args.threshold
+ min_cov = float(args.min_cov)
+ threshold = float(args.threshold)
+ method_path = args.method_path
+
+
# Check if valid database is provided
if args.db_path is None:
@@ -489,37 +577,17 @@ if __name__ == '__main__':
if not os.path.exists(notes_path):
sys.exit('Input Error: notes.txt not found! (%s)' % (notes_path))
- # Check for input
- if args.inputfile is None and args.fastq1 is None:
- sys.exit("Input Error: No Input were provided!\n")
-
- # Check if valid input file for assembly is provided
- if args.inputfile:
- if not os.path.exists(args.inputfile):
- sys.exit("Input Error: Input file does not exist!\n")
- else:
- inputfile = args.inputfile
- else:
- inputfile = None
-
- # Check if valid input files for raw data
- if args.fastq1:
-
- if not os.path.exists(args.fastq1):
- sys.exit("Input Error: fastq1 file does not exist!\n")
- else:
- input_fastq1 = args.fastq1
-
- if args.fastq2:
- if not os.path.exists(args.fastq2):
- sys.exit("Input Error: fastq2 file does not exist!\n")
- else:
- input_fastq2 = args.fastq2
- else:
- input_fastq2 = None
+ # Check if valid input files are provided
+ if args.inputfile is None:
+ sys.exit("Input Error: No input file was provided!\n")
+ elif not os.path.exists(args.inputfile[0]):
+ sys.exit("Input Error: Input file does not exist!\n")
+ elif len(args.inputfile) > 1:
+ if not os.path.exists(args.inputfile[1]):
+ sys.exit("Input Error: Input file does not exist!\n")
+ infile = args.inputfile
else:
- input_fastq1 = None
- input_fastq2 = None
+ infile = args.inputfile
# Check if valid output directory is provided
if not os.path.exists(args.out_path):
@@ -527,26 +595,76 @@ if __name__ == '__main__':
else:
out_path = args.out_path
- # If input is an assembly, then use BLAST
- if(inputfile is not None):
- finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
- db_path=db_path, notes=notes_path)
-
- blast_run = finder.blast(inputfile=inputfile, out_path=out_path,
- min_cov=min_cov, threshold=threshold,
- blast=args.blast_path)
-
- finder.write_results(out_path=out_path, result=blast_run,
- res_type=ResFinder.TYPE_BLAST)
-
- # If the input is raw read data, then use KMA
- elif(input_fastq1 is not None):
- finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
- db_path=db_path, db_path_kma=args.db_path_kma,
- notes=notes_path)
-
- # if input_fastq2 is None, it is ignored by the kma method.
- kma_run = finder.kma(inputfile_1=input_fastq1, inputfile_2=input_fastq2)
-
- finder.write_results(out_path=out_path, result=kma_run,
- res_type=ResFinder.TYPE_KMA)
+ # Add kma_matrix
+ if (args.kma_matrix):
+ extra_args = "-matrix"
+ else:
+ extra_args = None
+
+ # Check if valid tmp directory is provided
+ if args.tmp_dir:
+ if not os.path.exists(args.tmp_dir):
+ sys.exit("Input Error: Tmp dirctory, {}, does not exist!\n"
+ .format(args.tmp_dir))
+ else:
+ tmp_dir = os.path.abspath(args.tmp_dir)
+ else:
+ tmp_dir = out_path
+
+ file_format = get_file_format(infile)
+
+ if file_format == "fastq":
+ if not method_path:
+ method_path = "kma"
+ if find_executable(method_path) is None:
+ sys.exit("No valid path to a kma program was provided. Use the -mp "
+ "flag to provide the path.")
+
+ # Check the number of files
+ if len(infile) == 1:
+ infile_1 = infile[0]
+ infile_2 = None
+ elif len(infile) == 2:
+ infile_1 = infile[0]
+ infile_2 = infile[1]
+ else:
+ sys.exit("Only 2 input file accepted for raw read data,\
+ if data from more runs is avaliable for the same\
+ sample, please concatinate the reads into two files")
+
+
+ sample_name = os.path.basename(sorted(args.inputfile)[0])
+ method = "kma"
+ finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
+ db_path=db_path, db_path_kma=args.db_path,
+ notes=notes_path)
+ # if input_fastq2 is None, it is ignored by the kma method.
+ kma_run = finder.kma(inputfile_1=infile_1, inputfile_2=infile_2,
+ out_path=tmp_dir, databases=finder.databases,
+ db_path_kma=finder.db_path_kma,
+ min_cov=min_cov,threshold=threshold,
+ kma_path=method_path,
+ kma_add_args=extra_args)
+
+ finder.create_results(results_method=kma_run,outdir=out_path)
+
+
+ elif file_format == "fasta":
+ if not method_path:
+ method_path = "blastn"
+ if find_executable(method_path) is None:
+ sys.exit("No valid path to a blastn program was provided. Use the "
+ "-mp flag to provide the path.")
+
+ # Assert that only one fasta file is inputted
+ assert len(infile) == 1, "Only one input file accepted for assembled data"
+ infile = infile[0]
+ method = "blast"
+ finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
+ db_path=db_path, notes=notes_path)
+
+ blast_run = finder.blast(inputfile=infile, out_path=tmp_dir,
+ min_cov=min_cov, threshold=threshold,
+ blast=method_path,
+ allowed_overlap=args.acq_overlap)
+ finder.create_results(results_method=blast_run,outdir=out_path)
View it on GitLab: https://salsa.debian.org/med-team/resfinder/commit/ef10b47463acad3a6d4264c8968345ff8b83b27a
--
View it on GitLab: https://salsa.debian.org/med-team/resfinder/commit/ef10b47463acad3a6d4264c8968345ff8b83b27a
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191107/703cbdbd/attachment-0001.html>
More information about the debian-med-commit
mailing list