[med-svn] [Git][med-team/resfinder][upstream] New upstream version 3.2

Andreas Tille gitlab at salsa.debian.org
Thu Nov 7 11:47:43 GMT 2019



Andreas Tille pushed to branch upstream at Debian Med / resfinder


Commits:
ef10b474 by Andreas Tille at 2019-11-07T11:41:12Z
New upstream version 3.2
- - - - -


2 changed files:

- README.md
- resfinder.py


Changes:

=====================================
README.md
=====================================
@@ -66,7 +66,7 @@ You can run resfinder command line using python3
 
 # Example of running resfinder
 python3 resfinder.py -i test.fsa -o . -p /path/to/resfinder_db \
--b /path/to/blastn -d aminoglycoside -k 90.00 -l 0.60
+-b /path/to/blastn -d aminoglycoside -t 0.90 -l 0.60
 
 # The program can be invoked with the -h option 
 Usage: resfinder.py [-h] [-i INPUTFILE] [-1 FASTQ1] [-2 FASTQ2] [-o OUT_PATH]
@@ -103,6 +103,10 @@ optional arguments:
   -t THRESHOLD, --threshold THRESHOLD
                         Blast threshold for identity
                         default minimum 0.9 
+  -matrix, --matrix
+                        If used, gives the counts all all called bases at each position
+                        in each mapped template. Columns are: reference base,
+                        A count, C count, G count, T count, N count, - count.
 ```
 
 ### Web-server


=====================================
resfinder.py
=====================================
@@ -9,7 +9,10 @@ import subprocess
 from argparse import ArgumentParser
 from tabulate import tabulate
 import collections
+import pprint
+import json
 
+from distutils.spawn import find_executable
 from cgecore.blaster import Blaster
 from cgecore.cgefinder import CGEFinder
 
@@ -28,6 +31,7 @@ class ResFinder(CGEFinder):
          self.db_path_kma = db_path_kma
 
       self.configured_dbs = dict()
+      self.description_dbs = dict()
       self.kma_db_files = None
       self.load_db_config(db_conf_file=db_conf_file)
 
@@ -36,288 +40,311 @@ class ResFinder(CGEFinder):
 
       self.phenos = dict()
       self.load_notes(notes=notes)
-
       self.blast_results = None
       # self.kma_results = None
       # self.results = None
 
-   def write_results(self, out_path, result, res_type):
-      """
-      """
-      if(res_type == ResFinder.TYPE_BLAST):
-         result_str = self.results_to_str(res_type=res_type,
-                                          results=result.results,
-                                          query_align=result.gene_align_query,
-                                          homo_align=result.gene_align_homo,
-                                          sbjct_align=result.gene_align_sbjct)
-      elif(res_type == ResFinder.TYPE_KMA):
-         result_str = self.results_to_str(res_type=res_type,
-                                          results=result)
-
-      with open(out_path + "/results_tab.txt", "w") as fh:
-         fh.write(result_str[0])
-      with open(out_path + "/results_table.txt", "w") as fh:
-         fh.write(result_str[1])
-      with open(out_path + "/results.txt", "w") as fh:
-         fh.write(result_str[2])
-      with open(out_path + "/Resistance_gene_seq.fsa", "w") as fh:
-         fh.write(result_str[3])
-      with open(out_path + "/Hit_in_genome_seq.fsa", "w") as fh:
-         fh.write(result_str[4])
 
    def blast(self, inputfile, out_path, min_cov=0.9, threshold=0.6,
-             blast="blastn"):
+             blast="blastn", allowed_overlap=0):
       """
       """
       blast_run = Blaster(inputfile=inputfile, databases=self.databases,
                           db_path=self.db_path, out_path=out_path,
-                          min_cov=min_cov, threshold=threshold, blast=blast)
+                          min_cov=min_cov, threshold=threshold, blast=blast,
+                          allowed_overlap=allowed_overlap)
       self.blast_results = blast_run.results
       return blast_run
 
-   def results_to_str(self, res_type, results, query_align=None,
-                      homo_align=None, sbjct_align=None):
+   def create_results(self,results_method,outdir):
 
-      if(res_type != ResFinder.TYPE_BLAST and res_type != ResFinder.TYPE_KMA):
-         sys.exit("resfinder.py error: result method was not provided or not "
-                  "recognized.")
+      results = results_method.results
+      query_aligns = results_method.gene_align_query
+      homo_aligns = results_method.gene_align_homo
+      sbjct_aligns = results_method.gene_align_sbjct
+      json_results = dict()
 
-      # Write the header for the tab file
-      tab_str = ("Resistance gene\tIdentity\tAlignment Length/Gene Length\t"
-                 "Coverage\tPosition in reference\tContig\t"
-                 "Position in contig\tPhenotype\tAccession no.\n")
-
-      table_str = ""
-      txt_str = ""
-      ref_str = ""
-      hit_str = ""
-
-      # Getting and writing out the results
       titles = dict()
       rows = dict()
       headers = dict()
       txt_file_seq_text = dict()
       split_print = collections.defaultdict(list)
 
+      hits = []
       for db in results:
-         if(db == "excluded"):
-             continue
-
-         # Clean up dbs with only excluded hits
-         no_hits = True
-         for hit in results[db]:
-             if(hit not in results["excluded"]):
-                 print("hit not excluded: " + str(hit))
-                 print("\tIn DB: " + str(db))
-                 no_hits = False
-                 break
-         if(no_hits):
-             print("No hits in: " + str(db))
-             results[db] = "No hit found"
-
-         profile = str(self.configured_dbs[db][0])
-         if results[db] == "No hit found":
-            table_str += ("%s\n%s\n\n" % (profile, results[db]))
-         else:
-            titles[db] = "%s" % (profile)
-            headers[db] = ["Resistance gene", "Identity",
-                           "Alignment Length/Gene Length", "Coverage",
-                           "Position in reference", "Contig",
-                           "Position in contig", "Phenotype",
-                           "Accession no."]
-            table_str += ("%s\n" % (profile))
-            table_str += ("Resistance gene\tIdentity\t"
-                          "Alignment Length/Gene Length\tCoverage\t"
-                          "Position in reference\tContig\tPosition in contig\t"
-                          "Phenotype\tAccession no.\n")
-
-            rows[db] = list()
-            txt_file_seq_text[db] = list()
-
-            for hit in results[db]:
-               if(hit in results["excluded"]):
+          contig_res = {}
+          if db == 'excluded':
+              continue
+          db_name = str(db).capitalize()
+          if db_name not in json_results:
+              json_results[db_name] = {}
+          if db not in json_results[db_name]:
+              json_results[db_name][db] = {}
+          if results[db] == "No hit found":
+              json_results[db_name][db] = "No hit found"
+          else:
+              for contig_id, hit in results[db].items():
+                  identity = float(hit["perc_ident"])
+                  coverage = float(hit["perc_coverage"])
+
+                   # Skip hits below coverage
+                  if coverage < (min_cov * 100) or identity < (threshold * 100):
+                      continue
+
+                  bit_score = identity * coverage
+
+                  if contig_id not in contig_res:
+                      contig_res[contig_id] = []
+                  contig_res[contig_id].append([hit["query_start"], hit["query_end"],
+                                               bit_score, hit])
+
+          # Check for overlapping hits, only report the best
+          for contig_id, hit_lsts in contig_res.items():
+
+              hit_lsts.sort(key=lambda x: x[0])
+              hits = [hit[3] for hit in hit_lsts]
+
+              for hit in hits:
+
+                  header = hit["sbjct_header"]
+                  tmp = header.split("_")
+                  try:
+                      gene = tmp[0]
+                      note = tmp[1]
+                      acc = "_".join(tmp[2:])
+                  except IndexError:
+                      gene = ":".join(tmp)
+                      note = ""
+                      acc = ""
+                  try:
+                      variant = tmp[3]
+                  except IndexError:
+                      variant = ""
+
+                  identity = hit["perc_ident"]
+                  coverage = hit["perc_coverage"]
+                  sbj_length = hit["sbjct_length"]
+                  HSP = hit["HSP_length"]
+                  positions_contig = "%s..%s" % (hit["query_start"],
+                                                 hit["query_end"])
+                  positions_ref = "%s..%s" % (hit["sbjct_start"], hit["sbjct_end"])
+                  contig_name = hit["contig_name"]
+                  pheno = self.phenos.get(gene, "Warning: gene is missing from "
+                                       "Notes file. Please inform curator.")
+                  pheno = pheno.strip()
+
+                  # Write JSON results dict
+                  json_results[db_name][db].update({header: {}})
+                  json_results[db_name][db][header] = {
+                      "resistance_gene": gene,
+                      "identity": round(identity, 2),
+                      "HSP_length": HSP,
+                      "template_length": sbj_length,
+                      "position_in_ref": positions_ref,
+                      "contig_name": contig_name,
+                      "positions_in_contig": positions_contig,
+                      "note": note,
+                      "accession": acc,
+                      "predicted_phenotype": pheno,
+                      "coverage": round(coverage, 2),
+                      "hit_id": contig_id}
+
+      # Get run info for JSON file
+      service = os.path.basename(__file__).replace(".py", "")
+      date = time.strftime("%d.%m.%Y")
+      time_ = time.strftime("%H:%M:%S")
+
+      # Make JSON output file
+      data = {service: {}}
+
+      userinput = {"filename(s)": args.inputfile,
+                   "method": method,
+                   "file_format": file_format}
+      run_info = {"date": date, "time": time_}
+
+      data[service]["user_input"] = userinput
+      data[service]["run_info"] = run_info
+      data[service]["results"] = json_results
+
+      pprint.pprint(data)
+
+      # Save json output
+      result_file = "{}/data_resfinder.json".format(tmp_dir)
+      with open(result_file, "w") as outfile:
+          json.dump(data, outfile)
+
+      # Getting and writing out the results
+      header = ["Resistance gene", "Identity", "Query / Template length", "Contig",
+                "Position in contig", "Predicted phenotype", "Accession number"]
+
+      if args.extented_output:
+          # Define extented output
+          table_filename = "{}/results_tab.tsv".format(outdir)
+          query_filename = "{}/Hit_in_genome_seq.fsa".format(outdir)
+          sbjct_filename = "{}/Resistance_genes.fsa".format(outdir)
+          result_filename = "{}/results.txt".format(outdir)
+          table_file = open(table_filename, "w")
+          query_file = open(query_filename, "w")
+          sbjct_file = open(sbjct_filename, "w")
+          result_file = open(result_filename, "w")
+          # Make results file
+          result_file.write("{} Results\n\nAntibiotic(s): {}\n\n"
+                            .format(service, ",".join(self.configured_dbs)))
+          # Write tsv table
+          ## TODO##
+          rows = [["Database"] + header]
+          for species, dbs_info in json_results.items():
+              for db_name, db_hits in dbs_info.items():
+                  result_file.write("*" * len("\t".join(header)) + "\n")
+                  result_file.write(db_name.capitalize() + "\n")
+                  db_rows = []
+
+                  # Check it hits are found
+                  if isinstance(db_hits, str):
+                      content = [''] * len(header)
+                      content[int(len(header) / 2)] = db_hits
+                      print([content])
+                      exit()
+                      result_file.write(ResFinder.text_table(header, [content]) + "\n")
+                      continue
+
+                  for gene_id, gene_info in sorted(
+                          db_hits.items(),
+                          key=lambda x: (x[1]['resistance_gene'],
+                                         x[1]['accession'])):
+
+                      res_gene = gene_info["resistance_gene"]
+                      identity = str(gene_info["identity"])
+                      coverage = str(gene_info["coverage"])
+
+                      template_HSP = (
+                          "{hsp_len} / {template_len}".
+                          format(hsp_len=gene_info["HSP_length"],
+                                 template_len=gene_info["template_length"]))
+
+                      position_in_ref = gene_info["position_in_ref"]
+                      position_in_contig = gene_info["positions_in_contig"]
+                      predicted_phenotype = gene_info["predicted_phenotype"]
+                      acc = gene_info["accession"]
+                      contig_name = gene_info["contig_name"]
+                      print(contig_name)
+                      exit()
+
+                      # Add rows to result tables
+                      db_rows.append([res_gene, identity, template_HSP, contig_name,
+                                      position_in_contig, predicted_phenotype, acc])
+                      rows.append([db_name, res_gene, identity, template_HSP,
+                                   contig_name, position_in_contig, predicted_phenotype,
+                                   acc])
+
+                      # Write query fasta output
+                      hit_name = gene_info["hit_id"]
+                      query_seq = query_aligns[db_name][hit_name]
+                      sbjct_seq = sbjct_aligns[db_name][hit_name]
+
+                      if coverage == 100 and identity == 100:
+                          match = "PERFECT MATCH"
+                      else:
+                          match = "WARNING"
+                      qry_header = (">{}:{} ID:{}% COV:{}% Best_match:{}\n"
+                                    .format(res_gene, match, identity, coverage,
+                                            gene_id))
+                      query_file.write(qry_header)
+                      for i in range(0, len(query_seq), 60):
+                          query_file.write(query_seq[i:i + 60] + "\n")
+
+                      # Write template fasta output
+                      sbj_header = ">{}\n".format(gene_id)
+                      sbjct_file.write(sbj_header)
+                      for i in range(0, len(sbjct_seq), 60):
+                          sbjct_file.write(sbjct_seq[i:i + 60] + "\n")
+
+                  # Write db results tables in results file and table file
+                  print(db_rows)
+                  exit()
+                  result_file.write(ResFinder.text_table(header, db_rows) + "\n")
+
+              result_file.write("\n")
+
+          for row in rows:
+              table_file.write("\t".join(row) + "\n")
+
+          # Write allignment output
+          result_file.write("\n\nExtended Output:\n\n")
+          self.make_aln(result_file, json_results, query_aligns, homo_aligns,
+                   sbjct_aligns)
+
+          # Close all files
+
+          query_file.close()
+          sbjct_file.close()
+          table_file.close()
+          result_file.close()
+      if args.quiet:
+          f.close()
+
+   def make_aln(self,file_handle, json_data, query_aligns, homol_aligns, sbjct_aligns):
+       for dbs_info in json_data.values():
+           for db_name, db_info in dbs_info.items():
+               if isinstance(db_info, str):
                    continue
 
-               res_header = results[db][hit]["sbjct_header"]
-               tmp = res_header.split("_")
-               gene = tmp[0]
-               acc = "_".join(tmp[2:])
-               ID = results[db][hit]["perc_ident"]
-               coverage = results[db][hit]["perc_coverage"]
-               sbjt_length = results[db][hit]["sbjct_length"]
-               HSP = results[db][hit]["HSP_length"]
-               positions_contig = "%s..%s" % (results[db][hit]["query_start"],
-                                              results[db][hit]["query_end"])
-               positions_ref = "%s..%s" % (results[db][hit]["sbjct_start"],
-                                           results[db][hit]["sbjct_end"])
-               contig_name = results[db][hit]["contig_name"]
-               pheno = self.phenos.get(gene, "Warning: gene is missing from "
-                                       "Notes file. Please inform curator.")
+               for gene_id, gene_info in sorted(
+                       db_info.items(),
+                       key=lambda x: (x[1]['resistance_gene'],
+                                      x[1]['accession'])):
+
+                   seq_name = ("{gene}_{acc}"
+                               .format(gene=gene_info["resistance_gene"],
+                                       acc=gene_info["accession"]))
+                   hit_name = gene_info["hit_id"]
+
+                   seqs = ["", "", ""]
+                   seqs[0] = sbjct_aligns[db_name][hit_name]
+                   seqs[1] = homol_aligns[db_name][hit_name]
+                   seqs[2] = query_aligns[db_name][hit_name]
+
+                   self.write_align(seqs, seq_name, file_handle)
+
+   def write_align(self,seq, seq_name, file_handle):
+       file_handle.write("# {}".format(seq_name) + "\n")
+       sbjct_seq = seq[0]
+       homol_seq = seq[1]
+       query_seq = seq[2]
+       for i in range(0, len(sbjct_seq), 60):
+           file_handle.write("%-10s\t%s\n" % ("template:", sbjct_seq[i:i + 60]))
+           file_handle.write("%-10s\t%s\n" % ("", homol_seq[i:i + 60]))
+           file_handle.write("%-10s\t%s\n\n" % ("query:", query_seq[i:i + 60]))
 
-               pheno = pheno.strip()
-
-               if "split_length" in results[db][hit]:
-                  total_HSP = results[db][hit]["split_length"]
-                  split_print[res_header].append([gene, ID, total_HSP,
-                                                  sbjt_length, coverage,
-                                                  positions_ref, contig_name,
-                                                  positions_contig, pheno,
-                                                  acc])
-                  tab_str += ("%s\t%s\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
-                              % (gene, ID, HSP, sbjt_length, coverage,
-                                 positions_ref, contig_name, positions_contig,
-                                 pheno, acc)
-                              )
-               else:
-                  # Write tabels
-                  table_str += ("%s\t%.2f\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
-                                % (gene, ID, HSP, sbjt_length, coverage,
-                                   positions_ref, contig_name,
-                                   positions_contig, pheno, acc)
-                                )
-                  tab_str += ("%s\t%.2f\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
-                              % (gene, ID, HSP, sbjt_length, coverage,
-                                 positions_ref, contig_name, positions_contig,
-                                 pheno, acc)
-                              )
-
-                  # Saving the output to write the txt result table
-                  hsp_length = "%s/%s" % (HSP, sbjt_length)
-                  rows[db].append([gene, ID, hsp_length, coverage,
-                                  positions_ref, contig_name, positions_contig,
-                                  pheno, acc])
-
-               # Writing subjet/ref sequence
-               if(res_type == ResFinder.TYPE_BLAST):
-                  ref_seq = sbjct_align[db][hit]
-               elif(res_type == ResFinder.TYPE_KMA):
-                  ref_seq = results[db][hit]["sbjct_string"]
-
-               ref_str += (">%s_%s\n" % (gene, acc))
-               for i in range(0, len(ref_seq), 60):
-                  ref_str += ("%s\n" % (ref_seq[i:i + 60]))
-
-               # Getting the header and text for the txt file output
-               sbjct_start = results[db][hit]["sbjct_start"]
-               sbjct_end = results[db][hit]["sbjct_end"]
-               text = ("%s, ID: %.2f %%, Alignment Length/Gene Length: %s/%s, "
-                       "Coverage: %s, "
-                       "Positions in reference: %s..%s, Contig name: %s, "
-                       "Position: %s" % (gene, ID, HSP, sbjt_length, coverage,
-                                         sbjct_start, sbjct_end, contig_name,
-                                         positions_contig))
-               hit_str += (">%s\n" % text)
-
-               # Writing query/hit sequence
-               if(res_type == ResFinder.TYPE_BLAST):
-                  hit_seq = query_align[db][hit]
-               elif(res_type == ResFinder.TYPE_KMA):
-                  hit_seq = results[db][hit]["query_string"]
-
-               for i in range(0, len(hit_seq), 60):
-                  hit_str += ("%s\n" % (hit_seq[i:i + 60]))
-
-               # Saving the output to print the txt result file allignemts
-               if(res_type == ResFinder.TYPE_BLAST):
-                  txt_file_seq_text[db].append((text, ref_seq,
-                                               homo_align[db][hit], hit_seq))
-               elif(res_type == ResFinder.TYPE_KMA):
-                  txt_file_seq_text[db].append((text, ref_seq,
-                                               results[db][hit]["homo_string"],
-                                               hit_seq))
-
-            for res in split_print:
-               gene = split_print[res][0][0]
-               ID = split_print[res][0][1]
-               HSP = split_print[res][0][2]
-               sbjt_length = split_print[res][0][3]
-               coverage = split_print[res][0][4]
-               positions_ref = split_print[res][0][5]
-               contig_name = split_print[res][0][6]
-               positions_contig = split_print[res][0][7]
-               pheno = split_print[res][0][8]
-               acc = split_print[res][0][9]
-
-               total_coverage = 0
-
-               for i in range(1, len(split_print[res])):
-                  ID = "%s, %.2f" % (ID, split_print[res][i][1])
-                  total_coverage += split_print[res][i][4]
-                  positions_ref = positions_ref + ", " + split_print[res][i][5]
-                  contig_name = contig_name + ", " + split_print[res][i][6]
-                  positions_contig = (positions_contig + ", "
-                                      + split_print[res][i][7])
-
-               table_str += ("%s\t%s\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
-                             % (gene, ID, HSP, sbjt_length, coverage,
-                                positions_ref, contig_name, positions_contig,
-                                pheno, acc)
-                             )
-
-               hsp_length = "%s/%s" % (HSP, sbjt_length)
-
-               rows[db].append([gene, ID, hsp_length, coverage, positions_ref,
-                                contig_name, positions_contig, pheno, acc])
-
-            table_str += ("\n")
-
-      # Writing table txt for all hits
-      for db in titles:
-         # Txt file table
-         table = ResFinder.text_table(titles[db], headers[db], rows[db])
-         txt_str += table
-
-      # Writing alignment txt for all hits
-      for db in titles:
-         # Txt file alignments
-         txt_str += ("##################### %s #####################\n"
-                     % (db))
-         for text in txt_file_seq_text[db]:
-            txt_str += ("%s\n\n" % (text[0]))
-            for i in range(0, len(text[1]), 60):
-               txt_str += ("%s\n" % (text[1][i:i + 60]))
-               txt_str += ("%s\n" % (text[2][i:i + 60]))
-               txt_str += ("%s\n\n" % (text[3][i:i + 60]))
-            txt_str += ("\n")
-
-      return (tab_str, table_str, txt_str, ref_str, hit_str)
 
    @staticmethod
-   def text_table(title, headers, rows, table_format='psql'):
-      ''' Create text table
-
-      USAGE:
-         >>> from tabulate import tabulate
-         >>> title = 'My Title'
-         >>> headers = ['A','B']
-         >>> rows = [[1,2],[3,4]]
-         >>> print(text_table(title, headers, rows))
-         +-----------+
-         | My Title  |
-         +-----+-----+
-         |   A |   B |
-         +=====+=====+
-         |   1 |   2 |
-         |   3 |   4 |
-         +-----+-----+
-      '''
-      # Create table
-      table = tabulate(rows, headers, tablefmt=table_format)
-      # Prepare title injection
-      width = len(table.split('\n')[0])
-      tlen = len(title)
-      if tlen + 4 > width:
-         # Truncate oversized titles
-         tlen = width - 4
-         title = title[:tlen]
-      spaces = width - 2 - tlen
-      left_spacer = ' ' * int(spaces / 2)
-      right_spacer = ' ' * (spaces - len(left_spacer))
-      # Update table with title
-      table = '\n'.join(['+%s+' % ('-' * (width - 2)),
-                         '|%s%s%s|' % (left_spacer, title, right_spacer),
-                         table, '\n'])
-      return table
+   def text_table(headers, rows, empty_replace='-'):
+       ''' Create text table
+
+       USAGE:
+           >>> from tabulate import tabulate
+           >>> headers = ['A','B']
+           >>> rows = [[1,2],[3,4]]
+           >>> print(text_table(headers, rows))
+           **********
+             A     B
+           **********
+             1     2
+             3     4
+           ==========
+       '''
+       # Replace empty cells with placeholder
+       rows = map(lambda row: map(lambda x: x if x else empty_replace, row), rows)
+       # Create table
+       table = tabulate(rows, headers, tablefmt='simple').split('\n')
+       # Prepare title injection
+       width = len(table[0])
+       # Switch horisontal line
+       table[1] = '*' * (width + 2)
+       # Update table with title
+       table = (("%s\n" * 3)
+                % ('*' * (width + 2), '\n'.join(table), '=' * (width + 2)))
+       return table
 
    def load_notes(self, notes):
       with open(notes, 'r') as f:
@@ -390,6 +417,7 @@ class ResFinder(CGEFinder):
             if db_prefix not in self.configured_dbs:
                self.configured_dbs[db_prefix] = []
             self.configured_dbs[db_prefix].append(name)
+            self.description_dbs[db_prefix] = description
 
       if len(self.configured_dbs) == 0:
          sys.exit("Input Error: No databases were found in the "
@@ -401,6 +429,52 @@ class ResFinder(CGEFinder):
          self.kma_db_files = [kma_db + ".b", kma_db + ".length.b",
                               kma_db + ".name.b", kma_db + ".align.b"]
 
+def is_gzipped(file_path):
+    ''' Returns True if file is gzipped and False otherwise.
+         The result is inferred from the first two bits in the file read
+         from the input path.
+         On unix systems this should be: 1f 8b
+         Theoretically there could be exceptions to this test but it is
+         unlikely and impossible if the input files are otherwise expected
+         to be encoded in utf-8.
+    '''
+    with open(file_path, mode='rb') as fh:
+        bit_start = fh.read(2)
+    if(bit_start == b'\x1f\x8b'):
+        return True
+    else:
+        return False
+
+def get_file_format(input_files):
+    """
+    Takes all input files and checks their first character to assess
+    the file format. Returns one of the following strings; fasta, fastq,
+    other or mixed. fasta and fastq indicates that all input files are
+    of the same format, either fasta or fastq. other indiates that all
+    files are not fasta nor fastq files. mixed indicates that the inputfiles
+    are a mix of different file formats.
+    """
+    # Open all input files and get the first character
+    file_format = []
+    invalid_files = []
+    for infile in input_files:
+        if is_gzipped(infile):
+            f = gzip.open(infile, "rb")
+            fst_char = f.read(1)
+        else:
+            f = open(infile, "rb")
+            fst_char = f.read(1)
+        f.close()
+        # Assess the first character
+        if fst_char == b"@":
+            file_format.append("fastq")
+        elif fst_char == b">":
+            file_format.append("fasta")
+        else:
+            invalid_files.append("other")
+    if len(set(file_format)) != 1:
+        return "mixed"
+    return ",".join(set(file_format))
 
 if __name__ == '__main__':
 
@@ -411,41 +485,26 @@ if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("-i", "--inputfile",
                        dest="inputfile",
-                       help="Input file",
-                       default=None)
-   parser.add_argument("-1", "--fastq1",
-                       help="Raw read data file 1.",
-                       default=None)
-   parser.add_argument("-2", "--fastq2",
-                       help="Raw read data file 2 (only required if data is \
-                             paired-end).",
-                       default=None)
+                       help="FASTA or FASTQ input files.",
+                       nargs="+",
+                       required=True)
    parser.add_argument("-o", "--outputPath",
                        dest="out_path",
                        help="Path to blast output",
-                       default='')
+                       default='.')
+   parser.add_argument("-tmp", "--tmp_dir",
+                       help=("Temporary directory for storage of the results "
+                       "from the external software."))
+
+   parser.add_argument("-mp", "--methodPath",
+                       dest="method_path",
+                       help="Path to method to use (kma or blastn)")
 
-   parser.add_argument("-b", "--blastPath",
-                       dest="blast_path",
-                       help="Path to blast",
-                       default='blastn')
    parser.add_argument("-p", "--databasePath",
                        dest="db_path",
                        help="Path to the databases",
                        default='')
 
-   parser.add_argument("-k", "--kmaPath",
-                       dest="kma_path",
-                       help="Path to KMA",
-                       default=None)
-   parser.add_argument("-q", "--databasePathKMA",
-                       dest="db_path_kma",
-                       help="Path to the directories containing the KMA \
-                             indexed databases. Defaults to the directory \
-                             'kma_indexing' inside the databasePath \
-                             directory.",
-                       default=None)
-
    parser.add_argument("-d", "--databases",
                        dest="databases",
                        help="Databases chosen to search in - if none are \
@@ -459,16 +518,45 @@ if __name__ == '__main__':
                        dest="threshold",
                        help="Blast threshold for identity",
                        default=0.90)
+   parser.add_argument("-ao", "--acq_overlap",
+                       help="Genes are allowed to overlap this number of\
+                             nucleotides. Default: 30.",
+                       type=int,
+                       default=30)
+   parser.add_argument("-matrix", "--matrix",
+                       help="Gives the counts all all called bases at each\
+                       position in each mapped template. Columns are: reference\
+                       base, A count, C count, G count, T count, N count,\
+                        - count.",
+                       dest="kma_matrix",
+                       action='store_true',
+                       default=False)
+   parser.add_argument("-x", "--extented_output",
+                       help=("Give extented output with allignment files, "
+                       "template and query hits in fasta and a tab "
+                       "seperated file with gene profile results"),
+                       action="store_true")
+   parser.add_argument("-q", "--quiet",
+                       action="store_true",
+                       default=False)
+
    args = parser.parse_args()
 
    ##########################################################################
    # MAIN
    ##########################################################################
 
+   if args.quiet:
+       f = open('/dev/null', 'w')
+       sys.stdout = f
+
    # Defining varibales
 
-   min_cov = args.min_cov
-   threshold = args.threshold
+   min_cov = float(args.min_cov)
+   threshold = float(args.threshold)
+   method_path = args.method_path
+
+
 
    # Check if valid database is provided
    if args.db_path is None:
@@ -489,37 +577,17 @@ if __name__ == '__main__':
    if not os.path.exists(notes_path):
       sys.exit('Input Error: notes.txt not found! (%s)' % (notes_path))
 
-   # Check for input
-   if args.inputfile is None and args.fastq1 is None:
-      sys.exit("Input Error: No Input were provided!\n")
-
-   # Check if valid input file for assembly is provided
-   if args.inputfile:
-      if not os.path.exists(args.inputfile):
-         sys.exit("Input Error: Input file does not exist!\n")
-      else:
-         inputfile = args.inputfile
-   else:
-      inputfile = None
-
-   # Check if valid input files for raw data
-   if args.fastq1:
-
-      if not os.path.exists(args.fastq1):
-         sys.exit("Input Error: fastq1 file does not exist!\n")
-      else:
-         input_fastq1 = args.fastq1
-
-      if args.fastq2:
-         if not os.path.exists(args.fastq2):
-            sys.exit("Input Error: fastq2 file does not exist!\n")
-         else:
-            input_fastq2 = args.fastq2
-      else:
-         input_fastq2 = None
+   # Check if valid input files are provided
+   if args.inputfile is None:
+       sys.exit("Input Error: No input file was provided!\n")
+   elif not os.path.exists(args.inputfile[0]):
+       sys.exit("Input Error: Input file does not exist!\n")
+   elif len(args.inputfile) > 1:
+       if not os.path.exists(args.inputfile[1]):
+           sys.exit("Input Error: Input file does not exist!\n")
+       infile = args.inputfile
    else:
-      input_fastq1 = None
-      input_fastq2 = None
+       infile = args.inputfile
 
    # Check if valid output directory is provided
    if not os.path.exists(args.out_path):
@@ -527,26 +595,76 @@ if __name__ == '__main__':
    else:
       out_path = args.out_path
 
-   # If input is an assembly, then use BLAST
-   if(inputfile is not None):
-      finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
-                         db_path=db_path, notes=notes_path)
-
-      blast_run = finder.blast(inputfile=inputfile, out_path=out_path,
-                               min_cov=min_cov, threshold=threshold,
-                               blast=args.blast_path)
-
-      finder.write_results(out_path=out_path, result=blast_run,
-                           res_type=ResFinder.TYPE_BLAST)
-
-   # If the input is raw read data, then use KMA
-   elif(input_fastq1 is not None):
-      finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
-                         db_path=db_path, db_path_kma=args.db_path_kma,
-                         notes=notes_path)
-
-      # if input_fastq2 is None, it is ignored by the kma method.
-      kma_run = finder.kma(inputfile_1=input_fastq1, inputfile_2=input_fastq2)
-
-      finder.write_results(out_path=out_path, result=kma_run,
-                           res_type=ResFinder.TYPE_KMA)
+   # Add kma_matrix
+   if (args.kma_matrix):
+      extra_args = "-matrix"
+   else:
+      extra_args = None
+
+   # Check if valid tmp directory is provided
+   if args.tmp_dir:
+       if not os.path.exists(args.tmp_dir):
+           sys.exit("Input Error: Tmp dirctory, {}, does not exist!\n"
+                    .format(args.tmp_dir))
+       else:
+           tmp_dir = os.path.abspath(args.tmp_dir)
+   else:
+       tmp_dir = out_path
+
+   file_format = get_file_format(infile)
+
+   if file_format == "fastq":
+       if not method_path:
+           method_path = "kma"
+       if find_executable(method_path) is None:
+           sys.exit("No valid path to a kma program was provided. Use the -mp "
+                    "flag to provide the path.")
+
+    # Check the number of files
+       if len(infile) == 1:
+           infile_1 = infile[0]
+           infile_2 = None
+       elif len(infile) == 2:
+           infile_1 = infile[0]
+           infile_2 = infile[1]
+       else:
+           sys.exit("Only 2 input file accepted for raw read data,\
+                    if data from more runs is avaliable for the same\
+                    sample, please concatinate the reads into two files")
+
+
+       sample_name = os.path.basename(sorted(args.inputfile)[0])
+       method = "kma"
+       finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
+                          db_path=db_path, db_path_kma=args.db_path,
+                          notes=notes_path)
+       # if input_fastq2 is None, it is ignored by the kma method.
+       kma_run = finder.kma(inputfile_1=infile_1, inputfile_2=infile_2,
+                            out_path=tmp_dir, databases=finder.databases,
+                            db_path_kma=finder.db_path_kma,
+                            min_cov=min_cov,threshold=threshold,
+                            kma_path=method_path,
+                            kma_add_args=extra_args)
+
+       finder.create_results(results_method=kma_run,outdir=out_path)
+
+
+   elif file_format == "fasta":
+       if not method_path:
+           method_path = "blastn"
+       if find_executable(method_path) is None:
+           sys.exit("No valid path to a blastn program was provided. Use the "
+                    "-mp flag to provide the path.")
+
+       # Assert that only one fasta file is inputted
+       assert len(infile) == 1, "Only one input file accepted for assembled data"
+       infile = infile[0]
+       method = "blast"
+       finder = ResFinder(db_conf_file=db_config_file, databases=args.databases,
+                          db_path=db_path, notes=notes_path)
+
+       blast_run = finder.blast(inputfile=infile, out_path=tmp_dir,
+                                min_cov=min_cov, threshold=threshold,
+                                blast=method_path,
+                                allowed_overlap=args.acq_overlap)
+       finder.create_results(results_method=blast_run,outdir=out_path)



View it on GitLab: https://salsa.debian.org/med-team/resfinder/commit/ef10b47463acad3a6d4264c8968345ff8b83b27a

-- 
View it on GitLab: https://salsa.debian.org/med-team/resfinder/commit/ef10b47463acad3a6d4264c8968345ff8b83b27a
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191107/703cbdbd/attachment-0001.html>


More information about the debian-med-commit mailing list