[med-svn] [card-rgi] 01/02: New upstream version 3.2.0

Andreas Tille tille at debian.org
Thu May 11 07:31:10 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository card-rgi.

commit 386753fc7556bae6f33df22b3c94ef946776bdbd
Author: Andreas Tille <tille at debian.org>
Date:   Thu May 11 09:29:35 2017 +0200

    New upstream version 3.2.0
---
 .gitignore          |    3 +
 _data/.gitignore    |    1 +
 _data/card.json     |    1 +
 _db/.gitignore      |    2 +
 _docs/INSTALL       |  373 +++++++++++++
 _docs/README        |   78 +++
 blastnsnp.py        |  223 ++++++++
 clean.py            |   98 ++++
 contigToORF.py      |  113 ++++
 contigToProteins.py |   46 ++
 convertJsonToTSV.py |  426 +++++++++++++++
 create_gff3_file.py |  197 +++++++
 filepaths.py        |   13 +
 formatJson.py       |   26 +
 fqToFsa.py          |   20 +
 load.py             |   45 ++
 rgi.py              | 1470 +++++++++++++++++++++++++++++++++++++++++++++++++++
 rrna.py             |  159 ++++++
 tests/protein.fasta |    2 +
 19 files changed, 3296 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c98ee2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+app.log
+!__init__.py
\ No newline at end of file
diff --git a/_data/.gitignore b/_data/.gitignore
new file mode 100644
index 0000000..8fa7c98
--- /dev/null
+++ b/_data/.gitignore
@@ -0,0 +1 @@
+card.json
\ No newline at end of file
diff --git a/_data/card.json b/_data/card.json
new file mode 100644
index 0000000..1829b89
--- /dev/null
+++ b/_data/card.json
@@ -0,0 +1 @@
+{"2":{"model_id":"2","model_name":"CblA-1","model_type":"protein homolog model","model_type_id":"40292","model_description":"Models to detect proteins conferring antibiotic resistance, which include a reference protein sequence and a curated BLASTP cut-off.","model_param":{"blastp_evalue":{"param_type":"BLASTP e-value","param_description":"A curated expectation value (e-value) for assignment of an Antibiotic Resistance Ontology term based on a BLASTP hit to a CARD reference sequence.","p [...]
\ No newline at end of file
diff --git a/_db/.gitignore b/_db/.gitignore
new file mode 100644
index 0000000..8802c9f
--- /dev/null
+++ b/_db/.gitignore
@@ -0,0 +1,2 @@
+protein*
+!__init__.py
\ No newline at end of file
diff --git a/_docs/INSTALL b/_docs/INSTALL
new file mode 100644
index 0000000..79d1088
--- /dev/null
+++ b/_docs/INSTALL
@@ -0,0 +1,373 @@
+
+|--------------------------------------------------------------------------
+| Resistance Gene Identifier (RGI) Documentation
+|--------------------------------------------------------------------------
+
+	Before you run the RGI scripts, make sure you have installed the required external tools:
+
+|--------------------------------------------------------------------------
+| Install open reading frame callers : Prodigal, http://prodigal.ornl.gov/, https://github.com/hyattpd/prodigal/wiki/Installation
+|--------------------------------------------------------------------------
+
+
+	# Install Prodigal - conda must already be installed
+
+		$ conda install --channel bioconda prodigal
+
+	# Mac OS X
+
+		$ brew tap homebrew/science
+
+		$ brew install homebrew/science/prodigal
+
+	# Linux Redhat / Centos
+
+		$ sudo yum groupinstall 'Development Tools' && sudo yum install curl git irb python-setuptools ruby
+
+		$ git clone https://github.com/Linuxbrew/brew.git ~/.linuxbrew
+
+		$ export PATH="$HOME/.linuxbrew/bin:$PATH"
+
+		$ export MANPATH="$HOME/.linuxbrew/share/man:$MANPATH"
+	
+		$ export INFOPATH="$HOME/.linuxbrew/share/info:$INFOPATH"
+
+	# Linux Debian / Ubuntu
+
+		$ brew tap homebrew/science
+
+		$ brew install homebrew/science/prodigal
+
+
+|--------------------------------------------------------------------------
+| Install alignment software : BLAST and DIAMOND
+|--------------------------------------------------------------------------
+
+|--------------------------------------------------------------------------
+| BLAST ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
+|--------------------------------------------------------------------------
+
+	- Tested with BLAST 2.2.28, BLAST 2.2.31+ and 2.5.0+ on linux 64 and Mac OS X
+
+	* You can also run the following command to install BLAST. Note that this installs only version 2.2.28:
+
+	$ sudo apt-get install ncbi-blast+
+
+	- Test the BLAST install with the following command:
+
+	$ makeblastdb
+
+	* Biopython http://biopython.org/DIST/docs/install/Installation.html#sec12
+	* Run the following command to install Biopython
+
+	$ sudo apt-get install python-biopython
+
+	* Download the database - card.json from Downloads on the CARD website (a copy may be included with this release)
+
+
+|--------------------------------------------------------------------------
+| DIAMOND 
+| - https://ab.inf.uni-tuebingen.de/software/diamond
+| - https://github.com/bbuchfink/diamond
+| - https://github.com/bbuchfink/diamond/releases
+|--------------------------------------------------------------------------
+
+	* Install diamond using conda packages
+
+	$ conda install --channel bioconda diamond
+
+
+|--------------------------------------------------------------------------
+| Running RGI:
+|--------------------------------------------------------------------------
+
+Open a terminal, type: 
+
+	$ python rgi.py -h 
+
+Check software version:
+
+	$ python rgi.py -sv
+
+	$ python rgi.py --software_version
+
+Check data version:
+
+	$ python rgi.py -dv
+
+	$ python rgi.py --data_version
+
+|--------------------------------------------------------------------------
+| RGI inputs
+|--------------------------------------------------------------------------
+
+	$ python rgi.py -h
+
+		usage: rgi.py [-h] [-t INTYPE] [-i INPUTSEQ] [-n THREADS] [-o OUTPUT]
+		              [-e CRITERIA] [-c CLEAN] [-d DATA] [-l VERBOSE] [-a ALIGNER]
+		              [-r DATABASE] [-sv] [-dv]
+
+		Resistance Gene Identifier - Version 3.1.2
+
+		optional arguments:
+		  -h, --help            show this help message and exit
+		  -t INTYPE, --input_type INTYPE
+		                        must be one of contig, orf, protein, read (default:
+		                        contig)
+		  -i INPUTSEQ, --input_sequence INPUTSEQ
+		                        input file must be in either FASTA (contig and
+		                        protein), FASTQ(read) or gzip format! e.g
+		                        myFile.fasta, myFasta.fasta.gz
+		  -n THREADS, --num_threads THREADS
+		                        Number of threads (CPUs) to use in the BLAST search
+		                        (default=32)
+		  -o OUTPUT, --output_file OUTPUT
+		                        Output JSON file (default=Report)
+		  -e CRITERIA, --loose_criteria CRITERIA
+		                        The options are YES to include loose hits and NO to
+		                        exclude loose hits. (default=NO to exclude loose hits)
+		  -c CLEAN, --clean CLEAN
+		                        This removes temporary files in the results directory
+		                        after run. Options are NO or YES (default=YES for
+		                        remove)
+		  -d DATA, --data DATA  Specify a data-type, i.e. wgs, chromosome, plasmid,
+		                        etc. (default = NA)
+		  -l VERBOSE, --verbose VERBOSE
+		                        log progress to file. Options are OFF or ON (default =
+		                        OFF for no logging)
+		  -a ALIGNER, --alignment_tool ALIGNER
+		                        choose between BLAST and DIAMOND. Options are BLAST or
+		                        DIAMOND (default = BLAST)
+		  -r DATABASE, --db DATABASE
+		                        specify path to CARD blast databases (default: None)
+		  -sv, --software_version
+		                        Prints software number
+		  -dv, --data_version   Prints data version number
+
+
+	INTYPE must be one of 'contig', 'protein' or 'read':
+
+	1. 'contig' means that inputSequence is a DNA sequence stored in a FASTA file, presumably a complete genome or assembly contigs. RGI will predict ORFs de novo and predict the resistome using a combination of BLASTP against the CARD data, curated cut-offs, and SNP screening.
+
+	2. 'protein', as its name suggests, requires a FASTA file with protein sequences. As above, RGI predicts the resistome using a combination of BLASTP against the CARD data, curated cut-offs, and SNP screening.
+
+	3. 'read' expects raw FASTQ format nucleotide data and predicts the resistome using a combination of BLASTX against the CARD data, curated cut-offs, and SNP screening. This is an experimental tool and we have yet to adjust the CARD cut-offs for BLASTX. We will be exploring other metagenomics and FASTQ screening methods. Note that RGI does not perform any pre-processing of the FASTQ data (linker trimming, etc.). Example invocations are shown below.
+
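+	For example, one run per input type (the file names are illustrative placeholders, not files shipped with RGI):
+
+	$ python rgi.py -t contig -i myGenome.fasta -o Report
+
+	$ python rgi.py -t protein -i myProteins.fasta -o Report
+
+	$ python rgi.py -t read -i myReads.fastq -o Report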
+
+|--------------------------------------------------------------------------
+| RGI outputs
+|--------------------------------------------------------------------------
+
+	RGI produces a detailed JSON file, a summary tab-delimited file, and a GFF3 file (where applicable).
+
+	The JSON is as follows (example shows only one hit):
+
+	- gene_71|gi|378406451|gb|JN420336.1| Klebsiella pneumoniae plasmid pNDM-MAR, complete sequence: {
+		// Hit 1
+		gnl|BL_ORD_ID|39|hsp_num:0: {
+			SequenceFromBroadStreet: "MRYIRLCIISLLATLPLAVHASPQPLEQIKQSESQLSGRVGMIEMDLASGRTLTAWRADERFPMMSTFKVVLCGAVLARVDAGDEQLERKIHYRQQDLVDYSPVSEKHLADGMTVGELCAAAITMSDNSAANLLLATVGGPAGLTAFLRQIGDNVTRLDRWETELNEALPGDARDTTTPASMAATLRKLLTSQRLSARSQRQLLQWMVDDRVAGPLIRSVLPAGWFIADKTGASKRGARGIVALLGPNNKAERIVVIYLRDTPASMAERNQQIAGIGAA",
+			"orf_start": 67822,
+			"ARO_name": "SHV-12",
+			"type_match": "Loose",
+			"query": "INDWRLDYNECRPHSSLNYLTPAEFAAGWRN",
+			"evalue": 3.82304,
+			"max-identities": 10,
+			"orf_strand": "-",
+			"bit-score": 24.6386,
+			"cvterm_id": "35914",
+			"sequenceFromDB": "LDRWETELNEALPGDARDTTTPASMAATLRK",
+			"match": "++ W  + NE  P  + +  TPA  AA  R ",
+			"model_id": "103",
+			"orf_From": "gi|378406451|gb|JN420336.1| Klebsiella pneumoniae plasmid pNDM-MAR, complete sequence",
+			"pass_evalue": 1e-100,
+			"query_end": 68607,
+			"ARO_category": {
+			    "36696": {
+				    "category_aro_name": "antibiotic inactivation enzyme",
+				    "category_aro_cvterm_id": "36696",
+				    "category_aro_accession": "3000557",
+				    "category_aro_description": "Enzyme that catalyzes the inactivation of an antibiotic resulting in resistance.  Inactivation includes chemical modification, destruction, etc."
+				},
+				"36268": {
+			        "category_aro_name": "beta-lactam resistance gene",
+			        "category_aro_cvterm_id": "36268",
+			        "category_aro_accession": "3000129",
+			        "category_aro_description": "Genes conferring resistance to beta-lactams."
+			    }
+			},
+			"ARO_accession": "3001071",
+			"query_start": 68515,
+			"model_name": "SHV-12",
+			"model_type": "protein homolog model",
+			"orf_end": 68646
+		},
+		...
+		// Hit 2
+		...
+		// Hit 3
+		...
+	}
+
+|--------------------------------------------------------------------------
+| Getting Tab Delimited output after running RGI:
+|--------------------------------------------------------------------------
+
+	Run the following command to get help on generating the tab-delimited output:
+
+	$ python convertJsonToTSV.py -h
+
+|--------------------------------------------------------------------------
+| convertJsonToTSV inputs
+|--------------------------------------------------------------------------
+
+	$ python convertJsonToTSV.py -h
+
+		usage: convertJsonToTSV.py [-h] [-i AFILE] [-o OUTPUT] [-v VERBOSE]
+
+		Convert RGI JSON file to Tab-delimited file
+
+		optional arguments:
+		  -h, --help            show this help message and exit
+		  -i AFILE, --afile AFILE
+		                        must be a json file generated from RGI in JSON or gzip
+		                        format e.g out.json, out.json.gz
+		  -o OUTPUT, --out_file OUTPUT
+		                        Output Tab-delimited file (default=dataSummary)
+		  -v VERBOSE, --verbose VERBOSE
+		                        include help menu. Options are OFF or ON (default =
+		                        OFF for no help)
+
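+	For example (assuming rgi.py has already written Report.json):
+
+	$ python convertJsonToTSV.py -i Report.json -o dataSummary
+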
+|--------------------------------------------------------------------------
+| convertJsonToTSV outputs
+|--------------------------------------------------------------------------
+
+	This outputs a tab-delimited text file: dataSummary.txt
+
+	The tab-delimited output is as follows:
+			
+	---------------------------------------------------------------------		
+	COLUMN  		HELP_MESSAGE
+	---------------------------------------------------------------------
+	ORF_ID ------- Open Reading Frame identifier (internal to RGI)
+	CONTIG ------- Source Sequence
+	START ------- Start co-ordinate of ORF
+	STOP ------- End co-ordinate of ORF
+	ORIENTATION ------- Strand of ORF
+	CUT_OFF ------- RGI Detection Paradigm
+	PASS_EVALUE ------- STRICT detection model Expectation value cut-off
+	Best_Hit_evalue ------- Expectation value of match to top hit in CARD
+	Best_Hit_ARO ------- ARO term of top hit in CARD
+	Best_Identities ------- Percent identity of match to top hit in CARD
+	ARO ------- ARO accession of top hit in CARD
+	ARO_name ------- ARO term of top hit in CARD
+	Model_type ------- CARD detection model type
+	SNP ------- Observed mutation (if applicable)
+	ARO_category ------- ARO Categorization
+	bit_score ------- Bitscore of match to top hit in CARD
+	Predicted_Protein ------- ORF predicted protein sequence
+	CARD_Protein_Sequence ------- Protein sequence of top hit in CARD
+	LABEL ------- ORF label (internal to RGI)
+	ID ------- HSP identifier (internal to RGI)
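+
+	The summary can then be processed with standard tools; a minimal Python sketch (assumes dataSummary.txt is in the working directory):
+
+		import csv
+
+		with open("dataSummary.txt") as f:
+			for row in csv.DictReader(f, delimiter="\t"):
+				print row["ORF_ID"], row["Best_Hit_ARO"], row["CUT_OFF"]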
+
+|--------------------------------------------------------------------------
+| Files Structure
+|--------------------------------------------------------------------------
+
+
+`-- rgi/
+   |-- _data/
+   	   |-- card.json
+   |-- _db/
+   		... (BLAST DBs)
+   |-- _docs/
+   		|-- INSTALL
+   		|-- README
+   |-- _tmp/
+   |-- tests/
+   |-- __init__.py
+   |-- blastnsnp.py
+   |-- clean.py
+   |-- contigToORF.py
+   |-- contigToProteins.py
+   |-- convertJsonToTSV.py
+   |-- create_gff3_file.py
+   |-- filepaths.py
+   |-- formatJson.py
+   |-- fqToFsa.py
+   |-- load.py
+   |-- rgi.py
+   |-- rrna.py
+   
+
+
+|--------------------------------------------------------------------------
+| Loading new card.json:
+|--------------------------------------------------------------------------
+
+	* If a new card.json is available, replace card.json in the directory shown above using load.py. For usage, run:
+
+	$ python load.py -h
+
+
+|--------------------------------------------------------------------------
+| Load inputs
+|--------------------------------------------------------------------------
+
+	$ python load.py -h
+
+		usage: load.py [-h] [-i AFILE]
+
+		Load card database json file
+
+		optional arguments:
+		  -h, --help            show this help message and exit
+		  -i AFILE, --afile AFILE
+		                        must be a card database json file
+
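+	For example, to load a newly downloaded card.json:
+
+	$ python load.py -i card.json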
+
+|--------------------------------------------------------------------------
+| Clean databases
+|--------------------------------------------------------------------------
+
+	* BLAST databases are created when rgi.py is first run.
+
+	* After a new card.json is loaded, run clean.py to remove the old databases and temporary files:
+
+	$ python clean.py -h
+
+
+|--------------------------------------------------------------------------
+| Clean inputs
+|--------------------------------------------------------------------------
+
+	$ python clean.py -h
+
+		usage: clean.py [-h]
+
+		Removes BLAST databases created using card.json
+
+		optional arguments:
+		  -h, --help  show this help message and exit
+
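+	clean.py takes no arguments other than -h; to clean, simply run:
+
+	$ python clean.py
+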
+|--------------------------------------------------------------------------
+| Format JSON
+|--------------------------------------------------------------------------
+
+	$ python formatJson.py -h 
+
+		usage: formatJson.py [-h] [-i IN_FILE] [-o OUT_FILE]
+
+		Convert RGI JSON file to Readable JSON file
+
+		optional arguments:
+		  -h, --help            show this help message and exit
+		  -i IN_FILE, --in_file IN_FILE
+		                        input file must be in JSON format e.g Report.json
+		  -o OUT_FILE, --out_file OUT_FILE
+		                        Output JSON file (default=ReportFormatted)	
+
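+	For example, to pretty-print Report.json into ReportFormatted.json:
+
+	$ python formatJson.py -i Report.json -o ReportFormatted
+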
+|--------------------------------------------------------------------------
+| Contact Us:
+|--------------------------------------------------------------------------
+
+	For help please contact us at:
+
+	* CARD Developers <card at mcmaster.ca>
+
+ 
\ No newline at end of file
diff --git a/_docs/README b/_docs/README
new file mode 100644
index 0000000..47035cc
--- /dev/null
+++ b/_docs/README
@@ -0,0 +1,78 @@
+|--------------------------------------------------------------------------
+| Running RGI locally
+|--------------------------------------------------------------------------
+
+
+|--------------------------------------------------------------------------
+| Files
+|--------------------------------------------------------------------------
+
+
+`-- rgi/
+   |-- _data/
+         |-- card.json
+   |-- _db/
+         ... (BLAST DBs)
+   |-- _docs/
+         |-- INSTALL
+         |-- README
+   |-- _tmp/
+   |-- tests/
+   |-- __init__.py
+   |-- blastnsnp.py
+   |-- clean.py
+   |-- contigToORF.py
+   |-- contigToProteins.py
+   |-- convertJsonToTSV.py
+   |-- create_gff3_file.py
+   |-- filepaths.py
+   |-- formatJson.py
+   |-- fqToFsa.py
+   |-- load.py
+   |-- rgi.py
+   |-- rrna.py
+   
+
+
+|--------------------------------------------------------------------------
+| Commands for Running RGI locally
+|--------------------------------------------------------------------------
+
+
+$ python rgi.py -h
+
+$ python clean.py -h
+
+$ python convertJsonToTSV.py -h
+
+$ python load.py -h
+
+$ python formatJson.py -h 
+
+
+|--------------------------------------------------------------------------
+| Running RGI system-wide
+|--------------------------------------------------------------------------
+
+# install RGI
+
+$ conda install --channel bioconda rgi
+
+# un-install RGI
+
+$ conda remove rgi
+
+|--------------------------------------------------------------------------
+| Commands for Running RGI system-wide
+|--------------------------------------------------------------------------
+
+$ rgi -h
+
+$ rgi_clean -h
+
+$ rgi_jsontab -h
+
+$ rgi_load -h
+
+$ rgi_jsonformat -h
+
diff --git a/blastnsnp.py b/blastnsnp.py
new file mode 100644
index 0000000..0705a84
--- /dev/null
+++ b/blastnsnp.py
@@ -0,0 +1,223 @@
+import sys
+import json
+import os
+
+
+def removeTempFile():	
+	if os.path.isfile("blastnRes.xml"):
+		os.remove("blastnRes.xml")
+	if os.path.isfile("dnadb.fsa"):
+		os.remove("dnadb.fsa")
+	if os.path.isfile("dna.db.nhr"):
+		os.remove("dna.db.nhr")
+		os.remove("dna.db.nin")
+		os.remove("dna.db.nsq")
+
+
+def isfloat(value):
+	try:
+		float(value)
+		return True
+	except ValueError:
+		return False
+
+
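+# recursively convert unicode objects decoded from JSON into utf-8
+# byte strings (Python 2)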
+def convert(input):
+	if isinstance(input, dict):
+		return dict((convert(key), convert(value)) for key, value in input.iteritems())
+	elif isinstance(input, list):
+		return [convert(element) for element in input]
+	elif isinstance(input, unicode):
+		return input.encode('utf-8')
+	else:
+		return input
+
+
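+# extract the field between the (n+3)th and (n+4)th '|' of a BLAST hit
+# title, drop its 'key: ' prefix, and cast to int/float when numeric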
+def findnthbar(bunchstr, n):
+	barc = 0
+	start = n+3
+	over = n+4
+	temp = ""
+	for eachc in bunchstr:
+		if eachc == '|':
+			barc += 1
+		if barc == start:
+			temp += eachc
+		if barc == over:
+			break
+	colonpos = temp.find(':')
+	res=temp[colonpos+2:]
+	res=res.rstrip()
+	if res.isdigit():
+		return int(res)
+	elif isfloat(res):
+		return float(res)
+	else:
+		res = res.encode('ascii','replace')
+		return res
+
+
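+# True when my_dict[key] exists and is not None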
+def checkKeyExisted(key, my_dict):
+	try:
+		nonNone = my_dict[key] is not None
+	except KeyError:
+		nonNone = False
+	return nonNone
+
+
+def writeFASTAfromJson():
+	if os.path.isfile("dnadb.fsa") == False:
+		with open("card.json") as json_file:
+			json_data = json.load(json_file)
+			
+			with open ('dnadb.fsa', 'w') as wd:
+				for item in json_data:
+					if item.isdigit():
+						# model_type: blastN + SNP (pass_evalue + snp)
+						if json_data[item]["model_type_id"] == "40295":
+							snpList = ""
+							if checkKeyExisted('snp', json_data[item]['model_param']):
+								for key in json_data[item]['model_param']['snp']['param_value']:
+									snpList += json_data[item]['model_param']['snp']['param_value'][key]
+									snpList += ','
+
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wd, ('>' + item + '_' + seqkey + " | model_type_id: 40295" + " | SNP: " + snpList)
+									print>>wd, (json_data[item]["model_sequences"]["sequence"][seqkey]["dna_sequence"]["sequence"])
+			wd.close()
+		json_file.close()
+		
+
+def runBlastnSnp (inType, inputSeq):
+	if os.path.isfile("dna.db.nsq") == False: 
+		os.system('makeblastdb -in dnadb.fsa -dbtype nucl -out dna.db')		
+	os.system("blastn -out blastnRes.xml -outfmt 5 -query " + inputSeq + " -db dna.db")
+	
+	result_handle = open('blastnRes.xml')
+	from Bio.Blast import NCBIXML
+	blast_records = NCBIXML.parse(result_handle)			
+	blastnResults = {}
+	result_handle.close()
+
+	with open("card.json") as json_file:
+		json_data = json.load(json_file)
+	json_file.close()
+
+	for blast_record in blast_records:
+		nsnp = {}
+
+		for alignment in blast_record.alignments:
+			alignTitle = alignment.title
+			modelTypeID = findnthbar(alignTitle, 0)
+			spacepos = alignTitle.index(' ')
+			hitid = alignTitle[0:spacepos]
+			hitid = hitid.encode('ascii','replace')
+
+			modelDescrpt = alignTitle[alignTitle.index(' ')+1:]
+			underscoreinMD = modelDescrpt.index('_')
+			modelID = modelDescrpt[0:underscoreinMD]
+			seqinModel = modelDescrpt[underscoreinMD+1:modelDescrpt.index(' ')]
+
+			modelTypeDscp = alignTitle[alignTitle.index(':')+2:]
+			modelTypeId = modelTypeDscp[0:modelTypeDscp.index(' ')]
+			passevalue = modelTypeDscp [modelTypeDscp.index(':')+2:]
+
+			init = 0
+			evalueSNP = findnthbar(alignTitle, 1)
+				#print evalueSNP
+			snpL = []
+			temp = ""
+
+			for eachc in evalueSNP:
+				if eachc == ',':
+					snpL.append(temp)
+					temp = ""
+				else:
+					temp += eachc
+
+			snpdictlist = []
+			for eachsnp in snpL:					
+				snpdictlist.append({"original": eachsnp[0], "change": eachsnp[-1], "position": int(eachsnp[1:-1])})
+					
+			for hsp in alignment.hsps:
+				querySeq = hsp.query.replace('-', '')
+				realQueryLength = len(querySeq)
+
+				sbjctSeq = hsp.sbjct.replace('-', '')
+				realSbjctLength = len(sbjctSeq)
+				sinsidedict = {}
+
+				for eachs in snpdictlist:
+					pos = eachs["position"]
+					ori = eachs["original"]
+					chan = eachs["change"]
+
+					if hsp.sbjct_start <= pos and (hsp.sbjct_start + realSbjctLength) > pos:
+						target = pos - hsp.sbjct_start
+						c = 0
+						snpInQuery = 0
+
+						for ch in hsp.sbjct:
+							if c == target:
+								if ch == '-':
+									snpInQuery += 1
+								else:
+									break
+							elif ch == '-':
+								snpInQuery += 1
+							else:
+								snpInQuery += 1
+								c += 1
+
+						if hsp.query[snpInQuery] == chan and sbjctSeq[target] == ori:
+							nsinsidedict = {}
+							nsinsidedict["model_id"] = modelID
+							nsinsidedict["SNP"] = eachs
+							nsinsidedict["query_start"] = hsp.query_start
+							nsinsidedict["query_end"] = hsp.query_start + realQueryLength - 1
+							nsinsidedict["query_From"] = blast_record.query.encode('ascii','replace')
+							
+							if checkKeyExisted("NCBI_taxonomy", json_data[modelID]["model_sequences"]["sequence"][seqinModel]):
+								nsinsidedict["cvterm_id"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["NCBI_taxonomy"]["NCBI_taxonomy_cvterm_id"]
+							nsinsidedict["model_name"] = json_data[modelID]["model_name"]
+							nsinsidedict["model_type"] = json_data[modelID]["model_type"]
+							nsinsidedict["pass_evalue"] = passevalue
+							nsinsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
+							nsinsidedict["ARO_name"] = json_data[modelID]["ARO_name"]
+							nsinsidedict["ARO_category"] = json_data[modelID]["ARO_category"]
+							nsinsidedict["evalue"] = hsp.expect
+							nsinsidedict["max-identites"] = hsp.identities
+							nsinsidedict["bit-score"] = hsp.bits
+							
+							nsinsidedict["query"] = hsp.query.encode('ascii','replace')
+							nsinsidedict["match"] = hsp.match.encode('ascii','replace')
+							nsinsidedict["sequenceFromDB"] = hsp.sbjct.encode('ascii','replace')
+												
+							nsnp[hitid + "|hsp_num:" + str(init)] = nsinsidedict
+							init += 1
+
+			blastnResults[blast_record.query.encode('ascii','replace')] = nsnp	
+	jsonRes = json.dumps(blastnResults)
+	with open ('blastnReport.json', 'w') as wj:
+		print>>wj, jsonRes
+	wj.close()
+	return jsonRes
+
+
+def main(inType, inputSeq):
+	writeFASTAfromJson()
+	try:
+		myjson = runBlastnSnp('contig', inputSeq)
+		removeTempFile()
+		return myjson
+
+	except Exception as inst:
+		#pass
+		print>>sys.stderr, inst
+		removeTempFile()
+
+
+if __name__ == '__main__':
+	main('contig', sys.argv[2])
+	"""required: inputSeq must be in FASTA format!"""
diff --git a/clean.py b/clean.py
new file mode 100755
index 0000000..7a787b4
--- /dev/null
+++ b/clean.py
@@ -0,0 +1,98 @@
+import os
+import sys
+import filepaths
+import argparse
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+path = script_path+"/"
+
+app_files = [".gitignore","_docs","_tmp","_db","mgm"]
+
+# clean other files left over
+def _clean():
+	import glob
+	files = glob.glob(path+"*")
+	for f in files:
+		if os.path.isfile(f) and os.path.splitext(os.path.basename(f))[1][1:].strip() in ["adraft","xml","fsa","draft","pyc","log"]:
+			os.remove(f)
+
+		if os.path.isdir(f) == False:
+			if os.path.isfile(f) == True and os.path.splitext(os.path.basename(f))[1][1:].strip() in ["py"]:
+				pass
+			else:
+				if os.path.isfile(f):
+					print>>sys.stderr, "Remove: " + str(f)
+					os.remove(f)
+
+    # clean db files
+	db_files = glob.glob(path+"_db/*")
+	for dbfile in db_files:
+		if os.path.isfile(dbfile):
+			os.remove(dbfile)
+	print>>sys.stderr, "Cleaned directory: "+path+"_db"
+
+    # clean tmp files
+	tmp_files = glob.glob(path+"_tmp/*")
+	for tempfile in tmp_files:
+		if os.path.isfile(tempfile):
+			os.remove(tempfile)
+	print>>sys.stderr, "Cleaned directory: "+path+"_tmp"
+
+
+#remove temporary file
+def main():
+	if os.path.isfile(path+"contigToPro.fasta"):
+		os.remove(path+"contigToPro.fasta")
+	if os.path.isfile(path+"read.fsa"):
+		os.remove(path+"read.fsa")
+	if os.path.isfile(path+"contig.fsa"):
+		os.remove(path+"contig.fsa")
+	if os.path.isfile(path+"contigToORF.fsa"):
+		os.remove(path+"contigToORF.fsa")
+	if os.path.isfile(path+"blastRes.xml"):
+		os.remove(path+"blastRes.xml")
+	if os.path.isfile(path+"encodedHeaders.json"):
+		os.remove(path+"encodedHeaders.json")
+	if os.path.isfile(path+"predictedGenes.json"):
+		os.remove(path+"predictedGenes.json")
+	if os.path.isfile(path+"blastpjson"):
+		os.remove(path+"blastpjson")
+	if os.path.isfile(path+"proteindb.fsa"):
+		os.remove(path+"proteindb.fsa")
+	if os.path.isfile(path+"dnadb.fsa"):
+		os.remove(path+"dnadb.fsa")
+	if os.path.isfile(path+"protein.db.phr"):
+		os.remove(path+"protein.db.phr")
+		os.remove(path+"protein.db.pin")
+		os.remove(path+"protein.db.psq")
+	if os.path.isfile(path+"protein.db.dmnd"):
+		os.remove(path+"protein.db.dmnd")
+	if os.path.isfile(path+"temp.fsa"):
+		os.remove(path+"temp.fsa")
+	if os.path.isfile(path+"dna.db.nhr"):
+		os.remove(path+"dna.db.nhr")
+		os.remove(path+"dna.db.nin")
+		os.remove(path+"dna.db.nsq")
+	if os.path.isfile(path+"Report.json"):
+		os.remove(path+"Report.json")
+	if os.path.isfile(path+"ReportFormatted.json"):
+		os.remove(path+"ReportFormatted.json")
+	if os.path.isfile(path+"draft"):
+		os.remove(path+"draft")
+	if os.path.isfile(path+"adraft"):
+		os.remove(path+"adraft")
+	if os.path.isfile(path+"dataSummary.txt"):
+		os.remove(path+"dataSummary.txt")
+
+	_clean()
+
+	print>>sys.stderr, "Cleaned directory: "+path
+
+def run():
+	parser = argparse.ArgumentParser(description='Removes BLAST databases created using card.json')
+	args = parser.parse_args()
+	main()
+
+if __name__ == '__main__':
+	run()
diff --git a/contigToORF.py b/contigToORF.py
new file mode 100644
index 0000000..a9cb5af
--- /dev/null
+++ b/contigToORF.py
@@ -0,0 +1,113 @@
+import os
+import sys
+import filepaths
+import time
+import hashlib
+import datetime
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+
+path = script_path
+
+def catProteins(filename,afile):
+
+	with open(afile, 'r') as f:
+		data = f.readlines()
+	f.close()
+
+	with open(working_directory+'/'+filename+'.contigToORF.fsa', 'w') as wf:
+
+		startrecord = False
+
+		for eachline in data:
+			if eachline == '':
+				pass
+			elif eachline[0:18] == "Model information:":
+				startrecord = False
+			elif startrecord:
+				if eachline[0] == '>':
+					print>>wf, eachline.replace('\t>', '|').strip()
+				else:
+					if eachline.strip() != "":
+						print>>wf, eachline.strip()
+			elif eachline.strip() == "Nucleotide sequence of predicted genes:":
+				startrecord = True
+	wf.close()
+
+def get_character_len(file_name):
+	chars = words = lines = 0
+	with open(file_name, 'r') as in_file:
+		for line in in_file:
+			if line[0] == '>':
+				pass
+			else:
+				lines += 1
+				words += len(line.split())
+				chars += len(line)
+	in_file.close()
+	return chars
+
+def main(argvfile,clean,orf):
+	clean = clean.lower()
+	orf = orf.lower()
+	if orf == "genemark":
+		orf_metagenemark(argvfile,clean)
+	else:
+		orf_prodigal(argvfile,clean)
+   
+
+def orf_metagenemark(argvfile,clean):
+	filename = os.path.basename(argvfile)
+	os.system(path+"/mgm/gmhmmp -r -m "+path+"/mgm/MetaGeneMark_v1.mod -o "+working_directory+"/"+filename+".adraft -d " + argvfile)
+	catProteins(filename,working_directory +"/"+filename+".adraft")
+	if clean == "yes":
+		os.remove(working_directory +"/"+filename+".adraft")
+
+'''
+Training Using genomes:
+
+Train:
+	prodigal -i Salmonella_enterica_genome.fasta -p single -t Salmonella_enterica.trn
+
+Using traning file:
+
+	prodigal -i query.fasta -c -m -a trans_proteins.fasta -d trans_nulc -o output.txt -t Salmonella_enterica.trn -p single
+'''
+# prodigal -a dna.fasta.adraft -i dna.fasta -o dna.fasta.draft -p meta -d dna.fasta.contigToORF.fsa
+def orf_prodigal(argvfile,clean):
+
+	p = "single"
+	count =  int(get_character_len(argvfile))
+	#check for number of characters for selection procedure -- ribosomal binding sites
+	if count < 200000:
+		p = "meta"
+
+	filename = os.path.basename(argvfile)
+	os.system("prodigal -c -m -a "+working_directory+"/"+filename+".contig.fsa -i "+argvfile+" -o "+working_directory+"/"+filename+".draft -p "+p+" -d "+working_directory+"/"+filename+".contigToORF.fsa -q")
+
+	# format the contig file headers to remove space
+	format_fasta_headers(working_directory+"/"+filename+".contig.fsa")
+
+	if clean == "yes":
+		os.remove(working_directory +"/"+filename+".draft")
+
+def getHashName(name):
+	m = hashlib.md5()
+	t = datetime.datetime.utcnow()
+	m.update(name + str(t))
+	return m.hexdigest()
+
+def format_fasta_headers(afile):
+	name = getHashName(afile)
+	ofile = open(name, "w")
+	from Bio import SeqIO
+	for record in SeqIO.parse(afile, 'fasta'):
+		new_header =  record.description.strip().replace(" # ", "#")
+		ofile.write(">" + new_header + "\n" + str(record.seq) + "\n")
+	ofile.close()
+	os.rename(name, afile)	
+
+
+if __name__ == '__main__':
+	main(sys.argv[1],sys.argv[2], sys.argv[3])
diff --git a/contigToProteins.py b/contigToProteins.py
new file mode 100755
index 0000000..1f3f0a6
--- /dev/null
+++ b/contigToProteins.py
@@ -0,0 +1,46 @@
+import os
+import sys
+import filepaths
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+
+path = script_path
+
+def catProteins(filename,afile):
+
+	with open(afile, 'r') as f:
+		data = f.readlines()
+	f.close()
+	
+	with open(working_directory+'/'+filename+'.contig.fsa', 'w') as wf:
+
+		startrecord = False
+
+		for eachline in data:
+			if eachline == '':
+				pass
+			elif eachline[0:18] == "Model information:":
+				startrecord = False
+			elif startrecord:
+				if eachline[0] == '>':
+					print>>wf, eachline.replace('\t>', '|').strip().lstrip()
+				else:
+					if eachline.strip() != "":
+						print>>wf, eachline.strip()
+
+			elif eachline[0:19] == "Predicted proteins:":
+				startrecord = True
+	wf.close()
+
+
+def main(argvfile, clean):
+	filename = os.path.basename(argvfile)
+	os.system(path+"/mgm/gmhmmp -r -m "+path+"/mgm/MetaGeneMark_v1.mod -o "+working_directory+"/"+filename+".draft -a " + argvfile)
+	catProteins(filename,working_directory +"/"+filename+".draft")
+	if clean == "yes":
+		os.remove(working_directory +"/"+filename+".draft")
+
+
+if __name__ == '__main__':
+	main(sys.argv[1],sys.argv[2])
diff --git a/convertJsonToTSV.py b/convertJsonToTSV.py
new file mode 100755
index 0000000..824b642
--- /dev/null
+++ b/convertJsonToTSV.py
@@ -0,0 +1,426 @@
+import json
+import csv
+import sys
+import os
+import rgi
+import re
+
+import argparse
+import filepaths
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+path = script_path+"/"
+
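+# guess which delimiter (':', '|', '-' or '_') a FASTA header uses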
+def check_delimiter(fastaHeader):
+	# Colon
+	if ((':').join(re.split(':',fastaHeader)) == fastaHeader) and re.split(':',fastaHeader)[0] != fastaHeader:
+		return ":"
+	# Pipe
+	elif (('|').join(re.split('\|',fastaHeader)) == fastaHeader) and re.split('\|',fastaHeader)[0] != fastaHeader:
+		return "|"
+	# Dash
+	elif (('-').join(re.split('-',fastaHeader)) == fastaHeader) and re.split('-',fastaHeader)[0] != fastaHeader:
+		return "-"
+	# Underscore
+	elif (('_').join(re.split('_',fastaHeader)) == fastaHeader) and re.split('_',fastaHeader)[0] != fastaHeader:
+		return "_"	
+	# Other
+	else:
+		return ""	
+
+#output a particular field from alignment.Title by splitting it on '|'
+def findnthbar(bunchstr, start):
+
+	barc = 0
+	over = start+1
+	temp = ""
+
+	for eachc in bunchstr:
+		if eachc == '|':
+			barc += 1
+		if barc == start:
+			if eachc == '|':
+				pass
+			else:
+				temp += eachc
+		if barc == over:
+			break		
+
+	return temp
+
+#output a particular field from alignment.Title by splitting it on '#'
+def findnthbar2(bunchstr, n):
+	arr = bunchstr.split("#")
+	if n < len(arr):
+		# gene id
+		if n == 1 and arr[n]:
+			return int(arr[n])
+		elif n == 2:
+			return int(arr[n])
+		elif n == 3:
+			 if int(arr[n]) == 1:
+			 	# positive
+			 	return "+"
+			 else:
+			 	# neg
+			 	return "-"
+		else:
+			return arr[n]
+	else:
+		return ""
+
+def findORFfrom (bunchstr):
+	barc = 0
+	start = 6
+	temp = ""
+	allout = False
+
+	for eachc in bunchstr:
+		if eachc == '|':
+			barc += 1
+		if allout or barc == start:
+			allout = True
+			temp += eachc
+
+	return temp[1:]
+
+
+def convert(input):
+	if isinstance(input, dict):
+		return dict((convert(key), convert(value)) for key, value in input.iteritems())
+	elif isinstance(input, list):
+		return [convert(element) for element in input]
+	elif isinstance(input, unicode):
+		return input.encode('utf-8')
+	else:
+		return input
+
+
+def checkKeyExisted(key, my_dict):
+	try:
+		nonNone = my_dict[key] is not None
+	except KeyError:
+		nonNone = False
+	return nonNone
+
+
+def printCSV(resultfile,ofile,orf,verbose):
+	if os.path.isfile(resultfile) == False:
+		print>>sys.stderr, "convertJsonToTSV missing input JSON file."
+		exit()
+
+	try:
+		with open(resultfile, 'r') as f:
+			data = json.load(f)
+		f.close()
+	
+	except ValueError:
+		print>>sys.stderr, "convertJsonToTSV expects a file containing a VALID JSON string."
+		exit()
+
+	with open(working_directory+"/"+ofile+".txt", "w") as af:
+		writer = csv.writer(af, delimiter='\t', dialect='excel')
+		writer.writerow(["ORF_ID", "CONTIG", "START", "STOP", "ORIENTATION", "CUT_OFF", "PASS_EVALUE", "Best_Hit_evalue", "Best_Hit_ARO", "Best_Identities", "ARO", "ARO_name", "Model_type", "SNP", "Best_Hit_ARO_category", "ARO_category", "PASS_bitscore", "Best_Hit_bitscore", "bit_score","Predicted_DNA","Predicted_Protein","CARD_Protein_Sequence","LABEL","ID","Model_id"])
+		for item in data:
+			minevalue = 0.0
+			maxscore = 0.0
+			maxpercent = 0.0
+			startCompare = False
+			minARO = ""
+			bestAROcategorydict = {}
+			AROlist = []
+			AROnameList = []
+			bitScoreList = []
+			AROcatList = []
+			snpList = []
+			cutoffList = []
+			typeList = []
+			evalueList = []
+			identityList = []
+			SequenceFromBroadStreet = ""
+			predictedProtein = ""
+			predictedDNA = ""
+			geneID = ""
+			hitID = ""
+			topModel = ""
+			if item not in ["_metadata","data_type"]:
+				geneID = item
+				for it in data[item]:
+					cgList = []
+					if checkKeyExisted("ARO_category", data[item][it]):
+						for aroctkey in data[item][it]["ARO_category"]:
+							cgList.append(str(data[item][it]["ARO_category"][aroctkey]["category_aro_name"].encode('ascii','replace')))
+
+					if data[item][it]["model_type_id"] == 40293:
+						temp = data[item][it]["SNP"]["original"] + str(data[item][it]["SNP"]["position"]) + data[item][it]["SNP"]["change"]
+						snpList.append(convert(temp))
+					elif data[item][it]["model_type_id"] == 40292:
+						snpList.append("n/a")
+					"""
+					if data[item][it]["model_type_id"] == 41091:
+						if checkKeyExisted("SNP",data[item][it]):
+							temp = data[item][it]["SNP"]["original"] + str(data[item][it]["SNP"]["position"]) + data[item][it]["SNP"]["change"]
+							snpList.append(convert(temp))
+						else:
+							snpList.append("n/a")
+					"""
+
+					AROlist.append(convert(data[item][it]["ARO_accession"]))
+					AROnameList.append(convert(data[item][it]["ARO_name"]))
+					bitScoreList.append(data[item][it]["bit-score"])
+					pass_evalue = str(data[item][it]["pass_evalue"]).split("|")[0]
+					pass_bitscore = "n/a"
+					AROcatList.append(cgList)
+					typeList.append(convert(data[item][it]["model_type"]))
+					cutoffList.append(convert(data[item][it]["type_match"]))
+					identityList.append(float(data[item][it]["perc_identity"]))
+
+					bestAROcategory = []
+
+					# sort results by minimum e-value and maximum percent identity
+					if startCompare:
+						if maxscore < data[item][it]["bit-score"] and maxpercent < float(data[item][it]["perc_identity"]):
+							minevalue = data[item][it]["evalue"]
+							maxscore = data[item][it]["bit-score"]
+							maxpercent = float(data[item][it]["perc_identity"])
+							minARO = data[item][it]["ARO_name"]
+							topModel = data[item][it]["model_id"]
+							SequenceFromBroadStreet = data[item][it]["SequenceFromBroadStreet"]
+
+							if "orf_prot_sequence" in data[item][it]:
+								predictedProtein = data[item][it]["orf_prot_sequence"]
+							if "orf_dna_sequence" in data[item][it]:
+								predictedDNA = data[item][it]["orf_dna_sequence"]
+
+							if checkKeyExisted("ARO_category", data[item][it]):
+								for key in data[item][it]["ARO_category"]:
+									bestAROcategory.append(str(data[item][it]["ARO_category"][key]["category_aro_name"].encode('ascii','replace')))
+								bestAROcategorydict[str(minARO)+"|"+str(minevalue)] = bestAROcategory
+
+							if "hsp_num:" in it:
+								hitID = it
+						
+
+					else:
+						startCompare = True
+						minevalue = data[item][it]["evalue"]
+						maxscore = data[item][it]["bit-score"]
+						maxpercent = float(data[item][it]["perc_identity"])
+						minARO = data[item][it]["ARO_name"]
+						topModel = data[item][it]["model_id"]
+						SequenceFromBroadStreet = data[item][it]["SequenceFromBroadStreet"]
+
+						if "orf_prot_sequence" in data[item][it]:
+							predictedProtein = data[item][it]["orf_prot_sequence"]
+						if "orf_dna_sequence" in data[item][it]:
+								predictedDNA = data[item][it]["orf_dna_sequence"]
+
+						if checkKeyExisted("ARO_category", data[item][it]):
+							for key in data[item][it]["ARO_category"]:
+								bestAROcategory.append(str(data[item][it]["ARO_category"][key]["category_aro_name"].encode('ascii','replace')))
+							bestAROcategorydict[str(minARO)+"|"+str(minevalue)] = bestAROcategory
+
+						if "hsp_num:" in it:
+							hitID = it
+
+				clist = set(cutoffList)
+				tl = set(typeList)
+				arocatset = set(AROnameList)
+				
+				if set(snpList) == set(['n/a']):
+					snpList = 'n/a'
+				else:
+					snpList = ', '.join(snpList)
+
+				from itertools import chain
+				AROcatList = list(chain.from_iterable(AROcatList))
+				AROcatalphaSet = set(AROcatList)
+				AROsortedList = sorted(list(AROcatalphaSet))
+
+				if typeList:
+					if orf == "genemark":
+						#for protein RGI runs where there's no | or seq_start/stop/strand
+						if findnthbar(item, 4) == "":
+							writer.writerow([item, 
+								"", 
+								"", 
+								"", 
+								"", 
+								', '.join(list(clist)),
+								pass_evalue,
+								minevalue, 
+								minARO, 
+								maxpercent, 
+								', '.join(map(lambda x:"ARO:"+x, AROlist)), 
+								'; '.join(list(arocatset)),
+								'; '.join(list(tl)), 
+								snpList, 
+								'; '.join(bestAROcategorydict[str(minARO)+"|"+str(minevalue)]) ,
+								'; '.join(AROsortedList),
+								pass_bitscore,
+								maxscore ,
+								', '.join(map(str, bitScoreList)),
+								predictedDNA,
+								predictedProtein,
+								SequenceFromBroadStreet,
+								geneID,
+								hitID, 
+								topModel
+								])
+						else:
+							writer.writerow([findnthbar(item, 0),
+								findORFfrom(item),
+								int(findnthbar(item, 4))-1,
+								int(findnthbar(item, 5))-1,
+								findnthbar(item, 3),
+								', '.join(list(clist)),
+								pass_evalue,
+								minevalue,
+								minARO,
+								max(identityList),
+								', '.join(map(lambda x:"ARO:"+x, AROlist)),
+								'; '.join(list(arocatset)),
+								'; '.join(list(tl)),
+								snpList,
+								'; '.join(bestAROcategorydict[str(minARO)+"|"+str(minevalue)]),
+								'; '.join(AROsortedList),
+								pass_bitscore,
+								maxscore,
+								', '.join(map(str, bitScoreList)),
+								predictedDNA,
+								predictedProtein,
+								SequenceFromBroadStreet,
+								geneID,
+								hitID,
+								topModel
+								])
+					else:
+						if findnthbar2(item, 1) == "":
+							writer.writerow([item, 
+								"", 
+								"", 
+								"", 
+								"", 
+								', '.join(list(clist)),
+								pass_evalue,
+								minevalue,
+								minARO, 
+								maxpercent, 
+								', '.join(map(lambda x:"ARO:"+x, AROlist)), 
+								'; '.join(list(arocatset)),
+								', '.join(list(tl)), 
+								snpList, 
+								'; '.join(bestAROcategorydict[str(minARO)+"|"+str(minevalue)]),
+								'; '.join(AROsortedList), 
+								pass_bitscore,
+								maxscore,
+								', '.join(map(str, bitScoreList)),
+								predictedDNA,
+								predictedProtein,
+								SequenceFromBroadStreet,
+								geneID,
+								hitID,
+								topModel
+								])
+						else:
+							writer.writerow([findnthbar2(item, 0),
+								findnthbar2(item, 4).strip(" "), 
+								int(findnthbar2(item, 1))-1,
+								int(findnthbar2(item, 2))-1,
+								findnthbar2(item, 3),
+								', '.join(list(clist)),
+								pass_evalue,
+								minevalue,
+								minARO,
+								maxpercent,
+								', '.join(map(lambda x:"ARO:"+x, AROlist)),
+								', '.join(list(arocatset)),
+								', '.join(list(tl)),
+								snpList,
+								'; '.join(bestAROcategorydict[str(minARO)+"|"+str(minevalue)]),
+								'; '.join(AROsortedList),
+								pass_bitscore,
+								maxscore,
+								', '.join(map(str, bitScoreList)),
+								predictedDNA,
+								predictedProtein,
+								SequenceFromBroadStreet,
+								geneID,
+								hitID,
+								topModel
+								])
+
+
+
+	af.close()
+
+def manual():
+	h = {}
+	h["ORF_ID"] = "Open Reading Frame identifier (internal to RGI)"
+	h["CONTIG"] = "Source Sequence"
+	h["START"] = "Start co-ordinate of ORF"
+	h["STOP"] = "End co-ordinate of ORF"
+	h["ORIENTATION"] = "Strand of ORF"
+	h["CUT_OFF"] = "RGI Detection Paradigm"
+	h["PASS_EVALUE"] = "STRICT detection model Expectation value cut-off"
+	h["Best_Hit_evalue"] = "Expectation value of match to top hit in CARD"
+	h["Best_Hit_ARO"] = "ARO term of top hit in CARD"
+	h["Best_Identities"] = "Percent identity of match to top hit in CARD"
+	h["ARO"] = "ARO accession of top hit in CARD"
+	h["ARO_name"] = "ARO term of top hit in CARD"
+	h["Model_type"] = "CARD detection model type"
+	h["SNP"] = "Observed mutation (if applicable)"
+	h["Best_Hit_ARO_category"] = "top hit ARO Categorization"
+	h["ARO_category"] = "ARO Categorization"
+	h["PASS_bitscore"] = "STRICT detection model bitscore value cut-off"
+	h["Best_Hit_bitscore"] = "Bit score of match to top hit in CARD"
+	h["bit_score"] = "Bitscore of match to top hit in CARD"
+	h["Predicted_DNA"] = "ORF predicted nucleotide sequence"
+	h["Predicted_Protein"] = "ORF predicted protein sequence"
+	h["CARD_Protein_Sequence"] = "Protein sequence of top hit in CARD"
+	h["LABEL"] = "ORF label (internal to RGI)"
+	h["ID"] = "HSP identifier (internal to RGI)"
+	h["Model_id"] = "CARD detection model id"
+
+	print "\n"
+	print "COLUMN","\t\t\t","HELP_MESSAGE"
+	for i in h:
+		print i,"\t\t\t",h[i]	
+	print "\n"
+
+
+class customAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        manual()
+        exit()
+
+def main(args):
+	afile = args.afile
+	ofile = args.output
+	#orf = args.orf.lower()
+	orf = "prodigal"
+	verbose = args.verbose.lower()
+
+	# Check if file is compressed
+	if afile.endswith('.gz'):
+		afile = rgi.decompress(afile,'gz',working_directory)
+
+	if os.path.isfile(afile):	
+		printCSV(afile,ofile,orf,verbose)
+	else:
+		print "Missing file: ",afile 
+	rgi.removeTemp()
+
+def run():
+	parser = argparse.ArgumentParser(description='Convert RGI JSON file to Tab-delimited file')
+	parser.add_argument('-i','--afile',help='must be a json file generated from RGI in JSON or gzip format e.g out.json, out.json.gz')	
+	parser.add_argument('-o', '--out_file',  dest="output", default="dataSummary", help="Output Tab-delimited file (default=dataSummary)")
+	parser.add_argument('-v', '--verbose', dest="verbose", default="OFF", help = "include help menu. Options are OFF or ON  (default = OFF for no help)")
+	parser.add_argument('--headers', dest="headers", action=customAction, nargs=0, help = "print a description of each tab-delimited output column and exit")
+	args = parser.parse_args()
+	main(args)	
+
+if __name__ == '__main__':
+	run()
+
diff --git a/create_gff3_file.py b/create_gff3_file.py
new file mode 100644
index 0000000..7bdb23d
--- /dev/null
+++ b/create_gff3_file.py
@@ -0,0 +1,197 @@
+import json
+import csv
+import sys
+
+'''
+Column 1: "seqid"
+-----------------
+	The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]. In particular, IDs may not contain unescaped whitespace and must not begin with an unescaped ">".
+
+Column 2: "source"
+-----------------
+	The source is a free text qualifier intended to describe the algorithm or operating procedure that generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type creating a new composite type that is a subclass of the type in the type column.
+
+Column 3: "type"
+-----------------
+	The type of the feature (previously called the "method"). This is constrained to be either: (a) a term from the "lite" version of the Sequence Ontology - SOFA, (b) a term from the full Sequence Ontology - it must be an is_a child of sequence_feature (SO:0000110), or (c) a SOFA or SO accession number. The latter alternative is distinguished using the syntax SO:000000.
+
+Columns 4 & 5: "start" and "end"
+--------------------------------
+	The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature.
+
+	For zero-length features, such as insertion sites, start equals end and the implied site is to the right of the indicated base in the direction of the landmark.
+
+Column 6: "score"
+-----------------
+	The score of the feature, a floating point number. As in earlier versions of the format, the semantics of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features, and that P-values be used for ab initio gene prediction features.
+
+Column 7: "strand"
+-----------------
+	The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, and . for features that are not stranded. In addition, ? can be used for features whose strandedness is relevant, but unknown.
+
+Column 8: "phase"
+-----------------
+	For features of type "CDS", the phase indicates where the feature begins with reference to the reading frame. The phase is one of the integers 0, 1, or 2, indicating the number of bases that should be removed from the beginning of this feature to reach the first base of the next codon. In other words, 
+		a phase of "0" indicates that the next codon begins at the first base of the region described by the current line, 
+		a phase of "1" indicates that the next codon begins at the second base of this region, 
+		and a phase of "2" indicates that the codon begins at the third base of this region. This is NOT to be confused with the frame, which is simply start modulo 3.
+
+	For forward strand features, phase is counted from the start field. For reverse strand features, phase is counted from the end field.
+
+	The phase is REQUIRED for all CDS features.
+
+Column 9: "attributes"
+----------------------
+	A list of feature attributes in the format tag=value. Multiple tag=value pairs are separated by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be and should not be quoted. The quotes should be included as part of the value by parsers and not stripped.
+
+	These tags have predefined meanings:
+
+	ID
+		Indicates the ID of the feature. IDs for each feature must be unique within the scope of the GFF file. In the case of discontinuous features (i.e. a single feature that exists over multiple genomic locations) the same ID may appear on multiple lines. All lines that share an ID collectively represent a single feature.
+	Name
+		Display name for the feature. This is the name to be displayed to the user. Unlike IDs, there is no requirement that the Name be unique within the file.
+	Alias
+		A secondary name for the feature. It is suggested that this tag be used whenever a secondary identifier for the feature is needed, such as locus names and accession numbers. Unlike ID, there is no requirement that Alias be unique within the file.
+	Parent
+		Indicates the parent of the feature. A parent ID can be used to group exons into transcripts, transcripts into genes, an so forth. A feature may have multiple parents. Parent can *only* be used to indicate a partof relationship.
+	Target
+		Indicates the target of a nucleotide-to-nucleotide or protein-to-nucleotide alignment. The format of the value is "target_id start end [strand]", where strand is optional and may be "+" or "-". If the target_id contains spaces, they must be escaped as hex escape %20.
+	Gap
+		The alignment of the feature to the target if the two are not collinear (e.g. contain gaps). The alignment format is taken from the CIGAR format described in the Exonerate documentation. See "THE GAP ATTRIBUTE" for a description of this format.
+	Derives_from
+		Used to disambiguate the relationship between one feature and another when the relationship is a temporal one rather than a purely structural "part of" one. This is needed for polycistronic genes. See "PATHOLOGICAL CASES" for further discussion.
+	Note
+		A free text note.
+	Dbxref
+		A database cross reference. See the section "Ontology Associations and Db Cross References" for details on the format.
+	Ontology_term
+		A cross reference to an ontology term. See the section "Ontology Associations and Db Cross References" for details.
+	Is_circular
+		A flag to indicate whether a feature is circular. See extended discussion below.
+'''
+
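+# Example of the row layout this script emits (values are illustrative):
+#   seqid                              source     type  start  end    score    strand  phase  attributes
+#   gene_1|gnl|BL_ORD_ID|39|hsp_num:0  RGI:3.1.2  CDS   67821  68645  3.82304  -       .      name=SHV-12;alias=ARO:3001071
+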
+def create_gff3(afile,orf):
+	try:
+		with open(afile, 'r') as f:
+			data = json.load(f)
+		f.close()
+	
+	except ValueError:
+		print>>sys.stderr, "create_gff3_file expects a file containing a VALID JSON string."
+		exit()
+	
+	with open("gff3.txt", "w") as af:
+		writer = csv.writer(af, delimiter='\t', dialect='excel')
+		writer.writerow(['##gff-version 3'])
+		#writer.writerow(["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"])
+		for item in data:
+			minevalue = False
+			maxpercent = 0
+			startCompare = False
+			minARO = 0
+			minARO_id = 0
+			hitID = ""
+			geneID = ""
+			AROlist = {}
+
+			if item not in ["_metadata","data_type"]:
+				geneID = item
+				for it in data[item]:
+
+					idenPercent = float(data[item][it]["max-identities"]) / len(data[item][it]["query"])
+					# sort results by minimum e-value and maximum percent identity
+					AROlist[str(data[item][it]["ARO_name"])] = data[item][it]["ARO_accession"]
+
+					if startCompare:
+						if minevalue > data[item][it]["evalue"] and float(maxpercent) < float(idenPercent):
+							minevalue = data[item][it]["evalue"]
+							maxpercent = float(idenPercent)
+							minARO = data[item][it]["ARO_name"]
+							minARO_id = data[item][it]["ARO_accession"]
+							SequenceFromBroadStreet = data[item][it]["SequenceFromBroadStreet"]
+
+							if "orf_prot_sequence" in data[item][it]:
+								predictedProtein = data[item][it]["orf_prot_sequence"]
+							if "hsp_num:" in it:
+								hitID = it
+
+					else:
+						startCompare = True
+						minevalue = data[item][it]["evalue"]
+						maxpercent = float(idenPercent)
+						minARO = data[item][it]["ARO_name"]
+						minARO_id = data[item][it]["ARO_accession"]
+						SequenceFromBroadStreet = data[item][it]["SequenceFromBroadStreet"]
+
+						if "orf_prot_sequence" in data[item][it]:
+							predictedProtein = data[item][it]["orf_prot_sequence"]
+						if "hsp_num:" in it:
+							hitID = it
+
+				_seqid = str(geneID) + "|" + str(hitID)
+				_source = "RGI:3.1.2"	
+				_type = "CDS"
+				_phase = "."
+				_attributes = "name=" + str(minARO)+";alias=" + "ARO:"+str(minARO_id)
+				_score = str(minevalue)
+
+				if orf == "genemark":
+					_start = str(int(findnthbar(item, 4))-1)
+					_end = str(int(findnthbar(item, 5))-1)
+					_strand = str(findnthbar(item, 3))				
+				else:
+					_start = str(int(findnthbar2(item, 1))-1)
+					_end = str(int(findnthbar2(item, 2))-1)
+					_strand = str(findnthbar2(item, 3))
+
+				writer.writerow([_seqid, _source, _type, _start, _end, _score, _strand, _phase, _attributes])
+	af.close()
+
+#output a particular field from alignment.Title by splitting it on '|'
+def findnthbar(bunchstr, start):
+
+	barc = 0
+	over = start+1
+	temp = ""
+
+	for eachc in bunchstr:
+		if eachc == '|':
+			barc += 1
+		if barc == start:
+			if eachc == '|':
+				pass
+			else:
+				temp += eachc
+		if barc == over:
+			break		
+
+	return temp
+
+#output a particular field from alignment.Title by splitting it on '#'
+def findnthbar2(bunchstr, n):
+	arr = bunchstr.split("#")
+	if n < len(arr):
+		# gene id
+		if n == 1 and arr[n]:
+			return int(arr[n])
+		elif n == 2:
+			return int(arr[n])
+		elif n == 3:
+			 if int(arr[n]) == 1:
+			 	# positive
+			 	return "+"
+			 else:
+			 	# neg
+			 	return "-"
+		else:
+			return arr[n]
+	else:
+		return ""
+
+def main(afile,orf):
+	create_gff3(afile,orf)
+
+
+if __name__ == '__main__':
+	# required: sys.argv[1] must be a JSON report file; sys.argv[2] names the ORF caller
+	main(sys.argv[1],sys.argv[2])
\ No newline at end of file
diff --git a/filepaths.py b/filepaths.py
new file mode 100644
index 0000000..c96179f
--- /dev/null
+++ b/filepaths.py
@@ -0,0 +1,13 @@
+import sys
+import os
+
+def determine_path():
+    try:
+        root = __file__
+        if os.path.islink(root):
+            root = os.path.realpath(root)
+        return os.path.dirname(os.path.abspath(root))
+    except NameError:
+        print "There is no __file__ variable; cannot determine the script path."
+        sys.exit()
\ No newline at end of file
diff --git a/formatJson.py b/formatJson.py
new file mode 100755
index 0000000..7fe9969
--- /dev/null
+++ b/formatJson.py
@@ -0,0 +1,26 @@
+import os
+import sys
+import filepaths
+import argparse
+import json
+
+def main(args):
+	if args.in_file is None:
+		print "[error] missing input(s)"
+		print "[info] Try: python formatJson.py -h"
+		exit()
+
+	infile = args.in_file
+	outfile = args.out_file
+
+	# pretty-print the JSON in-process instead of shelling out to json.tool
+	with open(infile) as f:
+		data = json.load(f)
+	with open(outfile + ".json", "w") as out:
+		json.dump(data, out, indent=4, sort_keys=True)
+
+def run():
+	parser = argparse.ArgumentParser(description='Convert RGI JSON file to Readable JSON file')
+	parser.add_argument('-i','--in_file',help='input file must be in JSON format, e.g. Report.json')
+	parser.add_argument('-o', '--out_file',  dest="out_file", default="ReportFormatted", help="Output JSON file (default=ReportFormatted)")	
+	args = parser.parse_args()
+	main(args)		
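+# Usage (file names illustrative):
+#   python formatJson.py -i Report.json -o ReportFormatted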
+
+if __name__ == '__main__':
+	run()
diff --git a/fqToFsa.py b/fqToFsa.py
new file mode 100755
index 0000000..549e4fe
--- /dev/null
+++ b/fqToFsa.py
@@ -0,0 +1,20 @@
+import sys
+import os
+import filepaths
+from Bio import SeqIO
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+
+path = script_path
+
+def convertFqToFsa(fq_path,fa_path):
+	SeqIO.convert(fq_path, "fastq", fa_path, "fasta")
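+# e.g. convertFqToFsa("reads.fq", "reads.fq.read.fsa") rewrites the FASTQ
+# as FASTA in one Biopython call (hypothetical file names)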
+
+def main(argvfile):
+	file_name = os.path.basename(argvfile)
+	convertFqToFsa(argvfile,working_directory+'/'+file_name+'.read.fsa')
+
+
+if __name__ == "__main__":
+    main(sys.argv[1])
diff --git a/load.py b/load.py
new file mode 100644
index 0000000..af6683b
--- /dev/null
+++ b/load.py
@@ -0,0 +1,45 @@
+import os, sys
+import shutil
+import filepaths
+import json
+import argparse
+
+# this script is used to load new card.json file to system wide package
+script_path = filepaths.determine_path()
+
+def validateFile(filename):
+	try:
+		with open(filename) as f:
+			out = json.load(f)
+		return out
+
+	except ValueError as e:
+		print>>sys.stderr, ('[error] invalid json: %s' % e)
+		return None # or: raise
+
+def main(args):
+	filepath = args.afile
+	if os.path.exists(filepath):
+		if validateFile(filepath):
+			dst = script_path+"/_data/card.json"
+			try:
+				# copy new card.json file
+				shutil.copyfile(filepath, dst)
+				print>>sys.stderr, "[success] file copied ok"
+			except Exception as e:
+				print e
+				print>>sys.stderr, "[error] failed to copy json file"
+		else:
+			print>>sys.stderr, "[error] failed to read json file"
+	else:
+		print>>sys.stderr,"[error] failed to upload file"
+
+def run():
+	parser = argparse.ArgumentParser(description='Load card database json file')
+	parser.add_argument('-i', '--afile',help='must be a card database json file')
+	args = parser.parse_args()
+	main(args)
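+# Usage (path illustrative):
+#   python load.py --afile /path/to/card.json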
+
+if __name__ == "__main__":
+	run()
diff --git a/rgi.py b/rgi.py
new file mode 100755
index 0000000..d9bcf67
--- /dev/null
+++ b/rgi.py
@@ -0,0 +1,1470 @@
+import json
+import sys
+import os
+import fqToFsa
+import contigToProteins
+import contigToORF
+#import blastnsnp
+import argparse
+import filepaths
+import gzip, zlib
+from Bio.Seq import Seq
+from Bio.Alphabet import generic_dna
+import logging
+import hashlib
+import time
+import convertJsonToTSV
+import csv
+import subprocess
+import shutil
+import datetime
+import clean as cln
+#import splitter
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+
+clean_files = []
+
+path = script_path+"/_db/"
+data_path = script_path+"/_data/"
+tmp = script_path+"/_tmp/"
+
+#remove temporary file
+def removeTemp():
+	for tempfile in clean_files:
+		if os.path.isfile(tempfile):
+			logging.info("removeTemp => remove temp file: " + str(tempfile))
+			os.remove(tempfile)
+
+
+# check if a string is a float number
+def isfloat(value):
+  try:
+    float(value)
+    return True
+  except ValueError:
+    return False
+
+
+# TODO:: re-write this function
+#extract a particular field from alignment.Title by splitting it on '|'
+def findnthbar(bunchstr, n):
+	# pad titles that carry neither a GeneMark ORF prefix nor a model
+	# description, so the '|' count lines up across title styles
+	if not bunchstr.startswith("gene_") and "model_type_id" not in bunchstr:
+		bunchstr = "_|_|_ "+ bunchstr
+
+	barc = 0
+	start = n+3
+	over = n+4
+	temp = ""
+	for eachc in bunchstr:
+		if eachc == '|':
+			barc += 1
+		if barc == start:
+			temp += eachc
+		if barc == over:
+			break
+	colonpos = temp.find(':')
+	res=temp[colonpos+2:]
+	res=res.rstrip()
+	
+	if res.isdigit():
+		return int(res)
+	elif isfloat(res):
+		return float(res)
+	else:
+		res = res.encode('ascii','replace')
+		return res
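+# e.g. with a BLAST-style title "gnl|BL_ORD_ID|0 2_1 | model_type_id: 40292 | pass_evalue: 1e-30"
+# (illustrative), findnthbar(title, 0) returns 40292 and findnthbar(title, 1)
+# returns 1e-30: field n is read after bar 3+n and cast to int/float when numeric.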
+
+# TODO:: re-write this function
+#extract a particular field from alignment.Title by splitting it on '#'
+def findnthbar2(bunchstr, n):
+
+	if "#" in bunchstr:
+		arr = bunchstr.split("#")
+		if n < len(arr):
+			# gene id
+			if n == 1:
+				return int(arr[n])
+			elif n == 2:
+				return int(arr[n])
+			elif n == 3:
+				if int(arr[n]) == 1:
+					# positive strand
+					return "+"
+				else:
+					# negative strand
+					return "-"
+			else:
+				return arr[n]
+		else:
+			return ""
+	else:
+		return 0
+
+def checkBeforeBlast(inType, inputSeq):
+	logging.info("checkBeforeBlast => " + str([inType,inputSeq]))
+	# exit with an error unless inType is one of the supported input types
+	if inType not in ('protein', 'contig', 'read'):
+		print>>sys.stderr, "inType must be one of protein, contig or read."
+		logging.error("checkBeforeBlast => " + str(inType) + ", inType must be one of protein, contig or read.")
+		removeTemp()
+		exit()
+	logging.info("checkBeforeBlast => success")
+
+
+def checkKeyExisted(key, my_dict):
+	try:
+		nonNone = my_dict[key] is not None
+	except KeyError:
+		nonNone = False
+	return nonNone
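+# e.g. checkKeyExisted("snp", {"snp": None}) -> False, while
+# checkKeyExisted("snp", {"snp": {}}) -> True: the key must exist and be non-None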
+
+def loadDatabase(card_dir):
+	# check the path before calling string methods on it
+	if card_dir == None:
+		exit("Error: no new card path")
+	else:
+		card_dir = card_dir.rstrip("/")
+		print "source_path: ", card_dir
+		"""
+		verify that we have the following files at the specified location:
+
+		# CARD json data file
+		- card.json
+
+		# diamond blast
+		- protein.db.dmnd
+
+		# ncbi blast
+		- protein.db.phr
+		- protein.db.pin
+		- protein.db.psq
+
+		# Protein fasta file
+		- proteindb.fsa
+
+		"""
+		needed_files = ['card.json','proteindb.fsa','protein.db.dmnd','protein.db.phr','protein.db.pin','protein.db.psq']
+		found_files = []
+
+		files = os.listdir(card_dir)
+
+		for f in files:
+			if not f.startswith('.'):
+				found_files.append(f)
+
+		missing_files = list(set(needed_files) - set(found_files))
+
+		if len(missing_files) > 0 :
+		 	print "Error: missing database files", missing_files
+		 	exit()
+
+		# Files found - move files into _data and _db directories
+		for _file in os.listdir(card_dir):
+			if not _file.startswith('.') and _file in needed_files:
+				src_path = str(card_dir)+"/"+str(_file)
+				dst_path = ""
+				if _file in ['card.json']:
+					dst_path = data_path+str(_file)
+				else:
+					dst_path = path+str(_file)
+				print "copy", src_path, " to ", dst_path
+				shutil.copy2(src_path,dst_path)
+
+		"""
+		logging.info("new card_json: " + str(card_json))
+		logging.info("loadDatabase => copy " + str(card_json) + " to "+ str(data_path))
+		subprocess.call(['python', script_path + '/load.py', '--afile', card_json])
+		logging.info("loadDatabase => clean old databases")
+		subprocess.call(['python', script_path + '/clean.py'])
+		"""
+		
+'''e-value function'''
+def writeFASTAfromJson():
+	noSeqList = []
+
+	if os.path.isfile(path+"proteindb.fsa") == False:
+		with open(data_path+"card.json") as json_file:
+			json_data = json.load(json_file)
+			with open (path+'proteindb.fsa', 'w') as wp:
+
+				for item in json_data:
+					if item.isdigit(): #get rid of __comment __timestamp etc
+						# model_type: blastP only (pass_evalue)
+						# model_type: protein homolog model
+						if json_data[item]["model_type_id"] == "40292":
+							pass_eval = 1e-30
+							if checkKeyExisted("model_param", json_data[item]):
+								if checkKeyExisted("blastp_evalue", json_data[item]["model_param"]):
+									pass_eval = json_data[item]["model_param"]["blastp_evalue"]["param_value"]
+							
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wp, ('>' + item + '_' + seqkey + " | model_type_id: 40292" + " | pass_evalue: " + str(pass_eval))
+									print>>wp, (json_data[item]["model_sequences"]["sequence"][seqkey]["protein_sequence"]["sequence"])
+
+							else:
+								 noSeqList.append(item)
+
+						# model_type: blastP + SNP (pass_evalue + snp)
+						# model_type: protein variant model
+						elif json_data[item]["model_type_id"] == "40293":
+							snpList = ""
+							pass_eval = 1e-30
+							if checkKeyExisted("model_param", json_data[item]):
+								if checkKeyExisted("blastp_evalue", json_data[item]["model_param"]):
+									pass_eval = json_data[item]["model_param"]["blastp_evalue"]["param_value"]
+									
+								if checkKeyExisted("snp", json_data[item]['model_param']):
+									for key in json_data[item]['model_param']['snp']['param_value']:
+										snpList += json_data[item]['model_param']['snp']['param_value'][key]
+										snpList += ','
+							
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wp, ('>' + item + '_' + seqkey + " | model_type_id: 40293" + " | pass_evalue: " + str(pass_eval) + " | SNP: " + snpList)
+									print>>wp, (json_data[item]["model_sequences"]["sequence"][seqkey]["protein_sequence"]["sequence"])
+							else:
+								 noSeqList.append(item)
+			wp.close()
+		json_file.close()
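+# FASTA headers emitted above take the form (ids and values illustrative):
+#   >2_1 | model_type_id: 40292 | pass_evalue: 1e-30
+#   >2_1 | model_type_id: 40293 | pass_evalue: 1e-30 | SNP: R234K,
+# findnthbar() below recovers individual fields from these titles by
+# counting '|' separators.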
+
+
+'''bit-score function'''
+'''
+def writeFASTAfromJson():
+	noSeqList = []
+
+	if os.path.isfile(path+"proteindb.fsa") == False:
+		with open(data_path+"card.json") as json_file:
+			json_data = json.load(json_file)
+			with open (path+'proteindb.fsa', 'w') as wp:
+
+				for item in json_data:
+					if item.isdigit(): #get rid of __comment __timestamp etc
+						# model_type: protein homolog model
+						if json_data[item]["model_type_id"] == "40292":
+							pass_bit_score = 0
+							if checkKeyExisted("model_param", json_data[item]):
+								if checkKeyExisted("blastp_bit_score", json_data[item]["model_param"]):
+									pass_bit_score = json_data[item]["model_param"]["blastp_bit_score"]["param_value"]
+							
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wp, ('>' + item + '_' + seqkey + " | model_type_id: 40292" + " | pass_bit_score: " + str(pass_bit_score)) + " | " + json_data[item]["ARO_name"] 
+									print>>wp, (json_data[item]["model_sequences"]["sequence"][seqkey]["protein_sequence"]["sequence"])
+
+							else:
+								noSeqList.append(item)							
+
+						# model_type: protein variant model
+						elif json_data[item]["model_type_id"] == "40293":
+							snpList = ""
+							pass_bit_score = 0
+							if checkKeyExisted("model_param", json_data[item]):
+								if checkKeyExisted("blastp_bit_score", json_data[item]["model_param"]):
+									pass_bit_score = json_data[item]["model_param"]["blastp_bit_score"]["param_value"]
+									
+								if checkKeyExisted("snp", json_data[item]['model_param']):
+									for key in json_data[item]['model_param']['snp']['param_value']:
+										snpList += json_data[item]['model_param']['snp']['param_value'][key]
+										snpList += ','
+							
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wp, ('>' + item + '_' + seqkey + " | model_type_id: 40293" + " | pass_bit_score: " + str(pass_bit_score) + " | SNP: " + snpList) + " | " + json_data[item]["ARO_name"]
+									print>>wp, (json_data[item]["model_sequences"]["sequence"][seqkey]["protein_sequence"]["sequence"])
+							else:
+								 noSeqList.append(item)
+							
+						# model_type: presence and absence of protein variant model
+						"""
+						elif json_data[item]["model_type_id"] == "41091":
+							snpList = ""
+							pass_bit_score = 0
+							if checkKeyExisted("model_param", json_data[item]):
+								if checkKeyExisted("blastp_bit_score", json_data[item]["model_param"]):
+									pass_bit_score = json_data[item]["model_param"]["blastp_bit_score"]["param_value"]
+									
+								if checkKeyExisted("snp", json_data[item]['model_param']):
+									for key in json_data[item]['model_param']['snp']['param_value']:
+										snpList += json_data[item]['model_param']['snp']['param_value'][key]
+										snpList += ','
+							
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wp, ('>' + item + '_' + seqkey + " | model_type_id: 41091" + " | pass_bit_score: " + str(pass_bit_score) + " | SNP: " + snpList) + " | " + json_data[item]["ARO_name"]
+									print>>wp, (json_data[item]["model_sequences"]["sequence"][seqkey]["protein_sequence"]["sequence"])
+							else:
+								 noSeqList.append(item)
+						"""
+
+
+			wp.close()
+		json_file.close()
+'''
+
+
+#make protein (and dna if needed) database:
+def makeBlastDB(verbose):
+	if os.path.isfile(path+"proteindb.fsa") == True and os.path.exists(path+"proteindb.fsa") == True  and os.path.exists(path+"protein.db.phr") == True and os.path.exists(path+"protein.db.pin") == True and os.path.exists(path+"protein.db.psq") == True :
+		logging.info("makeBlastDB => blast DB exists")
+	else:
+		if verbose == "on":
+			logging.info("makeBlastDB => create blast DB.")
+			os.system('makeblastdb -in '+path+'proteindb.fsa -dbtype prot -out '+path+'protein.db 2>&1 >> ' + logging.getLoggerClass().root.handlers[0].baseFilename)
+		else:
+			os.system('makeblastdb -in '+path+'proteindb.fsa -dbtype prot -out '+path+'protein.db > /dev/null 2>&1')
+
+def makeDiamondDB(verbose):
+	if os.path.isfile(path+"proteindb.fsa") == True and os.path.exists(path+"proteindb.fsa") == True  and os.path.exists(path+"protein.db.dmnd") == True:
+		logging.info("makeDiamondDB => diamond DB exists")
+	else:
+		if verbose == "on":
+			logging.info("makeDiamondDB => create diamond DB.")
+			os.system('diamond makedb --quiet --in '+path+'proteindb.fsa --db '+path+'protein.db 2>&1 >> ' + logging.getLoggerClass().root.handlers[0].baseFilename)
+		else:
+			os.system('diamond makedb --quiet --in '+path+'proteindb.fsa --db '+path+'protein.db')
+
+def dd(parsed):
+	return json.dumps(parsed, indent=4, sort_keys=True)
+
+def getORFDNASequence(file_name,orf,inType):
+	predicted_genes_dict = {}
+	if inType in ["contig"]:
+		if os.stat(working_directory+"/"+file_name+".contigToORF.fsa").st_size != 0:
+			from Bio import SeqIO
+			clean_files.append(working_directory+"/"+file_name+".contigToORF.fsa")
+			for record in SeqIO.parse(working_directory+"/"+file_name+".contigToORF.fsa", 'fasta'):
+				if orf == "genemark" and '|' in record.id:
+					predicted_genes_dict[record.id[:str(record.id).index('|')]] = str(record.seq)
+				else:
+					predicted_genes_dict[record.id] = str(record.seq)
+	else:
+		if os.stat(working_directory+"/"+file_name+".read.fsa").st_size != 0:
+			from Bio import SeqIO
+			clean_files.append(working_directory+"/"+file_name+".read.fsa")
+			for record in SeqIO.parse(working_directory+"/"+file_name+".read.fsa", 'fasta'):
+				if orf == "genemark" and '|' in record.id:
+					predicted_genes_dict[record.id[:str(record.id).index('|')]] = str(record.seq)
+				else:
+					predicted_genes_dict[record.id] = str(record.seq)		
+	
+	# write json for all predicted file
+	pjson = json.dumps(predicted_genes_dict)
+	clean_files.append(working_directory+'/'+file_name+'.predictedGenes.json')
+	with open(working_directory+'/'+file_name+'.predictedGenes.json', 'w') as wf:
+		print>>wf, pjson
+	wf.close()
+
+	return predicted_genes_dict
+
+def getSubmittedProteinSequence(afile):
+	submitted_proteins_dict = {}
+	if os.stat(afile).st_size != 0:
+		from Bio import SeqIO
+		for record in SeqIO.parse(afile, 'fasta'):
+			submitted_proteins_dict[record.id] = str(record.seq)
+
+	return submitted_proteins_dict
+
+def runDiamond(inType, inputSeq, threads, outputFile, verbose, num_sequences):
+
+	cfilter = "	--index-chunks 1 --block-size 1 --quiet --more-sensitive"
+ 
+	if verbose == "on":
+	   cfilter = cfilter + " --log 2>&1 >> " + logging.getLoggerClass().root.handlers[0].baseFilename
+
+	if inType == 'read':
+		logging.info('runDiamond => diamond blastx --in '+path+'proteindb.fsa --db '+path+'protein.db'+' --query '+inputSeq+' --outfmt 5 --out '+outputFile+' --threads '+threads+' --salltitles '+cfilter)
+		os.system('diamond blastx --in '+path+'proteindb.fsa --db '+path+'protein.db'+' --query '+inputSeq+' --outfmt 5 --out '+outputFile+' --threads '+threads+' --salltitles '+cfilter)
+	elif inType == 'contig':
+		exit("Error : contigs")
+	else:
+		logging.info('runDiamond => diamond blastp --in '+path+'proteindb.fsa --db '+path+'protein.db'+' --query '+inputSeq+' --outfmt 5 --out '+outputFile+' --threads '+threads+' --salltitles '+cfilter)
+		os.system('diamond blastp --in '+path+'proteindb.fsa --db '+path+'protein.db'+' --query '+inputSeq+' --outfmt 5 --out '+outputFile+' --threads '+threads+' --salltitles '+cfilter)
+   
+
+def getHashName(name):
+	m = hashlib.md5()
+	t = time.gmtime()
+	m.update(name + str(t))
+	return m.hexdigest()
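+# Note: the digest also hashes time.gmtime(), so ids are not reproducible
+# across runs; encodeHeaders() below writes a JSON mapping so the original
+# headers can be recovered.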
+
+def encodeHeaders(fastaFile):
+	_fastaFile = os.path.basename(fastaFile)
+	headers_dict = {}
+	ofile = open("my_fasta.txt", "w")
+	from Bio import SeqIO
+	for record in SeqIO.parse(fastaFile, 'fasta'):
+		new_header = getHashName(record.id)
+		headers_dict[new_header] = record.id
+		ofile.write(">" + new_header + "\n" + str(record.seq) + "\n")
+	ofile.close()
+	os.rename("my_fasta.txt", fastaFile)
+	# write dictionary for encoded headers
+	headers_dict_json = json.dumps(headers_dict)
+	clean_files.append(working_directory+"/"+_fastaFile+".encodedHeaders.json")
+	with open(working_directory+"/"+_fastaFile+".encodedHeaders.json", 'w') as f:
+	 	print>>f, headers_dict_json
+	f.close()	
+
+def validateFastA(fastaFile):
+
+	ofile = open(fastaFile, "r")
+	instream = ofile.readlines()
+	ofile.close()
+
+	lastchar = instream[len(instream) -1]
+
+	if lastchar.endswith('\n'):
+		logging.info("validateFastA => fasta file looks good")
+	else:
+		logging.error("validateFastA => invalid fasta, missing newline at the end of fasta file ("+fastaFile+")")
+		print>>sys.stderr, "[error] invalid fasta, missing newline at the end of fasta file ("+fastaFile+")"
+		exit()	
+
+# TODO:: validate fastq files
+def validateFastQ(afile):
+	logging.info("validateFastQ => TODO:: validate fastq file")
+
+
+def runBlast(args, inType, inputSeq, threads, outputFile, criteria, data_type, clean, orf, alignment_tool,verbose, num_sequences):	
+	cmd = str(args)
+	t0 = time.time()
+	pjson = {}
+	startBlast = False
+	predicted_genes_dict = {}
+	submitted_proteins_dict = {}
+
+	# encode each header as an md5 hash and store the mapping in a dictionary -
+	# this avoids headers containing characters that alignment tools treat as delimiters
+	#encodeHeaders(inputSeq)
+	file_name = os.path.basename(inputSeq)
+	clean_files.append(working_directory+"/"+file_name+".blastRes.xml")
+
+	if inType == 'contig':
+		if orf == "genemark":
+			logging.info("runBlast => contigToProteins => start")
+			contigToProteins.main(inputSeq,clean)
+			logging.info("runBlast => contigToProteins => done")
+		
+			# get predicted dna
+			logging.info("runBlast => contigToORF => start")
+			contigToORF.main(inputSeq,clean,orf)
+			logging.info("runBlast => contigToORF => done")
+
+			logging.info("runBlast => getORFDNASequence => start")
+			predicted_genes_dict = getORFDNASequence(file_name,orf,inType)
+			logging.info("runBlast => getORFDNASequence => done")
+		else:
+			# logging.info("runBlast => contigToProteins => start")
+			# contigToProteins.main(inputSeq,clean)
+			# logging.info("runBlast => contigToProteins => done")
+			
+			# get predicted dna
+			logging.info("runBlast => contigToORF => start")
+			contigToORF.main(inputSeq,clean,orf)
+			logging.info("runBlast => contigToORF => done")
+
+			logging.info("runBlast => getORFDNASequence => start")
+			predicted_genes_dict = getORFDNASequence(file_name,orf,inType)
+			logging.info("runBlast => getORFDNASequence => done")			
+
+		if os.stat(working_directory+"/"+file_name+".contig.fsa").st_size != 0:
+
+			logging.info("runBlast => start blastP for inType: " + inType)
+			clean_files.append(working_directory+"/"+file_name+".contig.fsa")
+
+			if alignment_tool == "diamond":
+				runDiamond("protein", working_directory+"/"+file_name+".contig.fsa", threads, working_directory+"/"+file_name+".blastRes.xml",verbose, num_sequences)
+				#runDiamond(inType, working_directory+"/"+file_name+".contigToORF.fsa", threads, working_directory+"/"+file_name+".blastRes.xml")
+				#fasta.contigToORF.fsa
+			else:
+				from Bio.Blast.Applications import NcbiblastpCommandline
+				blastCLine = NcbiblastpCommandline(query=working_directory+"/"+file_name+".contig.fsa", db=path+"protein.db", outfmt=5, out=working_directory+"/"+file_name+".blastRes.xml",num_threads=threads)
+				stdt, stdr = blastCLine()
+
+			result_handle = open(working_directory+"/"+file_name+".blastRes.xml")
+			startBlast = True
+
+	elif inType == 'protein':
+
+		submitted_proteins_dict = getSubmittedProteinSequence(inputSeq)
+		logging.info("runBlast => start blastP for inType: " + inType)
+
+		if alignment_tool == "diamond":
+			runDiamond(inType, inputSeq, threads, working_directory+"/"+file_name+".blastRes.xml",verbose, num_sequences)
+		else:
+			from Bio.Blast.Applications import NcbiblastpCommandline
+			blastCLine = NcbiblastpCommandline(query=inputSeq, db=path+"protein.db", outfmt=5, out=working_directory+"/"+file_name+".blastRes.xml",num_threads=threads)
+			stdt, stdr = blastCLine()
+
+		result_handle = open(working_directory+"/"+file_name+".blastRes.xml")
+		startBlast = True
+
+	elif inType == 'read':
+		logging.info("runBlast => start blastX for inType: " + inType)
+		logging.info("runBlast => fqToFsa => start")
+		fqToFsa.main(inputSeq)
+		logging.info("runBlast => fqToFsa => done")
+		clean_files.append(working_directory+"/"+file_name+".read.fsa")
+		#files = splitter.main(working_directory+"/"+file_name+".read.fsa")
+		#print " files : ", files
+		"""
+		# get predicted dna
+		logging.info("runBlast => contigToORF => start")
+		contigToORF.main(working_directory+"/"+file_name+".read.fsa",clean,orf)
+		logging.info("runBlast => contigToORF => done")
+		file_name = os.path.basename(working_directory+"/"+file_name+".read.fsa")
+
+		logging.info("runBlast => getORFDNASequence => start")
+		predicted_genes_dict = getORFDNASequence(file_name,orf,inType)
+		logging.info("runBlast => getORFDNASequence => done")
+		"""	
+
+		if alignment_tool == "diamond":
+			runDiamond(inType, working_directory+"/"+file_name+".read.fsa", threads, working_directory+"/"+file_name+".blastRes.xml",verbose,num_sequences)
+		else:
+			from Bio.Blast.Applications import NcbiblastxCommandline
+			blastCLine = NcbiblastxCommandline(query=working_directory+"/"+file_name+".read.fsa", db=path+"protein.db", outfmt=5, out=working_directory+"/"+file_name+".blastRes.xml",num_threads=threads)
+			stdt, stdr = blastCLine()
+		result_handle = open(working_directory+"/"+file_name+".blastRes.xml")
+		startBlast = True
+
+	if os.path.isfile(working_directory+"/"+file_name+".blastRes.xml"):
+		# Check if we have any results
+		if os.stat(working_directory+"/"+file_name+".blastRes.xml").st_size == 0:
+			return pjson
+	else:
+		print "No results"
+		return pjson
+
+	if startBlast:
+		logging.info("runBlast => startBlast = " + str(startBlast))
+		from Bio.Blast import NCBIXML
+		blast_records = NCBIXML.parse(result_handle)
+		blastResults = {}
+		pjson = {}
+
+		with open(data_path+"card.json") as json_file:
+			json_data = json.load(json_file)
+		json_file.close()
+
+		for blast_record in blast_records:
+			perfect = {}
+			strict = {}
+			loose = {}
+
+			for alignment in blast_record.alignments:		
+				alignTitle = alignment.title
+				orfInfo = blast_record.query.encode('ascii','replace')
+
+				c = 0
+				barc = 0
+				for eachc in orfInfo:
+					if barc >= 6:
+						break
+					elif eachc == '|':
+						barc += 1
+						c += 1
+					else:
+						c += 1
+				orffrom = orfInfo[c:]
+
+				modelTypeID = findnthbar(alignTitle, 0)
+				spacepos = alignTitle.index(' ')
+				hitid = alignTitle[0:spacepos]
+				hitid = hitid.encode('ascii','replace')
+
+				modelDescrpt = alignTitle[alignTitle.index(' ')+1:]
+				underscoreinMD = modelDescrpt.index('_')
+				modelID = modelDescrpt[0:underscoreinMD]
+				seqinModel = modelDescrpt[underscoreinMD+1: modelDescrpt.index(' ')]
+
+				modelTypeDscp = alignTitle[alignTitle.index(':')+2:]
+				modelTypeId = modelTypeDscp[0:modelTypeDscp.index(' ')]
+				passevalue = modelTypeDscp[modelTypeDscp.index(':')+2:]
+
+
+				if isfloat(passevalue):
+					truePassEvalue = passevalue
+				else:
+					truePassEvalue = float(passevalue[0:passevalue.find(' ')])
+
+				logging.info("runBlast => [info] | modelTypeID = " + str(alignTitle))
+
+				if modelTypeID == 40293:
+					
+					init = 0
+					evalueSNP = findnthbar(alignTitle, 2)
+					snpL = []
+					snpdictlist = []
+					temp = ""
+
+					for eachc in evalueSNP:
+						if eachc == ',':
+							snpL.append(temp)
+							temp = ""
+						else:
+							temp += eachc
+
+					for eachsnp in snpL:					
+						snpdictlist.append({"original": eachsnp[0], "change": eachsnp[-1], "position": int(eachsnp[1:-1])})
+
+					for hsp in alignment.hsps:
+						'''print>>sys.stderr, hsp.identities
+						print>>sys.stderr, len(hsp.match)
+						print "hsp.query: \n" +hsp.query + "\n"
+						print "hsp.sbj: \n" +hsp.sbjct + "\n"'''
+
+						querySeq =  hsp.query.replace('-', '')
+						realQueryLength = len(querySeq) 
+
+						sbjctSeq = hsp.sbjct.replace('-', '') 
+						realSbjctLength = len(sbjctSeq) 
+						sinsidedict = {}
+						'''
+						print "QuerySeq: \n" +querySeq + "\n"
+						print "sbjctSeq: \n" +sbjctSeq + "\n"
+						print "realQueryLength: \n" + str(realQueryLength) + "\n"
+						print "realSbjctLength: \n" +str(realSbjctLength) + "\n"
+						print "subject_start: \n" + str(hsp.sbjct_start) + "\n"
+						print "query_start: \n" + str(hsp.query_start) + "\n"
+						'''
+
+
+						for eachs in snpdictlist:
+							pos = eachs["position"]
+							ori = eachs["original"]
+							chan = eachs["change"]
+
+							if hsp.sbjct_start < pos and (hsp.sbjct_start + realSbjctLength) > pos:
+								'''print>>sys.stderr, eachs
+								print>>sys.stderr, hsp.sbjct
+								print>>sys.stderr, hsp.sbjct_start
+								print>>sys.stderr, ("wrong snp in query: " + hsp.query[pos - hsp.sbjct_start])
+								print>>sys.stderr, hsp.query'''
+								'''c = 0
+								target = pos - hsp.sbjct_start
+								snpInQuery = 0
+
+								for ch in hsp.sbjct:
+									if c == target:
+										if ch == '-':
+											snpInQuery += 1
+										else:
+											break
+									elif ch == '-':
+										snpInQuery += 1
+									else:
+										snpInQuery += 1
+										c += 1'''
+								'''print>>sys.stderr, ("corret: snp in sbject: " + sbjctSeq[target])
+								print>>sys.stderr, ("correct: snp in query: " + hsp.query[snpInQuery])'''
+
+								card_sequence = str(json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"])
+								orf_protein_sequence = ""
+ 
+								if orf == "genemark":
+									if predicted_genes_dict:
+										orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+									if submitted_proteins_dict:
+										orf_protein_sequence = str(submitted_proteins_dict[orfInfo.split(" ")[0]])
+								else:
+									if predicted_genes_dict:
+										orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+									if submitted_proteins_dict:
+										orf_protein_sequence = str(submitted_proteins_dict[orfInfo.split(" ")[0]])									
+
+								logging.info("runBlast => [info] | Model:"+str(modelID) + " pos:" +str(pos) +" | "+str(hsp.query[pos - hsp.sbjct_start + findNumDash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(chan) + " AND " + str(hsp.sbjct[pos - hsp.sbjct_start +findNumDash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(ori))
+
+								# Report ONLY if the SNPs are present
+								if hsp.query[pos - hsp.sbjct_start + findNumDash(hsp.sbjct, (pos-hsp.sbjct_start))] == chan and hsp.sbjct[pos - hsp.sbjct_start + findNumDash(hsp.sbjct, (pos-hsp.sbjct_start))] == ori:
+
+								# e.g. pos = 224 with sbjct_start = 9 indexes offset pos - sbjct_start into the HSP, corrected for subject gaps
+									if hsp.expect <= truePassEvalue:
+									#if hsp.bits >= truePassEvalue:
+										sinsidedict = {}
+										sinsidedict["type_match"] = "Strict"
+										sinsidedict["SNP"] = eachs
+										sinsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+										sinsidedict["orf_start"] = findnthbar(orfInfo, 1)
+										sinsidedict["orf_end"] = findnthbar(orfInfo, 2)
+										sinsidedict["orf_From"] = orffrom
+										
+										sinsidedict["model_name"] = json_data[modelID]["model_name"]
+										sinsidedict["model_type"] = json_data[modelID]["model_type"]
+										sinsidedict["model_type_id"] = modelTypeID
+
+										sinsidedict["model_id"] = modelID
+										sinsidedict["pass_evalue"] = passevalue
+										sinsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
+										sinsidedict["ARO_name"] = json_data[modelID]["ARO_name"]
+										sinsidedict["ARO_category"] = json_data[modelID]["ARO_category"]
+										sinsidedict["evalue"] = hsp.expect
+										sinsidedict["max-identities"] = hsp.identities
+										sinsidedict["bit-score"] = hsp.bits
+										if checkKeyExisted("NCBI_taxonomy", json_data[modelID]["model_sequences"]["sequence"][seqinModel]):
+											sinsidedict["cvterm_id"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["NCBI_taxonomy"]["NCBI_taxonomy_cvterm_id"]
+
+										sinsidedict["query"] = hsp.query.encode('ascii','replace')
+										sinsidedict["match"] = hsp.match.encode('ascii','replace')
+										sinsidedict["sequenceFromDB"] = hsp.sbjct.encode('ascii','replace')
+										sinsidedict["SequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"]
+										sinsidedict["dnaSequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["dna_sequence"]["sequence"]
+										
+										if inType == 'contig':
+											if orf == "genemark":
+												sinsidedict["query_start"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3
+												sinsidedict["query_end"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+												sinsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+												sinsidedict["orf_start"] = findnthbar(orfInfo, 1)
+												sinsidedict["orf_end"] = findnthbar(orfInfo, 2)
+												sinsidedict["orf_From"] = orffrom
+
+												if orfInfo[:orfInfo.index('|')] in predicted_genes_dict:
+													sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('|')]] 
+													sinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+												else:
+													sinsidedict["orf_dna_sequence"] = ""
+													sinsidedict["orf_prot_sequence"] = ""
+											else:
+												sinsidedict["query_start"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3
+												sinsidedict["query_end"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+												sinsidedict["orf_strand"] = findnthbar2(orfInfo, 3)
+												sinsidedict["orf_start"] = findnthbar2(orfInfo, 1)
+												sinsidedict["orf_end"] = findnthbar2(orfInfo, 2)
+												sinsidedict["orf_From"] = findnthbar2(orfInfo, 0)
+
+												if orfInfo[:orfInfo.index('#')] in predicted_genes_dict:
+													sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('#')]] 
+													sinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+												else:
+													sinsidedict["orf_dna_sequence"] = ""
+													sinsidedict["orf_prot_sequence"] = ""
+
+
+										elif inType == 'protein':
+											sinsidedict["query_start"] = hsp.query_start
+											sinsidedict["query_end"] = hsp.query_start + realQueryLength
+											sinsidedict["query_From"] = blast_record.query.encode('ascii','replace')
+											sinsidedict["orf_prot_sequence"] = orf_protein_sequence
+
+										elif inType == 'read':
+											pass
+
+										sinsidedict["perc_identity"] = float(format(float(sinsidedict["max-identities"]*100) / len(sinsidedict["query"]),'.2f'))
+
+										strict[hitid + "|hsp_num:" + str(init)] = sinsidedict
+										init += 1
+
+									else:
+										slinsidedict = {}
+										slinsidedict["type_match"] = "Loose"
+										slinsidedict["SNP"] = eachs
+										slinsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+										slinsidedict["orf_start"] = findnthbar(orfInfo, 1)					
+										slinsidedict["orf_end"] = findnthbar(orfInfo, 2)
+										slinsidedict["orf_From"] = orffrom
+
+										slinsidedict["model_name"] = json_data[modelID]["model_name"]
+										slinsidedict["model_type"] = json_data[modelID]["model_type"]
+										slinsidedict["model_type_id"] = modelTypeID
+										
+										slinsidedict["pass_evalue"] = passevalue
+										slinsidedict["model_id"] = modelID
+										slinsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
+										slinsidedict["ARO_name"] = json_data[modelID]["ARO_name"]
+										slinsidedict["ARO_category"] = json_data[modelID]["ARO_category"]
+										slinsidedict["evalue"] = hsp.expect
+										slinsidedict["bit-score"] = hsp.bits
+										slinsidedict["max-identities"] = hsp.identities
+										if checkKeyExisted("NCBI_taxonomy", json_data[modelID]["model_sequences"]["sequence"][seqinModel]):
+											slinsidedict["cvterm_id"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["NCBI_taxonomy"]["NCBI_taxonomy_cvterm_id"]
+
+										slinsidedict["query"] = hsp.query.encode('ascii','replace')
+										slinsidedict["match"] = hsp.match.encode('ascii','replace')
+										slinsidedict["sequenceFromDB"] = hsp.sbjct.encode('ascii','replace')
+										slinsidedict["SequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"]
+										slinsidedict["dnaSequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["dna_sequence"]["sequence"]
+
+
+										if inType == 'contig':
+											if orf == "genemark":
+												slinsidedict["query_start"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3
+												slinsidedict["query_end"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+												slinsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+												slinsidedict["orf_start"] = findnthbar(orfInfo, 1)
+												slinsidedict["orf_end"] = findnthbar(orfInfo, 2)
+												slinsidedict["orf_From"] = orffrom
+
+												if orfInfo[:orfInfo.index('|')] in predicted_genes_dict:
+													slinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('|')]]
+													slinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+												else:
+													slinsidedict["orf_dna_sequence"] = ""
+													slinsidedict["orf_prot_sequence"] = ""
+											else:
+												slinsidedict["query_start"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3
+												slinsidedict["query_end"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+												slinsidedict["orf_strand"] = findnthbar2(orfInfo, 3)
+												slinsidedict["orf_start"] = findnthbar2(orfInfo, 1)
+												slinsidedict["orf_end"] = findnthbar2(orfInfo, 2)
+												slinsidedict["orf_From"] = findnthbar2(orfInfo, 0)
+
+												if orfInfo[:orfInfo.index('#')] in predicted_genes_dict:
+													slinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('#')]]
+													slinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+												else:
+													slinsidedict["orf_dna_sequence"] = ""
+													slinsidedict["orf_prot_sequence"] = ""
+
+
+										elif inType == 'protein':
+											slinsidedict["query_start"] = hsp.query_start
+											slinsidedict["query_end"] = hsp.query_start + realQueryLength
+											slinsidedict["query_From"] = blast_record.query.encode('ascii','replace')
+											slinsidedict["orf_prot_sequence"] = orf_protein_sequence
+
+										elif inType == 'read':
+											pass
+
+										slinsidedict["perc_identity"] = float(format(float(slinsidedict["max-identities"]*100) / len(slinsidedict["query"]),'.2f'))
+
+										loose[hitid + "|hsp_num:" + str(init)] = slinsidedict
+										init += 1
+				
+				elif modelTypeID == 40292:
+					init = 0
+					passevalue = findnthbar(alignTitle, 1)
+					for hsp in alignment.hsps:
+
+						querySeq = hsp.query.replace('-', '')
+						realQueryLength = len(querySeq)
+
+						card_sequence = str(json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"])
+						orf_protein_sequence = ""
+						
+						if orf == "genemark":
+							if predicted_genes_dict:
+								orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+							if submitted_proteins_dict:
+								orf_protein_sequence = str(submitted_proteins_dict[orfInfo.split("#")[0]])
+						else:
+							if predicted_genes_dict:
+								if orfInfo.strip() in predicted_genes_dict.keys():
+									orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo], generic_dna).translate(table=11)).strip("*")
+								else:
+									orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+							if submitted_proteins_dict:
+								orf_protein_sequence = str(submitted_proteins_dict[orfInfo.split(" ")[0]])														
+
+						if card_sequence == orf_protein_sequence:
+							ppinsidedict = {}
+							ppinsidedict["type_match"] = "Perfect"
+							ppinsidedict["model_id"] = modelID
+							ppinsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+							ppinsidedict["orf_start"] = findnthbar(orfInfo, 1)
+							ppinsidedict["orf_end"] = findnthbar(orfInfo, 2)
+							ppinsidedict["orf_From"] = orffrom
+
+							ppinsidedict["model_name"] = json_data[modelID]["model_name"]
+							ppinsidedict["model_type"] = json_data[modelID]["model_type"]
+							ppinsidedict["model_type_id"] = modelTypeID
+
+							ppinsidedict["pass_evalue"] = passevalue
+							ppinsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
+							ppinsidedict["ARO_name"] = json_data[modelID]["ARO_name"]
+							ppinsidedict["ARO_category"] = json_data[modelID]["ARO_category"]
+							ppinsidedict["evalue"] = hsp.expect
+							ppinsidedict["bit-score"] = hsp.bits
+							ppinsidedict["max-identities"] = hsp.identities
+							if checkKeyExisted("NCBI_taxonomy", json_data[modelID]["model_sequences"]["sequence"][seqinModel]):
+								ppinsidedict["cvterm_id"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["NCBI_taxonomy"]["NCBI_taxonomy_cvterm_id"]
+							
+							ppinsidedict["query"] = hsp.query.encode('ascii','replace')
+							ppinsidedict["match"] = hsp.match.encode('ascii','replace')
+							ppinsidedict["sequenceFromDB"] = hsp.sbjct.encode('ascii','replace')
+							ppinsidedict["SequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"]
+							ppinsidedict["dnaSequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["dna_sequence"]["sequence"]
+
+							if inType == 'contig':
+								if orf == "genemark":
+									ppinsidedict["query_start"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3
+									ppinsidedict["query_end"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+									ppinsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+									ppinsidedict["orf_start"] = findnthbar(orfInfo, 1)
+									ppinsidedict["orf_end"] = findnthbar(orfInfo, 2)
+									ppinsidedict["orf_From"] = orffrom
+									if orfInfo[:orfInfo.index('|')] in predicted_genes_dict:
+										ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('|')]] 
+										ppinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+									else:
+										ppinsidedict["orf_dna_sequence"] = ""
+										ppinsidedict["orf_prot_sequence"] = ""
+
+								else:
+									ppinsidedict["query_start"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3
+									ppinsidedict["query_end"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+									ppinsidedict["orf_strand"] = findnthbar2(orfInfo, 3)
+									ppinsidedict["orf_start"] = findnthbar2(orfInfo, 1)
+									ppinsidedict["orf_end"] = findnthbar2(orfInfo, 2)
+									ppinsidedict["orf_From"] = findnthbar2(orfInfo, 0)
+
+									if orfInfo[:orfInfo.index('#')] in predicted_genes_dict:
+										ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('#')]] 
+										ppinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+									else:
+										ppinsidedict["orf_dna_sequence"] = ""
+										ppinsidedict["orf_prot_sequence"] = ""
+
+							
+							elif inType == 'protein':
+								ppinsidedict["query_start"] = hsp.query_start
+								ppinsidedict["query_end"] = hsp.query_start + realQueryLength
+								ppinsidedict["query_From"] = blast_record.query.encode('ascii','replace')
+								ppinsidedict["orf_prot_sequence"] = orf_protein_sequence
+
+							elif inType == 'read':
+								pass
+
+							ppinsidedict["perc_identity"] = float(format(float(ppinsidedict["max-identities"]*100) / len(ppinsidedict["query"]),'.2f'))
+
+							perfect[hitid + "|hsp_num:" + str(init)] = ppinsidedict
+							init += 1
+									
+						elif hsp.expect <= passevalue:
+						#elif hsp.bits >= passevalue:
+							#print " 2>> ", hsp.bits, " <= ", passevalue
+							#print hsp
+							insidedict = {}
+							insidedict["type_match"] = "Strict"
+							insidedict["orf_strand"] = findnthbar(orfInfo, 0)
+							insidedict["orf_start"] = findnthbar(orfInfo, 1)							
+							insidedict["orf_end"] = findnthbar(orfInfo, 2)
+							insidedict["orf_From"] = orffrom
+
+							insidedict["model_name"] = json_data[modelID]["model_name"]
+							insidedict["model_type"] = json_data[modelID]["model_type"]
+							insidedict["model_type_id"] = modelTypeID
+
+							insidedict["model_id"] = modelID
+							insidedict["pass_evalue"] = passevalue
+							insidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
+							insidedict["ARO_name"] = json_data[modelID]["ARO_name"]
+							insidedict["ARO_category"] = json_data[modelID]["ARO_category"]
+							insidedict["evalue"] = hsp.expect
+							insidedict["bit-score"] = hsp.bits
+							insidedict["max-identities"] = hsp.identities
+							if checkKeyExisted("NCBI_taxonomy", json_data[modelID]["model_sequences"]["sequence"][seqinModel]):
+								insidedict["cvterm_id"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["NCBI_taxonomy"]["NCBI_taxonomy_cvterm_id"]
+							
+							insidedict["query"] = hsp.query.encode('ascii','replace')
+							insidedict["match"] = hsp.match.encode('ascii','replace')
+							insidedict["sequenceFromDB"] = hsp.sbjct.encode('ascii','replace')
+							insidedict["SequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"]
+							insidedict["dnaSequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["dna_sequence"]["sequence"]
+
+							if inType == 'contig':
+								if orf == "genemark":
+									insidedict["query_start"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3
+									insidedict["query_end"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+									insidedict["orf_strand"] = findnthbar(orfInfo, 0)
+									insidedict["orf_start"] = findnthbar(orfInfo, 1)
+									insidedict["orf_end"] = findnthbar(orfInfo, 2)
+									insidedict["orf_From"] = orffrom
+									if orfInfo[:orfInfo.index('|')] in predicted_genes_dict:
+										insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('|')]] 
+										insidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+									else:
+										insidedict["orf_dna_sequence"] = ""
+										insidedict["orf_prot_sequence"] = ""
+								else:
+									insidedict["query_start"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3
+									insidedict["query_end"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+									insidedict["orf_strand"] = findnthbar2(orfInfo, 3)
+									insidedict["orf_start"] = findnthbar2(orfInfo, 1)
+									insidedict["orf_end"] = findnthbar2(orfInfo, 2)
+									insidedict["orf_From"] = findnthbar2(orfInfo, 0)
+									
+									if orfInfo[:orfInfo.index('#')] in predicted_genes_dict:
+										insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('#')]] 
+										insidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+									else:
+										insidedict["orf_dna_sequence"] = ""
+										insidedict["orf_prot_sequence"] = ""									
+
+							elif inType == 'protein':
+								insidedict["query_start"] = hsp.query_start
+								insidedict["query_end"] = hsp.query_start + realQueryLength
+								insidedict["query_From"] = blast_record.query.encode('ascii','replace')
+								insidedict["orf_prot_sequence"] = orf_protein_sequence
+
+							elif inType == 'read':
+								pass
+
+							insidedict["perc_identity"] = float(format(float(insidedict["max-identities"]*100) / len(insidedict["query"]),'.2f'))
+
+							strict[hitid + "|hsp_num:" + str(init)] = insidedict
+							init += 1
+							
+						else:
+							linsidedict = {}
+							# print " >> expect: ",hsp.expect, " passevalue: ", passevalue, " ", json_data[modelID]["model_name"]
+							# print " >> bits: ",hsp.bits, " passevalue: ", passevalue, " ", json_data[modelID]["model_name"]
+							linsidedict["type_match"] = "Loose"
+							linsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+							linsidedict["orf_start"] = findnthbar(orfInfo, 1)
+							linsidedict["orf_end"] = findnthbar(orfInfo, 2)
+							linsidedict["orf_From"] = orffrom
+							
+							linsidedict["model_name"] = json_data[modelID]["model_name"]
+							linsidedict["model_type"] = json_data[modelID]["model_type"]
+							linsidedict["model_type_id"] = modelTypeID
+
+							linsidedict["pass_evalue"] = passevalue
+							linsidedict["model_id"] = modelID
+							linsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
+							linsidedict["ARO_name"] = json_data[modelID]["ARO_name"]
+							linsidedict["ARO_category"] = json_data[modelID]["ARO_category"]
+							linsidedict["evalue"] = hsp.expect
+							linsidedict["max-identities"] = hsp.identities
+							linsidedict["bit-score"] = hsp.bits
+							if checkKeyExisted("NCBI_taxonomy", json_data[modelID]["model_sequences"]["sequence"][seqinModel]):
+								linsidedict["cvterm_id"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["NCBI_taxonomy"]["NCBI_taxonomy_cvterm_id"]
+
+							linsidedict["query"] = hsp.query.encode('ascii','replace')
+							linsidedict["match"] = hsp.match.encode('ascii','replace')
+							linsidedict["sequenceFromDB"] = hsp.sbjct.encode('ascii','replace')
+							linsidedict["SequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"]
+							linsidedict["dnaSequenceFromBroadStreet"] = json_data[modelID]["model_sequences"]["sequence"][seqinModel]["dna_sequence"]["sequence"]
+
+							if inType == 'contig':
+								if orf == "genemark":
+									linsidedict["query_start"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3
+									linsidedict["query_end"] = findnthbar(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+									linsidedict["orf_strand"] = findnthbar(orfInfo, 0)
+									linsidedict["orf_start"] = findnthbar(orfInfo, 1)
+									linsidedict["orf_end"] = findnthbar(orfInfo, 2)
+									linsidedict["orf_From"] = orffrom
+
+									if orfInfo[:orfInfo.index('|')] in predicted_genes_dict:
+										linsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('|')]]
+										linsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('|')]], generic_dna).translate(table=11)).strip("*")
+									else:
+										linsidedict["orf_dna_sequence"] = ""
+										linsidedict["orf_prot_sequence"] = ""
+
+								else:
+									linsidedict["query_start"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3
+									linsidedict["query_end"] = findnthbar2(orfInfo, 1) + (hsp.query_start - 1)*3 + realQueryLength*3 - 1
+									linsidedict["orf_strand"] = findnthbar2(orfInfo, 3)
+									linsidedict["orf_start"] = findnthbar2(orfInfo, 1)
+									linsidedict["orf_end"] = findnthbar2(orfInfo, 2)
+									linsidedict["orf_From"] = findnthbar2(orfInfo, 0)
+
+									if orfInfo[:orfInfo.index('#')] in predicted_genes_dict:
+										linsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo[:orfInfo.index('#')]]
+										linsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo[:orfInfo.index('#')]], generic_dna).translate(table=11)).strip("*")
+									else:
+										linsidedict["orf_dna_sequence"] = ""
+										linsidedict["orf_prot_sequence"] = ""
+
+							elif inType == 'protein':
+								linsidedict["query_start"] = hsp.query_start
+								linsidedict["query_end"] = hsp.query_start + realQueryLength
+								linsidedict["query_From"] = blast_record.query.encode('ascii','replace')
+								linsidedict["orf_prot_sequence"] = orf_protein_sequence
+
+							elif inType == 'read':
+								pass
+
+							linsidedict["perc_identity"] = float(format(float(linsidedict["max-identities"]*100) / len(linsidedict["query"]), '.2f'))
+
+							loose[hitid + "|hsp_num:" + str(init)] = linsidedict
+							init += 1
+
+			if len(perfect) == 0 and len(strict) == 0:
+				if criteria == "yes":
+					blastResults[blast_record.query.encode('ascii','replace')] = loose
+					logging.info("runBlast => hit = " + str(blast_record.query.encode('ascii','replace')) + " => Loose")
+				pscore = 1
+				
+			elif len(perfect) == 0:
+				blastResults[blast_record.query.encode('ascii','replace')] = strict
+				logging.info("runBlast => hit = " + str(blast_record.query.encode('ascii','replace')) + " => Strict")
+				pscore = 2
+				
+			else:
+				blastResults[blast_record.query.encode('ascii','replace')] = perfect
+				logging.info("runBlast => hit = " + str(blast_record.query.encode('ascii','replace')) + " => Perfect")
+				pscore = 3
+
+		blastResults["_metadata"] = {"data_type": data_type, "software_version": version() , "data_version": data_version(), "timestamp_local": datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), "timestamp_utc": str(datetime.datetime.utcnow()),"duration": round(time.time() - t0, 3), "command": cmd  }
+		pjson = json.dumps(blastResults)
+
+		with open(working_directory+"/"+outputFile+".json", 'w') as f:
+			print>>f, pjson
+		f.close()
+
+		if inType == 'contig':
+			for gene in blastResults:
+				logging.debug("runBlast => gene = " + str(gene))
+				if not gene == "_metadata":
+					for hsp in blastResults[gene]:
+						if blastResults[gene][hsp]["orf_end"] < blastResults[gene][hsp]["query_end"]:
+							pass							
+							#print>>sys.stderr, hsp
+							#print>>sys.stderr, blastResults[gene][hsp]["orf_end"]
+							#print>>sys.stderr, blastResults[gene][hsp]["query_end"]
+							#logging.error("runBlast => hsp => " + str(hsp))
+							#logging.error("runBlast => [gene][hsp] => " + str(blastResults[gene][hsp]))
+		if inType == 'read':
+			pass
+	result_handle.close()
+	logging.info("runBlast => done")
+	return pjson
+
+
+def findNumDash(subject, index):
+	numDash = 0
+	toFind = "-"
+	stringList = list(subject)
+	output = []
+	
+	for i in range(0, len(stringList)):
+		if (stringList[i] == toFind): 
+			numDash += 1
+		else: 
+			output.append(stringList[i])
+		if (len(output) == index): 
+			break
+	return numDash
+	
+
+
+"""
+function get_number_dash($subject,$index){
+
+    $number_of_dash = 0;
+    $toFind = "-";
+    $array = str_split($subject);
+
+    $output = [];
+
+    for($i = 0; $i < strlen($subject); $i++){
+
+        if($array[$i] == $toFind){
+            $number_of_dash++;
+        }else{
+            $output[] = $array[$i];
+        }
+
+        if(count($output) == $index){ break; }
+    }
+
+    return $number_of_dash;
+}
+"""
+
+def decompress(inputfile,ext,out_dir):
+
+    filename = os.path.basename(inputfile)
+    name = os.path.splitext(filename)
+    newFile = out_dir + "/" + str(name[0])
+    clean_files.append(newFile)
+
+    if ext == 'gz':
+	    readFq=gzip.open(inputfile)
+	    data = readFq.readlines()
+
+	    with open(newFile,'w') as fin:
+	    	for eachline in data:
+	    		print>>fin, eachline.rstrip() 
+	    fin.close()
+
+    return newFile
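+# e.g. decompress("/tmp/reads.fq.gz", "gz", "/tmp") writes and returns
+# "/tmp/reads.fq" (hypothetical paths); only gzip input is handled.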
+
+def validateHeaders(inputfile,orf):
+	message = []
+	from Bio import SeqIO
+	for record in SeqIO.parse(inputfile, 'fasta'):
+		if "#" in record.id and orf == "prodigal":
+			# Bad header with character #
+			message.append("Bad header with character # : >"+ str(record.id))
+		elif "|" in record.id and orf == "genemark":
+			# Bad header with character | 
+			message.append("Bad header with character | : >"+ str(record.id))
+
+	if message:
+		#print message
+		print>>sys.stderr, message
+		exit()
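+# e.g. a submitted header like ">contig#1" is rejected when Prodigal is the
+# ORF caller, since '#' would later collide with Prodigal's coordinate
+# delimiters in findnthbar2().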
+
+
+def main(args):
+	t0 = time.time()
+	if args.inputSeq == None:
+		print "[error] missing input(s)"
+		print "[info] Try: rgi -h"
+		logging.error("main => missing input(s)")
+		logging.error("main => Try: rgi -h")
+		exit()
+	if args.verbose.lower() == 'on':
+		log_file = str(working_directory) + "/"+os.path.basename(args.inputSeq)+".log"
+		logging.basicConfig(filename=log_file, level=logging.DEBUG, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
+		logging.info("main => logs saved to : " + log_file)
+		logging.info('main => start rgi')
+	
+	# clean databases before running rgi, in case the user has old blast databases
+	if args.clean_databases.lower() == 'yes':
+		logging.info('main => clean blast dbs')
+		cln.main()
+
+	inType = args.inType.lower()
+	inputSeq = args.inputSeq 
+	threads = args.threads
+	outputFile = args.output
+	criteria = args.criteria.lower()
+	clean = args.clean.lower()
+	data_type = args.data.lower()
+	num_sequences = args.num_sequences
+
+	#orf = args.orf.lower()
+	orf = "prodigal"
+	alignment_tool = args.aligner.lower()
+
+	"""
+	Select search mode options
+	The search modes are: 
+
+	FAST:
+		--alignment_tool DIAMOND (contig, protein)
+		--alignment_tool PALADIN (read)
+		--shapes 1
+
+	SENSITIVE
+		--alignment_tool BLAST (contig, protein, read)
+		--shapes 0
+	"""
+	
+	"""
+	if  args.mode == "fast":
+		alignment_tool = "diamond"
+	else:
+		alignment_tool = "blast"
+	"""
+
+	bpjson = ""
+
+	# Write each request to a directory based on the input filename with (-results) appended
+	#output_dir = working_directory + "/" + os.path.basename(inputSeq) + "-results"
+	output_dir = working_directory
+
+	#if not os.path.exists(output_dir):
+	#	os.makedirs(output_dir)
+
+	# Check if file is compressed
+	if inputSeq.endswith('.gz'):
+		inputSeq = decompress(inputSeq,'gz',output_dir)
+
+	checkBeforeBlast(inType, inputSeq)
+	
+	if args.database is None:
+		writeFASTAfromJson()
+
+		if alignment_tool == "blast":
+			makeBlastDB(args.verbose.lower())
+		else:
+			makeDiamondDB(args.verbose.lower())
+	else:
+		loadDatabase(args.database)
+
+	# check the headers
+	validateHeaders(inputSeq,orf)
+
+	if inType == "read":
+		validateFastQ(inputSeq)
+	else:
+		validateFastA(inputSeq)
+
+	# run blast
+	bpjson = runBlast(args, inType, inputSeq, threads, outputFile, criteria, data_type, clean, orf, alignment_tool,args.verbose.lower(), num_sequences)
+
+	# Generate tab-delimited files
+	logging.info('main => Generate tab-delimited ' + outputFile + ".txt")
+	convertJsonToTSV.printCSV(outputFile + ".json",outputFile,orf,args.verbose.lower())
+
+	# Generate gff3 file
+	if inType == "contig":
+		create_gff3(outputFile+".json",outputFile, orf)
+
+	if clean == "yes":
+		logging.info('main => clean temporary files')
+		removeTemp()
+	logging.info('main => rgi success')
+	logging.info('main => total running time ' + str(round(time.time()-t0, 3)) + 's')
+	logging.info('----------------------------------------')
+	return bpjson
+
+def create_gff3(afile,file_name, orf):
+	try:
+		with open(afile, 'r') as f:
+			data = json.load(f)
+	except ValueError:
+		print>>sys.stderr, "invalid JSON."
+		exit()
+
+	with open(working_directory+"/"+file_name+".gff3", "w") as af:
+		sequences = []
+		headers = []
+		body = []
+		writer = csv.writer(af, delimiter='\t')
+		headers.append(['##gff-version 3.2.1'])
+		for item in data:
+			minevalue = False
+			maxpercent = 0
+			startCompare = False
+			minARO = 0
+			minARO_id = 0
+			hitID = ""
+			geneID = ""
+			AROlist = {}
+			orf_dna_sequence = ""
+			predictedProtein = ""
+
+			if item not in ["_metadata","data_type"]:
+				geneID = item
+				for it in data[item]:
+
+					idenPercent = float(data[item][it]["max-identities"]) / len(data[item][it]["query"])
+					# sort results by minimum e-value and maximum percent identity
+					AROlist[str(data[item][it]["ARO_name"])] = data[item][it]["ARO_accession"]
+					pass_evalue = str(data[item][it]["pass_evalue"]).split("|")[0]
+
+					if startCompare:
+						if minevalue > data[item][it]["evalue"] and float(maxpercent) < float(idenPercent):
+							minevalue = data[item][it]["evalue"]
+							maxpercent = float(idenPercent)
+							minARO = data[item][it]["ARO_name"]
+							minARO_id = data[item][it]["ARO_accession"]
+							SequenceFromBroadStreet = data[item][it]["SequenceFromBroadStreet"]
+
+							if "orf_prot_sequence" in data[item][it]:
+								predictedProtein = data[item][it]["orf_prot_sequence"]
+							if "orf_dna_sequence" in data[item][it]:
+								orf_dna_sequence = data[item][it]["orf_dna_sequence"]
+							if "hsp_num:" in it:
+								hitID = it
+					else:
+						startCompare = True
+						minevalue = data[item][it]["evalue"]
+						maxpercent = float(idenPercent)
+						minARO = data[item][it]["ARO_name"]
+						minARO_id = data[item][it]["ARO_accession"]
+						SequenceFromBroadStreet = data[item][it]["SequenceFromBroadStreet"]
+
+						if "orf_prot_sequence" in data[item][it]:
+							predictedProtein = data[item][it]["orf_prot_sequence"]
+						if "orf_dna_sequence" in data[item][it]:
+							orf_dna_sequence = data[item][it]["orf_dna_sequence"]
+						if "hsp_num:" in it:
+							hitID = it
+
+					_source = "RGI:"+ version()	
+					_type = "CDS"
+					_phase = "."
+					_score = str(minevalue)
+
+					if orf == "genemark":
+						_start = str(int(findnthbar(item, 1)))
+						_end = str(int(findnthbar(item, 2)))
+						_strand = str(findnthbar(item, 0))	
+						_seqid = str(geneID).split("|")[0]
+						sequences.append(format_fasta(str(_seqid), orf_dna_sequence))
+						_attributes = "Name=" + str(minARO)+";Alias="+"ARO:"+str(minARO_id)+",ORF label (internal to RGI):"+str(geneID) + ",HSP identifier (internal to RGI):" + str(hitID)+",RGI Detection Paradigm:"+data[item][it]["type_match"]			
+					else: 
+						_start = str(int(findnthbar2(item, 1)))
+						_end = str(int(findnthbar2(item, 2)))
+						_strand = str(findnthbar2(item, 3))
+						_seqid = str(geneID).split("#")[0]
+						sequences.append(format_fasta(str(_seqid), orf_dna_sequence))
+						_attributes = "Name=" + str(minARO)+";Alias="+"ARO:"+str(minARO_id)+",ORF label (internal to RGI):"+str(geneID) + ",HSP identifier (internal to RGI):" + str(hitID)+",RGI Detection Paradigm:"+data[item][it]["type_match"]
+
+					headers.append(['##sequence-region '+_seqid+' '+_start+' '+_end])
+					body.append([_seqid, _source, _type, _start, _end, _score, _strand, _phase, _attributes])
+		
+		# headers
+		for head_item in headers:
+			af.write(head_item[0]+ '\n')
+		# body
+		for body_item in body:
+			writer.writerow(body_item)
+		# footer		
+		writer.writerow(["##FASTA"])
+		for sequence in sequences:
+			af.write(sequence)
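+# A sketch of the GFF3 layout this produces (all values hypothetical):
+#
+#   ##gff-version 3.2.1
+#   ##sequence-region contig_1 52 1324
+#   contig_1  RGI:3.2.0  CDS  52  1324  1e-100  +  .  Name=<ARO name>;Alias=ARO:<accession>,...
+#   ##FASTA
+#   >contig_1
+#   ATG...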
+
+def format_fasta(name, sequence):
+	fasta_string = '>' + name + '\n' + sequence + '\n'
+	return fasta_string
+
+def rRNA_mutation():
+	pass
+
+def version():
+	software_version = "3.2.0"
+	return software_version
+
+def data_version():
+	data_version = ""
+	if os.path.isfile(data_path+"card.json"):
+		with open(data_path+"card.json") as json_file:
+			json_data = json.load(json_file)
+			for item in json_data.keys():
+				if item == "_version":
+					data_version = json_data[item]
+
+	if data_version == "":
+		return "Error: card.json not found"
+	else:
+		return data_version
+
+class customAction(argparse.Action):
+	def __call__(self, parser, namespace, values, option_string=None):
+		print(data_version())
+		exit()
+
+
+def run():
+	parser = argparse.ArgumentParser(description='Resistance Gene Identifier - Version ' + version())
+	parser.add_argument('-t','--input_type', dest="inType", default="CONTIG", help='must be one of contig, protein or read (default: contig)')
+	parser.add_argument('-i','--input_sequence', dest="inputSeq", default=None, help='input file must be in FASTA (contig and protein), FASTQ (read) or gzip format, e.g. myFile.fasta, myFasta.fasta.gz')
+	parser.add_argument('-n','--num_threads', dest="threads", default="32", help="Number of threads (CPUs) to use in the BLAST search (default=32)")
+	parser.add_argument('-k','--max_target_seqs', dest="num_sequences", default="1", help="maximum number of target sequences to report alignments for (default=1)")
+	parser.add_argument('-o','--output_file', dest="output", default="Report", help="Output JSON file (default=Report)")	
+	parser.add_argument('-e','--loose_criteria', dest="criteria", default="NO", help="The options are YES to include loose hits and NO to exclude loose hits. (default=NO to exclude loose hits)")
+	parser.add_argument('-c','--clean', dest="clean", default="YES", help="This removes temporary files in the results directory after run. Options are NO or YES (default=YES for remove)")
+	parser.add_argument('-cd','--clean_databases', dest="clean_databases", default="YES", help="This removes blast databases before rgi run. Options are NO or YES (default=YES for remove)")
+	parser.add_argument('-d','--data', dest="data", default="NA", help = "Specify a data-type, i.e. wgs, chromosome, plasmid, etc. (default = NA)")
+	parser.add_argument('-l','--verbose', dest="verbose", default="OFF", help = "log progress to file. Options are OFF or ON  (default = OFF for no logging)")
+	parser.add_argument('-a','--alignment_tool', dest="aligner", default="BLAST", help = "choose between BLAST and DIAMOND. Options are BLAST or DIAMOND  (default = BLAST)")
+	parser.add_argument('-r','--db', dest="database", default=None, help='specify path to CARD blast databases (default: None)')
+	parser.add_argument('-sv','--software_version', action='version', version=version(), help = "Prints software version number")
+	parser.add_argument('-dv','--data_version', action=customAction, nargs=0, help = "Prints data version number") 
+	#parser.add_argument('-m','--mode', dest="mode", default="fast", help='search mode. Options are FAST or SENSITIVE (default: FAST)')
+	args = parser.parse_args()
+	main(args)	
+
+if __name__ == '__main__':
+	run()
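+# Example invocation (a sketch; file names are placeholders):
+#
+#   python rgi.py -t contig -i myGenome.fasta -n 8 -o Report
+#
+# writes Report.json and Report.txt, plus Report.gff3 for contig input,
+# into the working directory.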
+	
diff --git a/rrna.py b/rrna.py
new file mode 100644
index 0000000..ce68838
--- /dev/null
+++ b/rrna.py
@@ -0,0 +1,159 @@
+import sys
+import os
+import filepaths
+from Bio import SeqIO
+from Bio.Seq import Seq
+import json
+
+'''
+Bacteria (70S)  
+        LSU 50S
+                5S      RF00001
+                23S     SILVA-LSU-Bac
+        SSU 30S
+                16S     RF00177
+'''
+
+'''
+Step 1: get rRNA coordinates using barrnap
+Step 2: extract rRNA sequences
+Step 3: blast and report for mutations in CARD
+'''
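+'''
+Illustrative barrnap GFF3 line that rRNA_list parses below (fields are
+tab-separated; the values shown are hypothetical):
+
+contig_1  barrnap:0.7  rRNA  9258  10797  0  +  .  Name=16S_rRNA;product=16S ribosomal RNA
+
+Columns 4, 5 and 7 supply the start, stop and strand recorded for each hit.
+'''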
+
+script_path = filepaths.determine_path()
+working_directory = os.getcwd()
+
+path = script_path+"/_db/"
+data_path = script_path+"/_data/"
+tmp = script_path+"/_tmp/"
+
+def rRNA(fq_path,start,stop,strand):
+	# GFF3 coordinates are 1-based and inclusive; convert to a 0-based slice.
+	try:
+		sequence = SeqIO.read(fq_path, "fasta")
+		return str(sequence.seq[int(start)-1:int(stop)])
+	except Exception as e:
+		print "Error:", e, ". Please input only one record / sequence"
+		return ""
+
+def rRNA_list(fq_path,file_name):
+	list_all = {}
+	list_all["5s"] = []
+	list_all["16s"] = []
+	list_all["23s"] = []
+	
+	# get rRNA
+	os.system('barrnap --reject 1e-9 --quiet --kingdom bac '+fq_path+' > '+file_name+'.gff3') # --incseq
+
+	# scan file and get sequence and coordinates
+	with open(working_directory+'/'+file_name+'.gff3', 'r') as f:
+		data = f.readlines()
+		for eachline in data:
+			if eachline.strip() == '':
+				pass
+			elif eachline.startswith('##gff-version'):
+				pass
+			elif 'Name=5S_rRNA;product=5S ribosomal RNA' in eachline:
+				list_5s = eachline.split("\t")
+				list_all["5s"].append({"start": list_5s[3], "stop": list_5s[4], "strand": str(list_5s[6]), "sequence": rRNA(fq_path,list_5s[3],list_5s[4],list_5s[6])})
+			elif 'Name=16S_rRNA;product=16S ribosomal RNA' in eachline:
+				list_16s = eachline.split("\t")
+				list_all["16s"].append({"start": list_16s[3], "stop": list_16s[4], "strand": str(list_16s[6]), "sequence": rRNA(fq_path,list_16s[3],list_16s[4],list_16s[6])})
+			elif 'Name=23S_rRNA;product=23S ribosomal RNA' in eachline:
+				list_23s = eachline.split("\t")
+				list_all["23s"].append({"start": list_23s[3], "stop": list_23s[4], "strand": str(list_23s[6]), "sequence": rRNA(fq_path,list_23s[3],list_23s[4],list_23s[6])})
+
+	pjson = json.dumps(list_all)
+
+	with open(working_directory+"/"+file_name+".rRNA.json", 'w') as f:
+		print>>f, pjson
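+# The resulting <input>.rRNA.json is shaped like (values hypothetical):
+#
+#   {"5s":  [{"start": "107", "stop": "215", "strand": "+", "sequence": "..."}],
+#    "16s": [...],
+#    "23s": [...]}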
+
+def checkKeyExisted(key, my_dict):
+	# True when the key exists and maps to a non-None value.
+	return my_dict.get(key) is not None
+
+def writeFASTAfromJson():
+	# Write a nucleotide FASTA database for rRNA mutation models
+	# (model_type_id 40295) from card.json, unless it already exists.
+	noSeqList = []
+	if not os.path.isfile(path+"nucleotidedb.fsa"):
+		with open(data_path+"card.json") as json_file:
+			json_data = json.load(json_file)
+			with open(path+'nucleotidedb.fsa', 'w') as wp:
+				for item in json_data:
+					if item.isdigit():
+						if json_data[item]["model_type_id"] == "40295":
+							snpList = ""
+							pass_eval = 1e-30
+							if checkKeyExisted("model_param", json_data[item]):
+								if checkKeyExisted("blastp_evalue", json_data[item]["model_param"]):
+									pass_eval = json_data[item]["model_param"]["blastp_evalue"]["param_value"]
+
+								if checkKeyExisted("snp", json_data[item]['model_param']):
+									for key in json_data[item]['model_param']['snp']['param_value']:
+										snpList += json_data[item]['model_param']['snp']['param_value'][key]
+										snpList += ','
+
+							if checkKeyExisted("model_sequences", json_data[item]):
+								for seqkey in json_data[item]["model_sequences"]["sequence"]:
+									print>>wp, ('>' + item + '_' + seqkey + " | model_type_id: 40295" + " | pass_evalue: " + str(pass_eval) + " | SNP: " + snpList)
+									print>>wp, (json_data[item]["model_sequences"]["sequence"][seqkey]["dna_sequence"]["sequence"])
+							else:
+								noSeqList.append(item)
+
+	print noSeqList
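+# Each record written above looks like (accession and SNPs hypothetical;
+# note the trailing comma produced by the snpList loop):
+#
+#   >2882_1 | model_type_id: 40295 | pass_evalue: 1e-30 | SNP: A2058G,A2059G,
+#   ACGTTGCA...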
+
+def runDiamond(inType, inputSeq, threads, outputFile):
+
+	cfilter = " --index-chunks 1 --block-size 1 --quiet"
+
+	if inType == 'contig':
+		#logging.info("runDiamond => blastx")
+		#logging.info('runDiamond => diamond blastx --in '+path+'nucleotidedb.fsa --db '+path+'nucleotidedb.db'+' --query '+inputSeq+' --daa '+inputSeq+' --threads '+threads+' --salltitles --more-sensitive --evalue 10'+cfilter)
+		os.system('diamond blastx --in '+path+'nucleotidedb.fsa --db '+path+'nucleotidedb.db'+' --query '+inputSeq+' --daa '+inputSeq+' --threads '+threads+' --salltitles --more-sensitive --evalue 10'+cfilter)
+	else:
+		print>>sys.stderr, "Error: runDiamond only handles contig input"
+
+	#logging.info('runDiamond => diamond view --outfmt xml --daa '+inputSeq+'.daa --out '+outputFile + cfilter)
+	os.system('diamond view --outfmt xml --daa '+inputSeq+'.daa --out '+outputFile + cfilter )
+	#logging.info('runDiamond => diamond view --daa '+inputSeq+'.daa --out '+outputFile+".txt" + cfilter)
+	os.system('diamond view --daa '+inputSeq+'.daa --out '+outputFile+".txt" + cfilter )
+
+	#clean_files.append(inputSeq+'.daa')
+	#clean_files.append(outputFile+'.txt')
+
+def runBlast(inputSeq,threads):
+	# pass the requested thread count through to blastn
+	file_name = os.path.basename(inputSeq)
+	os.system("blastn -perc_identity 0.10 -num_threads " + str(threads) + " -out blastnRes.xml -outfmt 5 -query " + inputSeq + " -db " + path + "dna.db")
+
+
+def makeDiamondDB():
+	if os.path.isfile(path+"nucleotidedb.fsa") and os.path.exists(path+"nucleotidedb.db.dmnd"):
+		print "diamond DB exists"
+	else:
+		print "create diamond DB."
+		os.system('diamond makedb --in '+path+'nucleotidedb.fsa --db '+path+'nucleotidedb.db')
+
+def makeBlastDB():
+	# A nucleotide database from makeblastdb produces .nhr/.nin/.nsq files,
+	# so check for those rather than the protein-database extensions.
+	if os.path.isfile(path+"nucleotidedb.fsa") and os.path.exists(path+"dna.db.nhr") and os.path.exists(path+"dna.db.nin") and os.path.exists(path+"dna.db.nsq"):
+		print "blast DB exists"
+	else:
+		print "create blast DB."
+		os.system('makeblastdb -in '+path+'nucleotidedb.fsa -dbtype nucl -out '+path+'dna.db')
+	#os.system('makeblastdb -in proteindb.fsa -dbtype prot -out protein.db')
+
+def main(argvfile):
+	exit("In development...")
+	#file_name = os.path.basename(argvfile)
+	# get rRNA sequences
+	#rRNA_list(argvfile,file_name)
+	#rRNA(argvfile,0,1,'+')
+	#writeFASTAfromJson()
+	#makeDiamondDB()
+	#runDiamond('contig', argvfile, '8', file_name)
+	#makeBlastDB()
+	#runBlast(argvfile,8)
+
+if __name__ == "__main__":
+	main(sys.argv[1])
\ No newline at end of file
diff --git a/tests/protein.fasta b/tests/protein.fasta
new file mode 100644
index 0000000..714971a
--- /dev/null
+++ b/tests/protein.fasta
@@ -0,0 +1,2 @@
+>protein
+MELPNIMHPVAKLSTALAAALMLSGCMPGEIRPTIGQQMETGDQRFGDLVFRQLAPNVWQHTSYLDMPGFGAVASNGLIVRDGGRVLVVDTAWTDDQTAQILNWIKQEINLPVALAVVTHAHQDKMGGMDALHAAGIATYANALSNQLAPQEGMVAAQHSLTFAANGWVEPATAPNFGPLKVFYPGPGHTSDNITVGIDGTDIAFGGCLIKDSKAKSLGNLGDADTEHYAASARAFGAAFPKASMIVMSHSAPDSRAAITHTARMADKLR

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/card-rgi.git



More information about the debian-med-commit mailing list