[med-svn] [python-pyphlan] 01/05: Imported Upstream version 0.0.20151014

Andreas Tille tille at debian.org
Fri May 27 13:05:08 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository python-pyphlan.

commit 5d8e0471fe5f347d3bf14a1a79b840f67458f635
Author: Andreas Tille <tille at debian.org>
Date:   Fri May 27 14:37:15 2016 +0200

    Imported Upstream version 0.0.20151014
---
 __init__.py                       |   0
 bo6_screen.py                     |  83 +++++
 choco_markers.py                  | 225 ++++++++++++
 choco_markers_summary.py          | 207 +++++++++++
 choco_postprocess_euks.py         | 106 ++++++
 choco_postprocess_viruses.py      | 203 +++++++++++
 choco_profile.py                  | 133 +++++++
 choco_sam2choco.py                |  27 ++
 choco_summary.py                  | 128 +++++++
 compare_clusters.py               | 125 +++++++
 conv_cg2cgs.py                    |  48 +++
 conv_cg2mg.py                     | 143 ++++++++
 conv_ctxt2coretxt.py              |  87 +++++
 conv_ctxt2coretxt_withexlusion.py | 174 +++++++++
 conv_ctxt2mat.py                  |  74 ++++
 conv_ctxt2rxl.py                  |  65 ++++
 conv_ctxt2ttxt.py                 |  80 ++++
 conv_ctxt2txt.py                  |  95 +++++
 conv_fna2rxl.py                   |  47 +++
 conv_seqs.py                      |  41 +++
 conv_t2g_2_g2t.py                 |  40 ++
 conv_tid2tax.py                   | 143 ++++++++
 conv_txt2ls.py                    |  34 ++
 conv_uc2txt.py                    |  41 +++
 excel.py                          |  22 ++
 fastq2fasta.py                    |  25 ++
 fastq_trim.py                     |  43 +++
 fastx_len_filter.py               |  23 ++
 fna_exact_matches.py              |  56 +++
 fna_extract_from_bo6.py           |  54 +++
 fna_extract_from_primersearch.py  |  78 ++++
 fna_extract_with_primers.py       |  96 +++++
 fna_g2r.py                        |  45 +++
 fna_len.py                        |  49 +++
 fna_random.py                     |  42 +++
 fna_revcomp.py                    |  46 +++
 fna_sss.py                        | 163 +++++++++
 fna_t2g.py                        |  36 ++
 gen_info.py                       |  76 ++++
 get_genome_length.py              |  22 ++
 libsvm_cv.py                      | 159 ++++++++
 license.txt                       |   8 +
 merg_ctxt.py                      |  47 +++
 mlst_table2fna.py                 |  62 ++++
 msg.py                            |  12 +
 ooSubprocess.py                   | 229 ++++++++++++
 pyphlan.py                        | 748 ++++++++++++++++++++++++++++++++++++++
 tab.py                            | 106 ++++++
 tab_sel_cols.py                   |  24 ++
 tab_sub.py                        |  27 ++
 tree_cores.py                     |  55 +++
 tree_get_leaves.py                |  43 +++
 tree_get_subtrees.py              |  45 +++
 tree_markers.py                   |  45 +++
 tree_pairwisedists.py             |  54 +++
 tree_precision.py                 |  42 +++
 tree_prune.py                     |  46 +++
 tree_recall.py                    |  38 ++
 tree_rename.py                    |  48 +++
 tree_reroot.py                    |  45 +++
 tree_select_markers.py            |  45 +++
 tree_subtree.py                   |  42 +++
 util_argparse.py                  |  41 +++
 utils.py                          |  23 ++
 64 files changed, 5259 insertions(+)

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bo6_screen.py b/bo6_screen.py
new file mode 100644
index 0000000..3aa1345
--- /dev/null
+++ b/bo6_screen.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+from __future__ import with_statement
+
+import sys,argparse
+import utils 
+import collections
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='NCBI blast outfmt6 output processing')
+    
+    parser.add_argument('inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str, 
+                        help="the input file [stdin if not present]")    
+    parser.add_argument('out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str, 
+                        help="the output file [stdout if not present]")    
+    parser.add_argument('--pid', metavar='percentage identity', default=0, type=float,
+                        help="minimum percentage identity (default 0)")
+    parser.add_argument('--length', metavar='length', default=0, type=int,
+                        help="minimum alignment length (default 0)")
+    parser.add_argument('--evalue', metavar='evalue', default=float(sys.maxint), type=float,
+                        help="maximum expect value (default INF)")
+    parser.add_argument('--bitscore', metavar='bitscore', default=0.0, type=float,
+                        help="minimum bit score (default 0)")
+    parser.add_argument('--pid_col', metavar='pid_col', default=3, type=int,
+                        help="column of the pid value (default 3)")
+    parser.add_argument('--length_col', metavar='length_col', default=4, type=int,
+                        help="column of the length value (default 4)")
+    parser.add_argument('--evalue_col', metavar='evalue_col', default=11, type=int,
+                        help="column of the expect value (default 11)")        
+    parser.add_argument('--bitscore_col', metavar='bitscore_col', default=12, type=int,
+                        help="column of the bit score (default 12)")
+    parser.add_argument('-n', metavar='number of top hits', default=-1, type=int,
+			help="number of results to show (default -1 means all)")
+    parser.add_argument('-t', metavar='number of top hits per unique query', default=-1, type=int,
+			help="number of results per unique qeury to show (default -1 means all)")
+    parser.add_argument('-s', default=None, choices=["evalue","bitscore","length","pid"], type=str,
+			help="the measure used to sort the output (default bitscore)")
+    return vars(parser.parse_args()) 
+
+def blast_ncbi_outfmt6_screen(par):
+    finp,fout = bool(par['inp_f']), bool(par['out_f'])
+
+    inp_mat = (l.rstrip('\n').split("\t") for l in (utils.openr(par['inp_f']) if finp else sys.stdin))
+
+    out_mat =(l for l in inp_mat 
+                    if float(l[par['pid_col']-1]) >= par['pid'] and
+                       float(l[par['length_col']-1]) >= par['length'] and
+                       float(l[par['evalue_col']-1]) <= par['evalue'] and
+                       float(l[par['bitscore_col']-1]) >= par['bitscore']  )
+
+    if 's' in par and par['s']:
+        if par['s'] == 'pid':
+            col = par['pid']-1
+        elif par['s'] == 'evalue':
+            col = par['evalue_col']
+        elif par['s'] == 'length':
+            col = par['length_col']-1
+        elif par['s'] == 'bitscore':
+            col = par['bitscore_col']-1
+
+        out_mat = sorted( out_mat, 
+                          key=lambda x: float(x[col-1]) )
+
+        if 'n' in par and par['n'] > -1:
+            out_mat = out_mat[:par['n']]
+    
+    unique_queries = collections.defaultdict( int ) 
+    with utils.openw(par['out_f']) if fout else sys.stdout as out_file:
+        if 't' in par and par['t'] > -1:
+            for l in out_mat:
+                unique_queries[l[0]] += 1
+                if unique_queries[l[0]] > par['t']:
+                    continue
+                out_file.write("\t".join(l)+"\n")
+        else:
+            for l in out_mat:
+                out_file.write("\t".join(l)+"\n")
+
+
+if __name__ == '__main__':
+    params = read_params(sys.argv)
+    
+    blast_ncbi_outfmt6_screen(params)
diff --git a/choco_markers.py b/choco_markers.py
new file mode 100644
index 0000000..5c2b362
--- /dev/null
+++ b/choco_markers.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+#import cPickle as pickle
+import pickle
+import pyphlan as ppa  
+from Bio import SeqIO
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
+
+    p.add_argument( '--sam', required = True, default=None, type=str )
+    p.add_argument( '--centroids', required = True, default=None, type=str )
+    #p.add_argument( '--centroids_cores', action='store_true' )
+    p.add_argument( '--centroids_ffn', required = True, default=None, type=str )
+    p.add_argument( '--lens', required = True, default=None, type=str )
+    p.add_argument( '--g2c', required = True, default=None, type=str )
+    p.add_argument( '--taxonomy', required = True, default=None, type=str )
+    p.add_argument( '--out_markers', required = True, default=None, type=str )
+    p.add_argument( '--out_ml', required = True, default=None, type=str )
+    p.add_argument( '--out', required = True, default=None, type=str )
+    p.add_argument( '--out_mpa_pkl', required = True, default=None, type=str )
+    p.add_argument( '--out_m2c', required = True, default=None, type=str )
+    p.add_argument( '--out_summary', required = True, default=None, type=str )
+    p.add_argument( '--min_n_markers', default=10, type=int )
+    p.add_argument( '--min_n_markers_strains', default=20, type=int )
+    p.add_argument( '--top_n_markers', default=200, type=int )
+    p.add_argument( '--score_th', default=100, type=int )
+    p.add_argument( '--include_strains', default=False, action = 'store_true' )
+    return vars( p.parse_args() )
+    
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    
+    tree = ppa.PpaTree( args['taxonomy'],  lev_sep = "|" )
+
+    clades2terms = ppa.clades2terms( tree.tree )
+    
+    clades2taxa = dict([(clade.full_name,{'taxa':set([taxon.name[3:] for taxon in taxa]),'genes':set()}) for clade,taxa in clades2terms.items()])
+    #ttab = 1 if args['centroids_cores'] else 2
+    #gtab = 0 if args['centroids_cores'] else 1
+    ttab = 2
+    gtab = 1
+    genes2taxa, all_genes, all_reads = {}, set(), set()
+    with open(args['centroids']) as inp:
+        for line in (l.split('\t') for l in inp):
+            taxon = line[ttab]
+            gene = line[gtab]
+            genes2taxa[line[gtab]] = taxon
+            all_genes.add( gene )
+            clades2taxa[taxon]['genes'].add(gene)
+
+
+    contigs2genomes = {}
+    with open( args['g2c'] ) as inp:
+        lines = (list(i.strip().split('\t')) for i in inp)
+        for line in lines: 
+            genome = line[0]
+            for l in line[1:]:
+                contigs2genomes[l] = genome
+    
+    reads2lens = {}
+    #genes2nreads = collections.defaultdict( int )
+    with open( args['lens'] ) as inp:
+        lines = (i.strip().split('\t') for i in inp)
+        for r,l in lines:
+            #reads2lens[r] = int(l)
+            gene = "_".join(r.split("_")[:-1])
+            if gene in all_genes:
+                #genes2nreads[gene] += 1
+                all_reads.add( r )
+                reads2lens[r] = int(l)
+
+
+    #genes2nreads = collections.defaultdict( int )
+    #for r in reads2lens:
+    #    gene = "_".join(r.split("_")[:-1])
+    #    if gene in all_genes:
+    #        genes2nreads[gene] += 1
+    #        all_reads.add( r )
+
+
+    reads2ghits = {}
+    inp = bz2.BZ2File( args['sam'] )
+    lines = (list(l.split('\t')) for l in inp)
+    lines_strip = ([l[0],l[1],int(l[2])] for l in lines if l[0] in all_reads)
+    for fr,to,val in lines_strip:
+        to_genome = contigs2genomes[to]
+        valf = float(val)/float(reads2lens[fr])
+        if fr not in reads2ghits:
+            reads2ghits[fr] = {to_genome: valf}
+        elif to_genome not in reads2ghits[fr]:
+            reads2ghits[fr][to_genome] = valf
+        elif to_genome in reads2ghits[fr] and valf > reads2ghits[fr][to_genome]:
+            reads2ghits[fr][to_genome] = valf
+    inp.close()
+
+    genes2genomes = {}
+    for r,h in reads2ghits.items():
+        gene = "_".join(r.split("_")[:-1])
+        if gene not in genes2genomes:
+            genes2genomes[gene] = {}
+        
+        for hh,vv in h.items():
+            if hh not in genes2genomes[gene]:
+                genes2genomes[gene][hh] = set()
+            genes2genomes[gene][hh].add( vv )
+
+    ffn = SeqIO.to_dict(SeqIO.parse(args['centroids_ffn'], "fasta"))
+    ffn_len = dict([(k,len(v)) for k,v in ffn.items()])
+
+    res = {}
+    for gene,taxon in genes2taxa.items():
+        if gene not in genes2genomes:
+            continue
+        int_genomes = set(clades2taxa[taxon]['taxa'])
+        genomes_hit = set(genes2genomes[gene])
+        int_genomes_hit = int_genomes & genomes_hit
+        ext_genomes_hit = genomes_hit - int_genomes
+       
+        n_int_genomes, n_genomes_hit, n_int_genomes_hit, n_ext_genomes_hit = len(int_genomes), len(genomes_hit), len(int_genomes_hit), len(ext_genomes_hit)
+
+        len_pen = 0 
+        if ffn_len[gene] < 250:
+            len_penlen_pen = 6
+        if ffn_len[gene] < 500:
+            len_penlen_pen = 4
+        if ffn_len[gene] < 750:
+            len_penlen_pen = 2
+        
+        score = n_ext_genomes_hit + len_pen
+        score += 5.0*float( n_int_genomes-n_int_genomes_hit ) / float(n_int_genomes) 
+
+        if taxon not in res:
+            res[taxon] = {}
+
+        if score > args['score_th']:
+            continue
+
+        if "t__" in taxon and score > 0:
+            continue
+
+        res[taxon][gene] = { 'score' : score, 'ext' : ext_genomes_hit }
+
+
+    selected_markers = set()
+    for taxon, markers in res.items():
+        for i, (marker, marker_v) in enumerate(sorted( markers.items(), key = lambda x: x[1]['score'] )):
+            if i >= args['top_n_markers']:
+                del res[taxon][marker]
+            #res[taxon][marker]['seq'] = ffn[marker]
+
+    
+    markers_ffn = []
+    to_pkl = {'markers':{}}
+    with open( args['out_summary'], "w" ) as outf:
+        with open( args['out'], "w" ) as out:
+            for taxon, markers in res.items():
+                if not args['include_strains']  and "t__" in taxon:
+                    continue
+                if args['include_strains']  and "t__" in taxon:
+                    if len(markers) < args['min_n_markers_strains']:
+                        continue
+                if "t__"  not in taxon and len(markers) < args['min_n_markers']:
+                    continue
+                
+                outf.write( "\t".join( [str(taxon), str(len(markers))] ) +"\n" )
+
+                for marker, marker_v in markers.items():
+                    out.write( "\t".join([  marker,
+                                            #taxon,
+                                            genes2taxa[marker].split("|")[-1],
+                                            str( ffn_len[marker]),
+                                            str(marker_v['score']),
+                                            str(len(marker_v['ext'])),
+                                            str(",".join(marker_v['ext'])),
+                                            ])
+                                            +"\n" )
+                    to_pkl['markers'][marker] = {   'taxon':taxon,
+                                                    'clade':genes2taxa[marker].split("|")[-1],
+                                                    'len': ffn_len[marker],
+                                                    'score': marker_v['score'],
+                                                    'ext': marker_v['ext'] }
+
+                    res[taxon][marker]['seq'] = ffn[marker]
+                    markers_ffn.append( ffn[marker] )
+                    selected_markers.add( ffn[marker].id  )
+    
+    SeqIO.write( markers_ffn, args['out_markers'], "fasta")
+
+    to_pkl['taxonomy'] = [l.strip() for l in open(args['taxonomy'])]
+
+    #with open(args['out_mpa_pkl'], 'wb') as out:
+    #   bz2.compress(pickle.dump(to_pkl, out, pickle.HIGHEST_PROTOCOL))
+    out = bz2.BZ2File(args['out_mpa_pkl'],"wb")
+    pickle.dump(to_pkl, out, pickle.HIGHEST_PROTOCOL)
+    out.close()
+
+
+
+
+    #with open( args['out_ml'], "w" ) as outf:
+    with open( args['out_ml'], "w" ) as out_ml:
+        with open( args['out_m2c'], "w" ) as out_m2c:
+            for k,v in ffn_len.items():
+                if not args['include_strains'] and "t__" in genes2taxa[k]:
+                    continue
+                if k not in selected_markers:
+                    continue
+                #outf.write( "\t".join([k,str(v)]) + "\n" )
+                out_ml.write( "\t".join([k,str( ffn_len[k]  )]) + "\n" ) 
+                out_m2c.write( "\t".join([k, genes2taxa[k].split("|")[-1]  ]) + "\n" ) 
+                #out_m2c.write( "\t".join([k,str( ffn_len[k]  )]) + "\n" ) 
+
+
+
diff --git a/choco_markers_summary.py b/choco_markers_summary.py
new file mode 100644
index 0000000..75c99fe
--- /dev/null
+++ b/choco_markers_summary.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import cPickle as pickle
+import pyphlan as ppa  
+from Bio import SeqIO
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
+
+    p.add_argument( '--inp', required = True, default=None, type=str )
+    p.add_argument( '--taxonomy', required = True, default=None, type=str )
+    return vars( p.parse_args() )
+    
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    
+    tree = ppa.PpaTree( args['taxonomy'],  lev_sep = "|" )
+
+    with open( args['inp'] ) as inp:
+        markers = dict([(line[0],int(line[1])) for line in (l.strip().split('\t') for l in inp)])
+
+
+    def traverse( clade, father = None ):
+        if len(clade.clades) == 1:
+            if clade.full_name in markers:
+                print clade.full_name, markers[clade.full_name]
+                traverse(clade.clades[0],markers[clade.full_name])
+            else:
+                if father is None:
+                    print clade.full_name, 0
+                    traverse(clade.clades[0])
+                else:
+                    print clade.full_name, father
+                    traverse(clade.clades[0],father)
+        else:
+            if clade.full_name in markers:
+                print clade.full_name, markers[clade.full_name] 
+            else:
+                if father is None:
+                    print clade.full_name, 0
+                else:
+                    print clade.full_name, father
+            for c in clade.clades:
+                traverse(c)
+
+    traverse(tree.tree.root)
+
+
+
+
+
+    """   
+
+
+    clades2terms = ppa.clades2terms( tree.tree )
+    
+    clades2taxa = dict([(clade.full_name,{'taxa':set([taxon.name[3:] for taxon in taxa]),'genes':set()}) for clade,taxa in clades2terms.items()])
+    
+    genes2taxa, all_genes, all_reads = {}, set(), set()
+    with open(args['centroids']) as inp:
+        for line in (l.split('\t') for l in inp):
+            taxon = line[2]
+            gene = line[1]
+            genes2taxa[line[1]] = taxon
+            all_genes.add( gene )
+            clades2taxa[taxon]['genes'].add(gene)
+
+
+    contigs2genomes = {}
+    with open( args['g2c'] ) as inp:
+        lines = (list(i.strip().split('\t')) for i in inp)
+        for line in lines: 
+            genome = line[0]
+            for l in line[1:]:
+                contigs2genomes[l] = genome
+    
+    reads2lens = {}
+    #genes2nreads = collections.defaultdict( int )
+    with open( args['lens'] ) as inp:
+        lines = (i.strip().split('\t') for i in inp)
+        for r,l in lines:
+            #reads2lens[r] = int(l)
+            gene = "_".join(r.split("_")[:-1])
+            if gene in all_genes:
+                #genes2nreads[gene] += 1
+                all_reads.add( r )
+                reads2lens[r] = int(l)
+
+
+    #genes2nreads = collections.defaultdict( int )
+    #for r in reads2lens:
+    #    gene = "_".join(r.split("_")[:-1])
+    #    if gene in all_genes:
+    #        genes2nreads[gene] += 1
+    #        all_reads.add( r )
+
+
+    reads2ghits = {}
+    inp = bz2.BZ2File( args['sam'] )
+    lines = (list(l.split('\t')) for l in inp)
+    lines_strip = ([l[0],l[1],int(l[2])] for l in lines if l[0] in all_reads)
+    for fr,to,val in lines_strip:
+        to_genome = contigs2genomes[to]
+        valf = float(val)/float(reads2lens[fr])
+        if fr not in reads2ghits:
+            reads2ghits[fr] = {to_genome: valf}
+        elif to_genome not in reads2ghits[fr]:
+            reads2ghits[fr][to_genome] = valf
+        elif to_genome in reads2ghits[fr] and valf > reads2ghits[fr][to_genome]:
+            reads2ghits[fr][to_genome] = valf
+    inp.close()
+
+    genes2genomes = {}
+    for r,h in reads2ghits.items():
+        gene = "_".join(r.split("_")[:-1])
+        if gene not in genes2genomes:
+            genes2genomes[gene] = {}
+        
+        for hh,vv in h.items():
+            if hh not in genes2genomes[gene]:
+                genes2genomes[gene][hh] = set()
+            genes2genomes[gene][hh].add( vv )
+
+    ffn = SeqIO.to_dict(SeqIO.parse(args['centroids_ffn'], "fasta"))
+    ffn_len = dict([(k,len(v)) for k,v in ffn.items()])
+
+    res = {}
+    for gene,taxon in genes2taxa.items():
+        if gene not in genes2genomes:
+            continue
+        
+        int_genomes = set(clades2taxa[taxon]['taxa'])
+        genomes_hit = set(genes2genomes[gene])
+        int_genomes_hit = int_genomes & genomes_hit
+        ext_genomes_hit = genomes_hit - int_genomes
+       
+        n_int_genomes, n_genomes_hit, n_int_genomes_hit, n_ext_genomes_hit = len(int_genomes), len(genomes_hit), len(int_genomes_hit), len(ext_genomes_hit)
+
+        len_pen = 0 
+        if ffn_len[gene] < 250:
+            len_penlen_pen = 6
+        if ffn_len[gene] < 500:
+            len_penlen_pen = 4
+        if ffn_len[gene] < 750:
+            len_penlen_pen = 2
+        
+        score = n_ext_genompes_hit + len_pen
+        score += 5.0*float( n_int_genomes-n_int_genomes_hit ) / float(n_int_genomes) 
+
+        if taxon not in res:
+            res[taxon] = {}
+
+        if score > 100:
+            continue
+
+        res[taxon][gene] = { 'score' : score, 'ext' : ext_genomes_hit }
+
+
+    selected_markers = set()
+    for taxon, markers in res.items():
+        for i, (marker, marker_v) in enumerate(sorted( markers.items(), key = lambda x: x[1]['score'] )):
+            if i >= args['top_n_markers']:
+                del res[taxon][marker]
+            #res[taxon][marker]['seq'] = ffn[marker]
+
+    
+    markers_ffn = []
+
+    with open( args['out_summary'], "w" ) as outf:
+        for taxon, markers in res.items():
+            if "t__" in taxon:
+                continue
+            if len(markers) < args['min_n_markers']:
+                continue
+            outf.write( "\t".join( [str(taxon), str(len(markers))] ) +"\n" )
+            for marker, marker_v in markers.items():
+                res[taxon][marker]['seq'] = ffn[marker]
+                markers_ffn.append( ffn[marker] )
+                selected_markers.add( ffn[marker].id  )
+    
+    SeqIO.write( markers_ffn, args['out_markers'], "fasta")
+
+    #with open( args['out_ml'], "w" ) as outf:
+    with open( args['out_ml'], "w" ) as out_ml:
+        with open( args['out_m2c'], "w" ) as out_m2c:
+            for k,v in ffn_len.items():
+                if "t__" in genes2taxa[k]:
+                    continue
+                if k not in selected_markers:
+                    continue
+                #outf.write( "\t".join([k,str(v)]) + "\n" )
+                out_ml.write( "\t".join([k,str( ffn_len[k]  )]) + "\n" ) 
+                out_m2c.write( "\t".join([k, genes2taxa[k].split("|")[-1]  ]) + "\n" ) 
+                #out_m2c.write( "\t".join([k,str( ffn_len[k]  )]) + "\n" ) 
+
+
+    """
diff --git a/choco_postprocess_euks.py b/choco_postprocess_euks.py
new file mode 100644
index 0000000..306472f
--- /dev/null
+++ b/choco_postprocess_euks.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import pyphlan as ppa  
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
+
+    p.add_argument( '--markers', required = True, default=None, type=str )
+    p.add_argument( '--euk_taxonomy', required = True, default=None, type=str )
+    #p.add_argument( '--mic_taxonomy', required = True, default=None, type=str )
+    p.add_argument('out', nargs='?', default=None, type=str,
+            help=   "the output summary file\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    cen2data = {}
+    etree = ppa.PpaTree( args['euk_taxonomy'] )
+    #vtree = ppa.PpaTree( args['vir_taxonomy'] )
+    #mtree = ppa.PpaTree( args['mic_taxonomy'] )
+    #vall = set([a.name for a in vtree.tree.get_terminals()])
+    #mall = set([int(a.name[3:]) for a in mtree.tree.get_terminals()])
+    eall = set([a.name for a in etree.tree.get_terminals()])
+
+    lin = (l.split('\t') for l in open(args['markers']))
+    for d1,cen,taxa,tax,d2,d3,coreness,d4,d5,d6,d7,tin,tout,d8,tsin,tsout in lin:
+        tsin,tsout = set(["t__"+a for a in tsin.strip().split(":") if a]),set(["t__"+b for b in tsout.strip().split(":") if b])
+        #tsin,tsout = set([int(a) for a in tsin.strip().split(":") if a]),set([int(b) for b in tsout.strip().split(":") if b])
+        cen2data[int(cen)] = {'taxa':int(taxa),'tax':tax,'coreness':float(coreness),'tsin':tsin,'tsout':tsout}
+
+
+    tax2cen = collections.defaultdict( set )
+    for k,v in cen2data.items():
+        tax2cen[v['tax']].add( k )
+   
+    for t,cs in tax2cen.items():
+        q0 = sorted([c for c in cs if not cen2data[c]['tsout']],key=lambda x:-cen2data[x]['coreness'])
+
+        """
+        ok,err = [],[] 
+        for c in cs:
+            if c in q0:
+                continue
+            vext = vall & cen2data[c]['tsout']
+            if vext: 
+                err.append( c )
+            else:
+                ok.append( c )
+
+        q10,q20 = {},{}
+        for c in ok:
+            q10[c] = mtree.lca( [str(tt) for tt in cen2data[c]['tsout'] if str(tt) in mall] ).full_name.split(".t__")[0]
+        for c in err:
+            q20[c] = mtree.lca( [str(tt) for tt in cen2data[c]['tsout'] if str(tt) in mall] ).full_name.split(".t__")[0]
+        """
+
+        def maxn(x,y): return x > 2000 
+
+        n = 0
+        for o in q0:
+            sys.stdout.write( "\t".join([ str(o),str(t),""]) + "\n" )
+            n += 1
+            if n > 2000:
+                break
+
+        if maxn( n, len(cs) ):
+            continue
+        
+        """
+        added = set()
+        for taxlev in "sgfoc":
+            for c in ok:
+                if True: # taxlev+"__" in q10[c]:
+                    if c not in added:
+                        sys.stdout.write( "\t".join([str(c),str(t),str(q10[c])]) + "\n"  )
+                        added.add(c)
+                        n += 1
+                if maxn( n, len(cs) ):
+                    break
+            for c in err:
+                if True: # taxlev+"__" in q20[c]:
+                    if c not in added:
+                        sys.stdout.write( "\t".join([str(c),str(t),str(q20[c])]) + "\n"  )
+                        added.add(c)
+                        n += 1
+                if maxn( n, len(cs) ):
+                    break
+            if maxn( n, len(cs) ):
+                break
+        """
+        ##print "+++++++++++++++++++++"
+
+        ##print t,len(cs),len(q0),err
diff --git a/choco_postprocess_viruses.py b/choco_postprocess_viruses.py
new file mode 100644
index 0000000..2f98c67
--- /dev/null
+++ b/choco_postprocess_viruses.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import pyphlan as ppa  
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
+
+    p.add_argument( '--markers', required = True, default=None, type=str )
+    p.add_argument( '--vir_taxonomy', required = True, default=None, type=str )
+    p.add_argument( '--mic_taxonomy', required = True, default=None, type=str )
+    p.add_argument('out', nargs='?', default=None, type=str,
+            help=   "the output summary file\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    cen2data = {}
+    vtree = ppa.PpaTree( args['vir_taxonomy'] )
+    mtree = ppa.PpaTree( args['mic_taxonomy'] )
+    vall = set([a.name for a in vtree.tree.get_terminals()])
+    #mall = set([int(a.name[3:]) for a in mtree.tree.get_terminals()])
+    mall = set([a.name for a in mtree.tree.get_terminals()])
+
+    lin = (l.split('\t') for l in open(args['markers']))
+    for d1,cen,taxa,tax,d2,d3,coreness,d4,d5,d6,d7,tin,tout,d8,tsin,tsout in lin:
+        tsin,tsout = set(["t__"+a for a in tsin.strip().split(":") if a]),set(["t__"+b for b in tsout.strip().split(":") if b])
+        #tsin,tsout = set([int(a) for a in tsin.strip().split(":") if a]),set([int(b) for b in tsout.strip().split(":") if b])
+        cen2data[int(cen)] = {'taxa':int(taxa),'tax':tax,'coreness':float(coreness),'tsin':tsin,'tsout':tsout}
+
+
+    tax2cen = collections.defaultdict( set )
+    for k,v in cen2data.items():
+        tax2cen[v['tax']].add( k )
+   
+    for t,cs in tax2cen.items():
+        q0 = sorted([c for c in cs if not cen2data[c]['tsout']],key=lambda x:-cen2data[x]['coreness'])
+
+        ok,err = [],[] 
+        for c in cs:
+            if c in q0:
+                continue
+            #print "======================",list(vall)[:3], list(cen2data[c]['tsout'])[:3]
+            vext = vall & cen2data[c]['tsout']
+            if vext: 
+                #print "ERRRRRRRRRRRRRRRRRRRR"
+                err.append( c )
+            else:
+                #print "OOOOOOOOOOOOOOOOOR"
+                ok.append( c )
+
+        q10,q20 = {},{}
+        for c in ok:
+
+            q10[c] = mtree.lca( [str(tt) for tt in cen2data[c]['tsout'] if str(tt) in mall] ).full_name.split(".t__")[0]
+            #print ["t__"+str(tt) for tt in cen2data[c]['tsout']]
+            #print list(vall)[:3]
+            #print ["t__"+str(tt) for tt in cen2data[c]['tsout'] if "t__"+str(tt) in mall]
+            #print "q1000000000000",q10[c], [str(tt) for tt in cen2data[c]['tsout']]
+        for c in err:
+            q20[c] = mtree.lca( [str(tt) for tt in cen2data[c]['tsout'] if str(tt) in mall] ).full_name.split(".t__")[0]
+            #print ["t__"+str(tt) for tt in cen2data[c]['tsout']]
+            #print "q2000000000000", q20[c], ["t__"+str(tt) for tt in cen2data[c]['tsout']]
+
+        def maxn(x,y): return x > 50 or ( x > 10 and y > 15 )
+
+        #print q0,q10,q20
+
+        n = 0
+        for o in q0:
+            sys.stdout.write( "\t".join([ str(o),str(t),""]) + "\n" )
+            n += 1
+            if n > 50:
+                break
+
+        if maxn( n, len(cs) ):
+            continue
+        ##print "============" 
+        added = set()
+        for taxlev in "sgfoc":
+            ##print "a"
+            for c in ok:
+                ##print "q",taxlev+"__", q10[c] 
+                if True: # taxlev+"__" in q10[c]:
+                    ##print taxlev+"__" in q10[c]
+                    if c not in added:
+                        sys.stdout.write( "\t".join([str(c),str(t),str(q10[c])]) + "\n"  )
+                        added.add(c)
+                        n += 1
+                if maxn( n, len(cs) ):
+                    break
+            ##print "b"
+            for c in err:
+                #print c,q20[c]
+                if True: # taxlev+"__" in q20[c]:
+                    if c not in added:
+                        sys.stdout.write( "\t".join([str(c),str(t),str(q20[c])]) + "\n"  )
+                        added.add(c)
+                        n += 1
+                if maxn( n, len(cs) ):
+                    break
+            if maxn( n, len(cs) ):
+                break
+
+        ##print "+++++++++++++++++++++"
+
+        ##print t,len(cs),len(q0),err
+
+
+    """
+    cscores = collections.defaultdict( set )
+    fwmarkers = {} 
+    maps = collections.defaultdict( set )
+    tree = ppa.PpaTree( args['taxonomy'] )
+    clades2terms = ppa.clades2terms( tree.tree ) 
+
+    clades2taxa = dict([(clade.full_name,set([-int(taxon.name[4:]) for taxon in taxa])) for clade,taxa in clades2terms.items()])
+    ntaxa = 0
+    for v in clades2terms.values():
+        ntaxa += len(v)
+
+    for l in open( args['cscores'] ):
+        gene_seed, clade, n, n_tot, coreness = l.strip().split('\t')
+        gene_seed, clade, n, n_tot, coreness = int(gene_seed), clade, int(n), int(n_tot), float(coreness)
+        cscores[gene_seed].add( (clade, n, n_tot, coreness) )
+    
+    for i,l in enumerate(open( args['fwmarkers'] )):
+        taxa_id, gene_seed, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness, ext_taxa = l.strip().split('\t')
+        taxa_id, gene_seed, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = int(taxa_id), int(gene_seed), int(n), int(n_tot), float(coreness), int(n_ext_seeds), int(n_ext_taxa), float(uniqueness)
+        fwmarkers[gene_seed] = (taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness)
+
+    for l in open( args['maps'] ):
+        line = l.strip().split('\t')
+        if len(line) < 2:
+            continue
+        fr,to = line
+        fr,to = int(fr.split("_")[0]), to # can be improved!!!!
+        maps[fr].add( to )
+
+    g2t,g2c,c2g = {},{},{}
+    if args['g2t']:
+        with open( args['g2t'] ) as inp:
+            g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+    elif args['t2g']:
+        with open( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    g2t[int(g)] = int(ll[0])
+        
+    genomes = set([g2t[g] for g in cscores]) 
+
+    with open( args['g2c'] ) as inp:
+        for l in inp:
+            line = list(l.strip().split('\t'))
+            #if int(line[0]) not in genomes:
+            #    continue
+            #vals = [int(a) for a in line if utils.is_number(a)]
+            vals = [a for a in line]
+            if len(vals) > 1:
+                g2c[int(vals[0])] = vals[1:]
+    for g,c in g2c.items():
+        for cc in c:
+            c2g[cc] = g
+    with utils.openw( args['out'] ) as out:
+        for gene_seed,cscores_t in cscores.items():
+            taxa = g2t[gene_seed]
+            for clade, n, n_tot, coreness in cscores_t:
+                out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" )
+            
+            # anche sotto ???
+
+            if gene_seed in fwmarkers:
+                taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed]
+                if uniqueness < 0.01:
+                     out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
+                                                      str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness)]) +"\n" ) 
+
+                if gene_seed in maps:
+                    ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]])
+                    ext_tax_ok = ext_tax & clades2taxa[clade]
+                    ext_tax_ko = ext_tax - clades2taxa[clade]
+                    ext_tax_okl = ":".join([str(s) for s in ext_tax_ok])
+                    ext_tax_kol = ":".join([str(s) for s in ext_tax_ko])
+                    muniq = 1.0 - float(len(ext_tax_kol)) / float(ntaxa)
+
+                    out.write( "\t".join(["MARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
+                                                   str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness), 
+                                                   str(len(ext_tax)), str(len(ext_tax_ok)), str(len(ext_tax_ko)), 
+                                                   str(muniq), ext_tax_okl, ext_tax_kol]) +"\n" ) 
+
+    """
diff --git a/choco_profile.py b/choco_profile.py
new file mode 100644
index 0000000..c289377
--- /dev/null
+++ b/choco_profile.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import pyphlan as ppa  
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
+
+    p.add_argument( '--cscores', required = True, default=None, type=str )
+    p.add_argument( '--fwmarkers', required = True, default=None, type=str )
+    p.add_argument( '--maps', required = True, default=None, type=str )
+    p.add_argument( '--taxonomy', required = True, default=None, type=str )
+    p.add_argument('--g2t', metavar="Mapping file from genes to taxa", default=None, type=str )
+    p.add_argument('--t2g', metavar="Mapping file from taxa to genes", default=None, type=str )
+    p.add_argument('--g2c', metavar="Mapping file from genomes to contigs", required = True, default=None, type=str )
+    p.add_argument('out', nargs='?', default=None, type=str,
+            help=   "the output summary file\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    cscores = collections.defaultdict( set )
+    fwmarkers = {} 
+    maps = collections.defaultdict( set )
+    tree = ppa.PpaTree( args['taxonomy'],  lev_sep = "|" )
+    clades2terms = ppa.clades2terms( tree.tree ) 
+
+    clades2taxa = dict([(clade.full_name,set([-int(taxon.name[4:]) for taxon in taxa])) for clade,taxa in clades2terms.items()])
+    ntaxa = 0
+    for v in clades2terms.values():
+        ntaxa += len(v)
+
+    for l in open( args['cscores'] ):
+        gene_seed, clade, n, n_tot, coreness = l.strip().split('\t')
+        gene_seed, clade, n, n_tot, coreness = int(gene_seed), clade, int(n), int(n_tot), float(coreness)
+        cscores[gene_seed].add( (clade, n, n_tot, coreness) )
+    
+    for i,l in enumerate(open( args['fwmarkers'] )):
+        taxa_id, gene_seed, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness, ext_taxa = l.strip().split('\t')
+        taxa_id, gene_seed, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = int(taxa_id), int(gene_seed), int(n), int(n_tot), float(coreness), int(n_ext_seeds), int(n_ext_taxa), float(uniqueness)
+        fwmarkers[gene_seed] = (taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness)
+
+    for l in open( args['maps'] ):
+        line = l.strip().split('\t')
+        if len(line) < 2:
+            continue
+        fr,to = line
+        fr,to = int(fr.split("_")[0]), to # can be improved!!!!
+        maps[fr].add( to )
+
+    g2t,g2c,c2g = {},{},{}
+    if args['g2t']:
+        with open( args['g2t'] ) as inp:
+            g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+    elif args['t2g']:
+        with open( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    g2t[int(g)] = int(ll[0])
+        
+    genomes = set([g2t[g] for g in cscores]) 
+
+    with open( args['g2c'] ) as inp:
+        for l in inp:
+            line = list(l.strip().split('\t'))
+            #if int(line[0]) not in genomes:
+            #    continue
+            #vals = [int(a) for a in line if utils.is_number(a)]
+            vals = [a for a in line]
+            if len(vals) > 1:
+                g2c[int(vals[0])] = vals[1:]
+    for g,c in g2c.items():
+        for cc in c:
+            c2g[cc] = g
+    with utils.openw( args['out'] ) as out:
+        for gene_seed,cscores_t in cscores.items():
+            taxa = g2t[gene_seed]
+            for clade, n, n_tot, coreness in cscores_t:
+                out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" )
+            
+            # anche sotto ???
+
+            if gene_seed in fwmarkers:
+                taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed]
+                if uniqueness < 0.01:
+                     out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
+                                                      str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness)]) +"\n" ) 
+
+                if gene_seed in maps:
+                    ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]])
+                    ext_tax_ok = ext_tax & clades2taxa[clade]
+                    ext_tax_ko = ext_tax - clades2taxa[clade]
+                    ext_tax_okl = ":".join([str(s) for s in ext_tax_ok])
+                    ext_tax_kol = ":".join([str(s) for s in ext_tax_ko])
+                    muniq = 1.0 - float(len(ext_tax_kol)) / float(ntaxa)
+
+                    out.write( "\t".join(["MARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
+                                                   str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness), 
+                                                   str(len(ext_tax)), str(len(ext_tax_ok)), str(len(ext_tax_ko)), 
+                                                   str(muniq), ext_tax_okl, ext_tax_kol]) +"\n" ) 
+
+    """
+    gid2cores = collections.defaultdict( set )
+    with utils.openr(args['cg']) as inp:
+        for line in (l.split('\t') for l in inp):
+            if int(line[0]) > 0:
+                gid,clade,ncore,ngenomes,pv =  line[:5]
+            else:
+                gid,clade,ncore,ngenomes,pv =  line[1:6]
+            gid2cores[gid].add( (clade,ncore,ngenomes,pv) )
+
+    clades2cores = collections.defaultdict( set )
+    for k,v in gid2cores.items():
+        if len(v) > 1:
+            continue
+        clades2cores[list(v)[0][0]].add( k )
+
+    with utils.openw(args['cgs']) as out:
+        for k,v in sorted(clades2cores.items(),key=lambda x:-len(x[1])):
+            out.write( "\t".join([k,str(len(v))]) +"\n" )
+    """
diff --git a/choco_sam2choco.py b/choco_sam2choco.py
new file mode 100644
index 0000000..033f42d
--- /dev/null
+++ b/choco_sam2choco.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import sys
+import collections
+import utils
+import cPickle as pickle
+import pyphlan as ppa  
+import argparse as ap
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Reduce Sam for ChocoPhlAn\n')
+    p.add_argument( 'sam', nargs='+', type=str )
+    p.add_argument( '--out', required = False, default=None, type=str )
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    out = utils.openw( args['out'] )
+    for f in args['sam']:
+       with utils.openr( f ) as inp:
+            lines = (l.split('\t') for l in inp)
+            for l in lines:
+                out.write(l[0]+"\t"+l[2]+"\t"+l[11][5:] + "\n" )
+                #out.write(l[0]+"\t"+l[2]+"\t"+[ll for ll in l[3:] if "AS:i:" in ll][0][5:] + "\n" )
+    out.close()
+
+
diff --git a/choco_summary.py b/choco_summary.py
new file mode 100644
index 0000000..457dc50
--- /dev/null
+++ b/choco_summary.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import pyphlan as ppa  
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
+
+    p.add_argument( '--cscores', required = True, default=None, type=str )
+    p.add_argument( '--fwmarkers', required = True, default=None, type=str )
+    p.add_argument( '--maps', required = True, default=None, type=str )
+    p.add_argument( '--taxonomy', required = True, default=None, type=str )
+    p.add_argument('--g2t', metavar="Mapping file from genes to taxa", default=None, type=str )
+    p.add_argument('--t2g', metavar="Mapping file from taxa to genes", default=None, type=str )
+    p.add_argument('--g2c', metavar="Mapping file from genomes to contigs", required = True, default=None, type=str )
+    p.add_argument('out', nargs='?', default=None, type=str,
+            help=   "the output summary file\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    cscores = collections.defaultdict( set )
+    fwmarkers = {} 
+    maps = collections.defaultdict( set )
+    tree = ppa.PpaTree( args['taxonomy'] )
+    clades2terms = ppa.clades2terms( tree.tree ) 
+
+    clades2taxa = dict([(clade.full_name,set([-int(taxon.name[4:]) for taxon in taxa])) for clade,taxa in clades2terms.items()])
+
+    for l in open( args['cscores'] ):
+        gene_seed, clade, n, n_tot, coreness = l.strip().split('\t')
+        gene_seed, clade, n, n_tot, coreness = int(gene_seed), clade, int(n), int(n_tot), float(coreness)
+        cscores[gene_seed].add( (clade, n, n_tot, coreness) )
+    
+    for i,l in enumerate(open( args['fwmarkers'] )):
+        taxa_id, gene_seed, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness, ext_taxa = l.strip().split('\t')
+        taxa_id, gene_seed, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = int(taxa_id), int(gene_seed), int(n), int(n_tot), float(coreness), int(n_ext_seeds), int(n_ext_taxa), float(uniqueness)
+        fwmarkers[gene_seed] = (taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness)
+
+    for l in open( args['maps'] ):
+        line = l.strip().split('\t')
+        if len(line) < 2:
+            continue
+        fr,to = line
+        fr,to = int(fr.split("_")[0]), int(to) # can be improved!!!!
+        maps[fr].add( to )
+
+    g2t,g2c,c2g = {},{},{}
+    if args['g2t']:
+        with open( args['g2t'] ) as inp:
+            g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+    elif args['t2g']:
+        with open( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    g2t[int(g)] = int(ll[0])
+        
+    genomes = set([g2t[g] for g in cscores]) 
+
+    with open( args['g2c'] ) as inp:
+        for l in inp:
+            line = list(l.strip().split('\t'))
+            #if int(line[0]) not in genomes:
+            #    continue
+            vals = [int(a) for a in line if utils.is_number(a)]
+            if len(vals) > 1:
+                g2c[vals[0]] = vals[1:]
+    for g,c in g2c.items():
+        for cc in c:
+            c2g[cc] = g
+    with utils.openw( args['out'] ) as out:
+        for gene_seed,cscores_t in cscores.items():
+            taxa = g2t[gene_seed]
+            for clade, n, n_tot, coreness in cscores_t:
+                out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" )
+            
+            # anche sotto ???
+
+            if gene_seed in fwmarkers:
+                taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed]
+                if uniqueness < 0.01:
+                     out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
+                                                      str(n_ext_seeds), str(n_ext_taxa), str(uniqueness)]) +"\n" ) 
+
+                if gene_seed in maps:
+                    ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]])
+                    ext_tax_ok = ext_tax & clades2taxa[clade]
+                    ext_tax_ko = ext_tax - clades2taxa[clade]
+                    ext_tax_okl = ":".join([str(s) for s in ext_tax_ok])
+                    ext_tax_kol = ":".join([str(s) for s in ext_tax_ko])
+
+                    out.write( "\t".join(["MARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
+                                                   str(n_ext_seeds), str(n_ext_taxa), str(uniqueness), 
+                                                   str(len(ext_tax)), str(len(ext_tax_ok)), str(len(ext_tax_ko)), 
+                                                   ext_tax_okl, ext_tax_kol]) +"\n" ) 
+
+    """
+    gid2cores = collections.defaultdict( set )
+    with utils.openr(args['cg']) as inp:
+        for line in (l.split('\t') for l in inp):
+            if int(line[0]) > 0:
+                gid,clade,ncore,ngenomes,pv =  line[:5]
+            else:
+                gid,clade,ncore,ngenomes,pv =  line[1:6]
+            gid2cores[gid].add( (clade,ncore,ngenomes,pv) )
+
+    clades2cores = collections.defaultdict( set )
+    for k,v in gid2cores.items():
+        if len(v) > 1:
+            continue
+        clades2cores[list(v)[0][0]].add( k )
+
+    with utils.openw(args['cgs']) as out:
+        for k,v in sorted(clades2cores.items(),key=lambda x:-len(x[1])):
+            out.write( "\t".join([k,str(len(v))]) +"\n" )
+    """
diff --git a/compare_clusters.py b/compare_clusters.py
new file mode 100644
index 0000000..5125a52
--- /dev/null
+++ b/compare_clusters.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import numpy as np
+from Bio import SeqIO 
+import subprocess
+from Bio.Align.Applications import MuscleCommandline
+from Bio import AlignIO
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert core gene files to core gene summaries\n')
+
+    p.add_argument( '-1', default=None, type=str,
+            help=   "Cluster file 1")
+    p.add_argument('-2', default=None, type=str,
+            help=   "Cluster file 2")
+    p.add_argument('-i', default=None, type=str,
+            help=   "Input sequences")
+
+    return vars( p.parse_args() )
+
+def get_clusters( clf ):
+    with utils.openr(clf) as inp:
+        return dict([(i,{'cl':set(l.strip().split('\t'))}) for i,l in enumerate(inp)])
+    return None
+
+def add_sequences( cl, ffn ):
+    for i,c in cl.items():
+        seqs = [ffn[idf] for idf in c['cl']]
+        cl[i]['ffn'] = seqs
+        print sorted([len(s) for s in seqs])
+        sys.stdin.readline()
+    return cl
+
+def edit_distance( x, y ):
+
+    start, stop = None, None
+    for i,(a,b) in enumerate(zip(x,y)):
+        if a != '-' and b != '-':
+            start = i
+            break
+    for i,(a,b) in enumerate(zip(x,y)[::-1]):
+        if a != '-' and b != '-':
+            stop = i
+            break
+
+    eql = len([1 for a,b in zip(x[start:-stop],y[start:-stop]) if a == b])
+
+    return float(eql)/(len(x)-(stop+start))
+
+def add_alignments( cl ):
+    for i,c in cl.items():
+        cline = MuscleCommandline(clwstrict=True)
+        child = subprocess.Popen( str(cline),
+                                  stdin=subprocess.PIPE,
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE,
+                                  universal_newlines=True,
+                                  shell=(sys.platform!="win32")) 
+        SeqIO.write(c['ffn'], child.stdin, "fasta")
+        child.stdin.close()
+        align = AlignIO.read(child.stdout, "clustal")
+        cl[i]['aln'] = align
+
+        for a in align:
+            for b in align:
+                print edit_distance(str(a.seq),str(b.seq))
+                print a.id
+                print str(a.seq)
+                print b.id
+                print str(b.seq)
+        sys.exit()
+
+    return cl
+        
+
+
+    
+def stats( cl ):
+    lens = [len(c['cl']) for c in cl.values()]
+    avg, stdev = np.mean(lens), np.std(lens)
+    singletons = len([1 for l in lens if l == 1])
+    print avg, stdev, len(lens), singletons
+    for l in sorted(set(lens),reverse=True):
+        print l, lens.count(l)
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    cl1 = get_clusters( args['1'] )
+    cl2 = get_clusters( args['2'] )
+
+    stats(cl1)
+    stats(cl2)
+
+    ffn = SeqIO.to_dict(SeqIO.parse( args['i'], "fasta"))
+
+    cl1 = add_sequences( cl1, ffn ) 
+    cl2 = add_sequences( cl2, ffn )
+
+    cl2 = add_alignments( cl2 )
+
+    """
+    equal = []
+    for c1 in cl1:
+        for c2 in cl2:
+            if len(c1) == len(c2) and len(c1) == len(c1 & c2):
+                equal.append(c1)
+    print len(equal)
+
+    for c1 in cl1:
+        for c2 in cl2:
+            if not (len(c1) == len(c2) and len(c1) == len(c1 & c2)):
+                if c1 <= c2:
+                    print c1, " contained in ", c2
+    """
diff --git a/conv_cg2cgs.py b/conv_cg2cgs.py
new file mode 100644
index 0000000..19d337c
--- /dev/null
+++ b/conv_cg2cgs.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert core gene files to core gene summaries\n')
+
+    p.add_argument( 'cg', nargs='?', default=None, type=str,
+            help=   "the input cg file [stdin if not present]")
+    p.add_argument('cgs', nargs='?', default=None, type=str,
+            help=   "the output summary file\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    gid2cores = collections.defaultdict( set )
+    #with (open(args['uc']) if args['uc'] else sys.stdin) as inp:
+    with utils.openr(args['cg']) as inp:
+        for line in (l.split('\t') for l in inp):
+            #if int(line[0]) > 0:
+            gid,clade,ncore,ngenomes,pv =  line[:5]
+            #else:
+            #    gid,clade,ncore,ngenomes,pv =  line[1:6]
+            gid2cores[gid].add( (clade,ncore,ngenomes,pv) )
+
+    clades2cores = collections.defaultdict( set )
+    for k,v in gid2cores.items():
+        if len(v) > 1:
+            continue
+        clades2cores[list(v)[0][0]].add( k )
+
+    #openw = bz2.BZ2File if args['txt'].endswith(".bz2") else open
+    with utils.openw(args['cgs']) as out:
+        for k,v in sorted(clades2cores.items(),key=lambda x:-len(x[1])):
+            out.write( "\t".join([k,str(len(v))]) +"\n" )
+
diff --git a/conv_cg2mg.py b/conv_cg2mg.py
new file mode 100644
index 0000000..3218757
--- /dev/null
+++ b/conv_cg2mg.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import cPickle as pickle
+import utils
+import bz2
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Add markerness'
+            ' to core gene file\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input ctxt file [stdin if not present]")
+    p.add_argument('--b6o', metavar="The outfmt6 file for the cores",
+            default=None, type=str )
+    p.add_argument('-n', metavar="Total number of target sets (total targets in the b6o file if unspecified)",
+            default=None, required = True, type=int )
+    p.add_argument('--g2t', metavar="Mapping file from genes to taxa",
+            default=None, type=str )
+    p.add_argument('--t2g', metavar="Mapping file from taxa to genes",
+            default=None, type=str )
+    p.add_argument('--pkl', metavar="The compressed pickled output",
+            default=None, type=str )
+    p.add_argument('mtxt', nargs='?', default=None, type=str,
+            help=   "the output mtxt file, compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+   
+    if not args['g2t'] and not args['t2g']:
+        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
+        sys.exit(0)
+    g2t = {}
+    if args['g2t']:
+        with open( args['g2t'] ) as inp:
+            g2t = dict(([a for a in l.strip().split('\t')] for l in inp))
+            #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+    elif args['t2g']:
+        with open( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    g2t[g] = ll[0]
+    
+    all_fr = set()
+    with utils.openr( args['ctxt'] ) as inp:
+        valin = (l.strip().split('\t') for l in inp)
+
+        g2c = collections.defaultdict( set )
+        
+        if args['b6o']:
+            inp_mat = ((a,b) for a,b in (l.rstrip('\n').split("\t")[:2] for l in utils.openr(args['b6o'])))
+    
+            #all_targets = set()
+            for fr,to in inp_mat:
+                all_fr.add( fr )
+                #all_targets.add( to )
+                if fr != to:
+                    g2c[fr].add( to )
+
+        n = args['n'] # if args['n'] else len(all_targets)
+        n = float(n)
+   
+        out_dict = collections.defaultdict( dict ) 
+        all_fr_taxa = set([g2t[tt] for tt in all_fr])
+
+        with utils.openw(args['mtxt']) as out:
+            last,lastv = "",[]
+            outbuf = []
+            gt = None
+
+            outtmp = [] 
+
+            for v in valin:
+                gt = v[0]
+                if last == gt:
+                    outtmp.append( v )
+                    continue
+                    
+                if len(outtmp) == 1:
+                    #print outtmp
+                    outbuf.append( outtmp[0] )
+                elif len(outtmp) > 1:
+                    #print outtmp
+                    outtmp2 = [o for o in outtmp if "_sp_" not in o[1] and "_bacterium_" not in o[1] and "_unclass" not in o[1]]
+                    if len(outtmp2) == 1:
+                        outbuf.append( outtmp2[0] )
+                    elif len(outtmp2) > 1:
+                        outtmp3 = sorted(outtmp2,key=lambda x:-int(x[2]))
+                        cur = int(outtmp3[0][2])
+                        other = sum([int(vv[2]) for vv in outtmp3[1:]])
+                        #print cur, other, outtmp3[0]
+                        if float(cur)/float(other) > 2.5 and float(other) < 10:
+                            outbuf.append( outtmp3[0] )
+                
+                last = gt
+                outtmp = [v]
+
+            if last and last != gt:
+                outbuf.append( lastv )
+            """
+            for v in valin:
+                gt = v[0]
+                if last == gt:
+                    lastv = ""
+                    continue
+                if lastv:
+                    outbuf.append( lastv )
+                last = gt
+                lastv = v
+            if last and last != gt:
+                outbuf.append( lastv )
+            """
+            for v in outbuf:
+                fr = v[0]
+                frt = g2t[fr]
+                targett = list(set([g2t[s] for s in g2c[fr]])-all_fr_taxa) if args['b6o'] else []
+                nu = len(g2c[fr]) - len(targett) if args['b6o'] else 0
+                targets = ":".join([str(s) for s in targett]) if targett else "-"
+                uniqueness = round(float(len(targett)) / n,3)
+                out.write( "\t".join([str(g2t[fr])]+v+[str(nu),str(len(targett)),str(uniqueness),targets]) +"\n" )
+                
+                toadd = {   'gid' : v[0], 'tid' : g2t[fr], 'n_cores' : v[2], 'coreness' : float(v[4]), 
+                            'uniqueness' : float(uniqueness),
+                            'targets' : targett }
+                out_dict[v[1]][v[0]] = toadd
+
+            if args['pkl']:
+                outf = open( args['pkl'], 'wb' ) 
+                # pickle.loads( bz2.decompress( open("/tmp/a.pkl").read() ) )
+                outf.write( bz2.compress(pickle.dumps(out_dict,pickle.HIGHEST_PROTOCOL)) )
+                outf.close()
+
diff --git a/conv_ctxt2coretxt.py b/conv_ctxt2coretxt.py
new file mode 100644
index 0000000..8b2867e
--- /dev/null
+++ b/conv_ctxt2coretxt.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert core gene txt file'
+            ' substituting gene IDs with genomes IDs\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input ctxt file [stdin if not present]")
+    p.add_argument('--g2t', metavar="Mapping file from genes to taxa",
+            default=None, type=str )
+    p.add_argument('--t2g', metavar="Mapping file from taxa to genes",
+            default=None, type=str )
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output gtxt file, compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    p.add_argument('-n', default=1, type=int )
+    p.add_argument('-c', default=1, type=int )
+    p.add_argument('--out_taxa', default=0, type=int )
+    p.add_argument('--sk', action='store_true' )
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+    
+    gint = str if args['sk'] else int
+
+    if not args['g2t'] and not args['t2g']:
+        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
+        sys.exit(0)
+    g2t = {}
+
+
+    if args['g2t']:
+        with utils.openr( args['g2t'] ) as inp:
+            #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+            for l in inp:
+                f,t = l.strip().split('\t')
+                g2t[gint(f)] = gint(t)
+    elif args['t2g']:
+        with utils.openr( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    g2t[gint(g)] = gint(ll[0])
+    
+    with utils.openw(args['txt']) as out:
+        with utils.openr( args['ctxt'] ) as inp:
+            for l in inp:
+                valin = [gint(a) for a in l.strip().split('\t')]
+
+                if len(valin) < args['n']:
+                    continue
+
+                ko = False
+                genes2taxa = collections.defaultdict(list)
+                for vv in valin:
+                    genes2taxa[g2t[vv]].append(vv)
+                    if len( genes2taxa[g2t[vv]] ) > args['c']:
+                        ko = True
+                        break
+
+                if len(genes2taxa) < args['n']:
+                    continue
+
+                if ko:
+                    continue
+
+                valin = list(valin) 
+   
+                if args['out_taxa']:
+                    out.write( "\t".join([str(g2t[s]) for s in valin]) +"\n" ) 
+                else:    
+                    out.write( "\t".join([str(s) for s in valin]) +"\n" )
+
+
+
diff --git a/conv_ctxt2coretxt_withexlusion.py b/conv_ctxt2coretxt_withexlusion.py
new file mode 100644
index 0000000..1a850a5
--- /dev/null
+++ b/conv_ctxt2coretxt_withexlusion.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert core gene txt file'
+            ' substituting gene IDs with genomes IDs\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input ctxt file [stdin if not present]")
+    p.add_argument('--g2t', metavar="Mapping file from genes to taxa",
+            default=None, type=str )
+    p.add_argument('--t2g', metavar="Mapping file from taxa to genes",
+            default=None, type=str )
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output gtxt file, compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    #p.add_argument('-n', default=1, type=int )
+    p.add_argument('--ncores', default=1, type=int )
+    p.add_argument('-c', default=1, type=int )
+    p.add_argument('--out_taxa', default=0, type=int )
+    p.add_argument('--max_tot_repeats', default=0, type=int )
+    p.add_argument('--removed_taxa', required = True, type=str )
+    p.add_argument('--sk', action='store_true' )
+
+    return vars( p.parse_args() )
+
+
+def k_subsets_i(n, k):
+    '''
+    Yield each subset of size k from the set of intergers 0 .. n - 1
+    n -- an integer > 0
+    k -- an integer > 0
+    '''
+    # Validate args
+    if n < 0:
+        raise ValueError('n must be > 0, got n=%d' % n)
+    if k < 0:
+        raise ValueError('k must be > 0, got k=%d' % k)
+    # check base cases
+    if k == 0 or n < k:
+        yield set()
+    elif n == k:
+        yield set(range(n))
+    else:
+        # Use recursive formula based on binomial coeffecients:
+        # choose(n, k) = choose(n - 1, k - 1) + choose(n - 1, k)
+        for s in k_subsets_i(n - 1, k - 1):
+            s.add(n - 1)
+            yield s
+        for s in k_subsets_i(n - 1, k):
+            yield s
+
+def k_subsets(s, k):
+    '''
+    Yield all subsets of size k from set (or list) s
+    s -- a set or list (any iterable will suffice)
+    k -- an integer > 0
+    '''
+    s = list(s)
+    n = len(s)
+    for k_set in k_subsets_i(n, k):
+        yield set([s[i] for i in k_set])
+
+def count_ncores( pang, to_excl, tot ):
+    ntot = tot - len(to_excl)
+    n = 0
+    for k,v in pang.items():
+        if len(v) < ntot:
+            continue
+        if len( v - to_excl ) >= ntot:
+            n += 1
+    return n
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+    
+    gint = str if args['sk'] else int
+
+    if not args['g2t'] and not args['t2g']:
+        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
+        sys.exit(0)
+    g2t = {}
+
+
+    if args['g2t']:
+        with utils.openr( args['g2t'] ) as inp:
+            #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+            for l in inp:
+                f,t = l.strip().split('\t')
+                g2t[gint(f)] = gint(t)
+    elif args['t2g']:
+        with utils.openr( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    g2t[gint(g)] = gint(ll[0])
+
+    pangenome = {}
+    all_genomes = set()
+    with utils.openr( args['ctxt'] ) as inp:
+        for l in inp:
+            line = l.strip().split('\t')
+            genomes = set([g2t[al] for al in line])
+            pangenome[line[0]] = genomes
+            all_genomes |= genomes
+    #print all_genomes 
+    #print pangenome
+    nn = count_ncores( pangenome, set(), len(all_genomes) )
+    run = 1
+    torem = set()
+    while nn < args['ncores']:
+        nn = 0
+        for g in k_subsets( all_genomes, run ):
+            newn = count_ncores( pangenome, g, len(all_genomes) )
+            if newn > nn:
+                nn = newn
+                torem |= g 
+        run += 1
+    #print nn
+
+    with utils.openw(args['removed_taxa']) as out:
+        for a in list(torem):
+            out.write( a + "\n" )
+
+    ntaxa = len(all_genomes) - len(torem)
+
+    with utils.openw(args['txt']) as out:
+        with utils.openr( args['ctxt'] ) as inp:
+            for l in inp:
+                valin = [gint(a) for a in l.strip().split('\t') if g2t[a] not in torem]
+                if len(valin) < ntaxa:
+                    continue
+                ko = False
+                genes2taxa = collections.defaultdict(list)
+                for vv in valin:
+                    genes2taxa[g2t[vv]].append( vv )
+                    if len( genes2taxa[g2t[vv]] ) > args['c']:
+                        ko = True
+                        break
+                    #totlen += len(genes2taxa[g2t[vv]])
+                    #genes2taxa[g2t[vv]] = genes2taxa[g2t[vv]][:1]
+
+                if not ko:
+                    totlen = 0
+                    for k1, v1 in genes2taxa.items():
+                        totlen += len( v1 )
+                        genes2taxa[k1] = v1[:1]
+                    if totlen - ntaxa > args['max_tot_repeats']:
+                        ko = True
+
+                if len(genes2taxa) < ntaxa:
+                    continue
+
+                if ko:
+                    continue
+                valin = list(valin) 
+   
+                if args['out_taxa']:
+                    #out.write( "\t".join([str(g2t[s]) for s in valin]) +"\n" ) 
+                    out.write( "\t".join([str(s) for s in genes2taxa.keys()]) +"\n" ) 
+                else:    
+                    out.write( "\t".join([str(s[0]) for s in genes2taxa.values()]) +"\n" )
+
+
+
diff --git a/conv_ctxt2mat.py b/conv_ctxt2mat.py
new file mode 100644
index 0000000..1fb952b
--- /dev/null
+++ b/conv_ctxt2mat.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+    import random
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert Usearch ".uc" files in tab-delimited'
+            ' files with the seed as first field followed by the other IDs\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input uc file [stdin if not present]")
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output txt file compresse if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    p.add_argument('--subsample', metavar="Subsampling rate",
+            default=1.0, type=float )
+    p.add_argument('-n', action='store_true' )
+    p.add_argument('-p', metavar="Prefix for taxon names",
+            default="", type=str )
+    p.add_argument('--sk', action='store_true' )
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+    
+    gint = str if args['sk'] else int
+
+    valin = []
+    with utils.openr( args['ctxt'] ) as inp:
+        for l in inp:
+            tset = set([gint(a) for a in l.strip().split('\t')][1:])
+            valin.append(tset)
+    all_t = set()
+    for v in valin:
+        all_t |= v
+
+    res = {}
+    for t in all_t:
+        #if len(t) < args['n']:
+        #    continue
+        res[t] = [int(t in v) for v in valin]
+
+    with utils.openw(args['txt']) as out:
+        mat_res = {}
+        for k1 in res:
+            mat_res[k1] = {}
+            for k2 in res:
+                mat_res[k1][k2] = float(len([1 for x,y in zip(res[k1],res[k2]) if x == 1 and y == 1]))/max(sum(res[k1]),sum(res[k2]))
+
+        keys = sorted(res)
+        out.write( "\t".join( ["x"]+keys ) +"\n" )
+        for k in keys:
+            out.write( "\t".join( [k] + [str(mat_res[k][k2]) for k2 in keys] ) +"\n" )
+
+
+        """
+        n = len(res.values()[0])
+        n_s = int(float(n)*args['subsample'])
+        indok = set(random.sample( list(range(n)), n_s))
+        out.write( "\t".join(['genes']+["g"+str(v) for v in range(n)]) + "\n" )
+
+        for k,v in res.items():
+            out.write( args['p'] + str(k)+"\t"+"\t".join([str(s) for i,s in enumerate(v) if i in indok]) +"\n" )
+        """
diff --git a/conv_ctxt2rxl.py b/conv_ctxt2rxl.py
new file mode 100644
index 0000000..17c3d0b
--- /dev/null
+++ b/conv_ctxt2rxl.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+    import random
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert Usearch ".uc" files in tab-delimited'
+            ' files with the seed as first field followed by the other IDs\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input uc file [stdin if not present]")
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output txt file compresse if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    p.add_argument('--subsample', metavar="Subsampling rate",
+            default=1.0, type=float )
+    p.add_argument('-n', metavar="Minimum number of matching taxa",
+            default=0, type=int )
+    p.add_argument('-p', metavar="Prefix for taxon names",
+            default="", type=str )
+    p.add_argument('--sk', action='store_true' )
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+    
+    gint = str if args['sk'] else int
+
+    valin = []
+    with utils.openr( args['ctxt'] ) as inp:
+        for l in inp:
+            tset = set([gint(a) for a in l.strip().split('\t')][1:])
+            if len(tset) < args['n']:
+                continue
+            valin.append(tset)
+    all_t = set()
+    for v in valin:
+        all_t |= v
+
+    res = {}
+    for t in all_t:
+        #if len(t) < args['n']:
+        #    continue
+        res[t] = [int(t in v) for v in valin]
+
+    with utils.openw(args['txt']) as out:
+        n = len(res.values()[0])
+        n_s = int(float(n)*args['subsample'])
+        out.write( str(len(res))+" "+str(n_s)+"\n" )
+        indok = set(random.sample( list(range(n)), n_s))
+
+        for k,v in res.items():
+            if isinstance(k,basestring) and len(k) > 15:
+                k = k[:14]
+            out.write( args['p'] + str(k)+" "*(15-len(str(k)[1:]))+"".join([str(s) for i,s in enumerate(v) if i in indok]) +"\n" )
diff --git a/conv_ctxt2ttxt.py b/conv_ctxt2ttxt.py
new file mode 100644
index 0000000..49efeff
--- /dev/null
+++ b/conv_ctxt2ttxt.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert core gene txt file'
+            ' substituting gene IDs with genomes IDs\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input ctxt file [stdin if not present]")
+    p.add_argument('--g2t', metavar="Mapping file from genes to taxa",
+            default=None, type=str )
+    p.add_argument('-n', metavar="Minimum size of cluster to report (def 1, means all)",
+            default=1, type=int )
+    p.add_argument('--t2g', metavar="Mapping file from taxa to genes",
+            default=None, type=str )
+    p.add_argument('--sk', action='store_true' )
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output gtxt file, compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+
+    gint = str if args['sk'] else int
+
+    if not args['g2t'] and not args['t2g']:
+        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
+        sys.exit(0)
+    g2t = {}
+
+    vals, all_val = [], set()
+    with utils.openr( args['ctxt'] ) as inp:
+        for l in inp:
+            valin = [gint(a) for a in l.strip().split('\t')]
+            vals.append( valin )
+            all_val |= set(valin)
+
+    if args['g2t']:
+        with utils.openr( args['g2t'] ) as inp:
+            #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
+            for l in inp:
+                f,t = l.strip().split('\t')
+                if gint(f) not in all_val:
+                    continue
+                g2t[gint(f)] = gint(t)
+    elif args['t2g']:
+        with utils.openr( args['t2g'] ) as inp:
+            for ll in (l.strip().split('\t') for l in inp):
+                for g in ll[1:]:
+                    if gint(g) not in all_val:
+                        continue
+                    g2t[gint(g)] = gint(ll[0])
+    
+    #for v in valin:
+    #    valin[j] = [v[0]]+list(set(g2t[vv] for vv in v))
+
+    with utils.openw(args['txt']) as out:
+        for valin in vals:
+            #valin = [gint(a) for a in l.strip().split('\t')]
+            valin = [valin[0]]+list(set(g2t[vv] for vv in valin)) 
+   
+            #for v in sorted(valin,key=lambda x:-len(x)):
+            if len(valin) > args['n']:
+                out.write( "\t".join([str(s) for s in valin]) +"\n" )
+
+
+
diff --git a/conv_ctxt2txt.py b/conv_ctxt2txt.py
new file mode 100644
index 0000000..4fc5a30
--- /dev/null
+++ b/conv_ctxt2txt.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+    import random
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert Usearch ".uc" files in tab-delimited'
+            ' files with the seed as first field followed by the other IDs\n')
+
+    p.add_argument( 'ctxt', nargs='?', default=None, type=str,
+            help=   "the input uc file [stdin if not present]")
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output txt file compresse if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    p.add_argument('--subsample', metavar="Subsampling rate",
+            default=1.0, type=float )
+    p.add_argument('-n', metavar="Minimum number of matching taxa",
+            default=None, type=int )
+    p.add_argument('-m', metavar="Minimum number of matching taxa",
+            default=0, type=int )
+    p.add_argument('-M', metavar="Minimum number of matching taxa",
+            default=None, type=int )
+    p.add_argument('-p', metavar="Prefix for taxon names",
+            default="", type=str )
+    p.add_argument('--sk', action='store_true' )
+    p.add_argument('-s', action='store_true' )
+    p.add_argument('--original_gene_names', action='store_true' )
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+    
+    gint = str if args['sk'] else int
+
+    valin, kin = [], []
+    with utils.openr( args['ctxt'] ) as inp:
+        for l in inp:
+            tset = set([gint(a) for a in l.strip().split('\t')][1:])
+            if len(tset) < args['n']:
+                continue
+            valin.append(tset)
+            kin.append( l.strip().split('\t')[0] )
+    all_t = set()
+    for v in valin:
+        all_t |= v
+
+    res = {}
+    for t in all_t:
+        #if len(t) < args['n']:
+        #    continue
+        res[t] = [int(t in v) for v in valin]
+
+    if args['s']:
+        keys = sorted( res.keys() )
+        v = ["".join([str(vvv) for vvv in vv]) for vv in zip(*[res[k] for k in keys])]
+        resk = collections.defaultdict( int )
+        for vv in v:
+            resk[ vv ] += 1
+        res = [r for r in [ [int(kk)*v for kk in k] for k,v in resk.items()] if max(r) > args['m']]
+        res = dict([(keys[i],v) for i,v in enumerate(zip(*res))])
+
+    with utils.openw(args['txt']) as out:
+        n = len(res.values()[0])
+        n_s = int(float(n)*args['subsample'])
+        indok = set(random.sample( list(range(n)), n_s))
+
+        torem = set()
+        if not args['s'] and args['m']:
+            torem = set([i for i,l in enumerate(zip(*res.values())) if sum(l) < args['m'] or ( args['M'] is not None and sum(l) >= args['M']) ])
+        if args['original_gene_names']:
+            out.write( "\t".join(['genes']+[kin[v] for v in range(n) if v not in torem]) + "\n" )
+        else:
+            out.write( "\t".join(['genes']+["g"+str(v) for v in range(n) if v not in torem]) + "\n" )
+
+        for k,v in res.items():
+            out.write( args['p'] + str(k)+"\t"+"\t".join([str(s) for i,s in enumerate(v) if i in indok and i not in torem]) +"\n" )
+
+
+
+
+
+
+
+
+
diff --git a/conv_fna2rxl.py b/conv_fna2rxl.py
new file mode 100644
index 0000000..acf0cee
--- /dev/null
+++ b/conv_fna2rxl.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+    import random
+    from Bio import SeqIO 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert fasta files files in tab-delimited'
+            ' files with the seed as first field followed by the other IDs\n')
+
+    p.add_argument( 'fna', nargs='?', default=None, type=str,
+            help=   "the input uc file [stdin if not present]")
+    p.add_argument('rxl', nargs='?', default=None, type=str,
+            help=   "the output txt file compresse if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    """
+    p.add_argument('--subsample', metavar="Subsampling rate",
+            default=1.0, type=float )
+    p.add_argument('-n', metavar="Minimum number of matching taxa",
+            default=0, type=int )
+    p.add_argument('-p', metavar="Prefix for taxon names",
+            default="", type=str )
+    """
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+
+    fna = SeqIO.to_dict(SeqIO.parse( utils.openr(args['fna']), "fasta"))
+
+    with utils.openw(args['rxl']) as out:
+        n = len(fna.values()[0])
+        out.write( str(len(fna))+" "+str(n)+"\n" )
+
+        for k,v in fna.items():
+            if len(k) > 14:
+                k = k[:14]
+            out.write( str(k)+" "*(15-len(str(k)[1:]))+str(v.seq) +"\n" )
diff --git a/conv_seqs.py b/conv_seqs.py
new file mode 100644
index 0000000..3ab48e5
--- /dev/null
+++ b/conv_seqs.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+    import random
+    from Bio import SeqIO 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='')
+
+    p.add_argument( '-i', nargs='?', default=None, type=str,
+            help=   "the input file [stdin if not present]")
+    p.add_argument( '--if', nargs='?', default='fastq', type=str,
+            help=   "the input format [default fastq]")
+    p.add_argument('--of', nargs='?', default='fasta', type=str,
+            help=   "the output format [default fastqq]")
+
+    """
+    p.add_argument('--subsample', metavar="Subsampling rate",
+            default=1.0, type=float )
+    p.add_argument('-n', metavar="Minimum number of matching taxa",
+            default=0, type=int )
+    p.add_argument('-p', metavar="Prefix for taxon names",
+            default="", type=str )
+    """
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+
+    with utils.openw( None ) as outf:
+        for seq in SeqIO.parse( utils.openr( args['i'], mode = 'rb' ), args['if']):
+            SeqIO.write(seq, outf, args['of']) 
diff --git a/conv_t2g_2_g2t.py b/conv_t2g_2_g2t.py
new file mode 100644
index 0000000..b60fb06
--- /dev/null
+++ b/conv_t2g_2_g2t.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert core gene txt file'
+            ' substituting gene IDs with genomes IDs\n')
+
+    p.add_argument( 't2g', nargs='?', default=None, type=str,
+            help=   "")
+    p.add_argument('g2t', nargs='?', default=None, type=str,
+            help=   "")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+
+    g2t = {}
+
+    with utils.openr( args['t2g'] ) as inp:
+        for ll in (l.strip().split('\t') for l in inp):
+            #to = int(ll[0])
+            to = ll[0]
+            for g in ll[1:]:
+                #g2t[int(g)] = to 
+                g2t[g] = to 
+    
+    with utils.openw(args['g2t']) as out:
+        for g,t in g2t.iteritems():
+            out.write( "\t".join([str(g),str(t)]) +"\n" )
diff --git a/conv_tid2tax.py b/conv_tid2tax.py
new file mode 100644
index 0000000..d8595e3
--- /dev/null
+++ b/conv_tid2tax.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+import pandas
+import StringIO
+import tempfile
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Generate a taxonomic txt hierarchy'
+            ' with genome assignments from IMG taxa table\n')
+
+    
+    p.add_argument( 'img', nargs='?', default=None, type=str,
+            help=   "the input uc file [stdin if not present]")
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output txt file compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+    p.add_argument('--NCBI_names', metavar="NCBI names.dmp", default = None )
+    p.add_argument('--corrections', metavar="Correction file", default = None )
+    p.add_argument('-d', metavar="Domain",
+            default='Mic', choices=['Mic','Vir','Euk'] )
+
+    return vars( p.parse_args() )
+
+qm = "?"
+def get_qm( s, ncbiid, t, tl, existing_species ):
+    name = t['Genome Name / Sample Name'].replace("Candidatus ","").split(" ")[:2]
+    genus, species = name if len(name) > 1 else ("","")
+    if s is None or s == "-1" or not s or type(s) != str or  s in ['Unclassified','unclassified'] or s.split("_")[0] in ['bacterium','sp','sp.','Sp','Sp.','spp','spp.','Spp','Spp.']:
+        if tl == 's' and species in existing_species[t['Genus']] and genus == t['Genus']:
+            s = species
+            #print str(t.name)+"\t+\t"+str(genus)+" "+str(species)
+        else: # species not in ['sp','sp.','Sp','Sp.','spp','spp.','Spp','Spp.']:
+            #print "--",species
+            if tl == 's' and genus == t['Genus']  and genus + " " + species == str(ncbiid[t['NCBI Taxon ID']]) and species not in ['bacterium','sp','sp.','Sp','Sp.','spp','spp.','Spp','Spp.']:
+                s = species
+                #print str(t.name)+"\t**\t"+str(genus)+" "+str(species)
+            else:
+                #print ">", genus, t['Genus'],  t['NCBI Taxon ID'], genus + " " + species, str(ncbiid[t['NCBI Taxon ID']])
+                return qm
+    return s.replace("Candidatus ","").replace(" ","_").replace(".","_").replace(",","_")
+
+def add_unnamed( arr ):
+    gunamed = "?" in arr[-2]
+    if "?" in arr[0]:
+        return a
+    lt,last = 'd',arr[0]
+    v = [arr[0]]
+    for a in arr[1:-2]:
+        if "?" in a and not gunamed:
+            v.append( a[:3] + last+"_noname" )
+        else:
+            v.append( a )
+        lt,last = a[0],(a[3:] if "?" not in a else last)
+    v += arr[-2:] 
+        
+    return v 
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+
+    tax_lev = "dpcofgs"
+    tax_lev_exp = ['Domain','Phylum','Class','Order','Family','Genus','Species']
+
+    fp = tempfile.TemporaryFile()
+
+    if args['corrections']:
+        with utils.openr(args['corrections']) as inp:
+            frto = {}
+            frtoid = {}
+            for pat in (l.split('\t') for l in inp):
+                if len(pat) == 2:
+                    frto[pat[0].strip()] = pat[1].strip()
+                else:
+                    frtoid[pat[0].strip()] = (pat[1].strip(),pat[2].strip())
+            with utils.openr(args['img'],"rU") as inpf:
+                nfa = []
+                for l in inpf:
+                    nf = l
+                    for f,t in frto.items():
+                        nf = nf.replace(f,t)
+                    for i,(f,t) in frtoid.items():
+                        if l.startswith(i+"\t"):
+                            nf = nf.replace(f,t)
+                    nfa.append(nf) 
+                fp.write( "".join( nfa ) )
+                fp.seek(0)
+
+    with (utils.openr(args['img'],"rU") if not args['corrections'] else fp) as inp:
+        table = pandas.read_table( inp, sep='\t', index_col=0)
+ 
+    if args['d'] == 'Mic': 
+        table = table[table['Domain'].isin(['Bacteria','Archaea'])]
+        table = table[table['Gene Count'] > 250.0]
+        table = table[table['Genome Size'] > 50000.0]
+    elif args['d'] == 'Vir':
+        table = table[table['Domain'].isin(['Viruses'])]
+        table = table[table['Gene Count'] > 0.0]
+    elif args['d'] == 'Euk':
+        table = table[table['Domain'].isin(['Eukaryota'])]
+        able = table[table['Gene Count'] > 1000.0]
+        table = table[table['Genome Size'] > 500000.0]
+
+    toexcl = ['Candidatus','sp','sp.','Sp','Sp.','spp','spp.','Spp','Spp.','Unclassified','unclassified',"-1"]
+    #table['Species'] = [("-1" if t in toexcl else t) for t in table['Species']]
+
+    infos = ['Genome Name / Sample Name','NCBI Taxon ID']
+    table = table.reindex(columns=infos+tax_lev_exp+['Genome Name'])
+    table['Genus'] = [t.replace("Candidatus ","") for t in table['Genus']]
+
+    existing_species_l = [(b,a) for a,b in list(set(zip(list(table['Species']),list(table['Genus'])))) if a not in toexcl]
+    existing_species = collections.defaultdict( set )
+    for a,b in existing_species_l:
+        existing_species[a].add(" ".join(b.replace("Candidatus ","").split(" ")[:2]))
+
+    ncbi_species = set()
+    ncbiid = dict([(int(a),None) for a in table['NCBI Taxon ID']])
+
+    if args['NCBI_names']:
+        for line in (l.strip().split("|") for l in open(args['NCBI_names'])):
+            if int(line[0]) in ncbiid and "scientific name" in line[3]:
+                ncbiid[int(line[0])] = " ".join(line[1].replace("Candidatus ","").strip().split(" ")[:2])
+
+    with utils.openw(args['txt']) as out:
+        for i,t in table.iterrows():
+            out.write( "\t".join( [ str(-int(i)),
+                                  ".".join( add_unnamed(["__".join([taxl, get_qm(t[taxle],ncbiid,t,taxl,existing_species)]) 
+                                      for taxl,taxle in  zip(tax_lev,tax_lev_exp)]) + ["t__"+str(-int(i))]
+                                        ),
+                                  #str(t['Genome Name'])
+                                  ] ) + "\n")
+    
+    
diff --git a/conv_txt2ls.py b/conv_txt2ls.py
new file mode 100644
index 0000000..cdee6be
--- /dev/null
+++ b/conv_txt2ls.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert txt files to libsvm\n')
+
+    p.add_argument( 'txt', nargs='?', default=None, type=str,
+            help=   "the input txt file [stdin if not present]")
+    p.add_argument('ls', nargs='?', default=None, type=str,
+            help=   "the output ilibsvm file compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+
+    with utils.openr(args['txt']) as inp:
+        data = zip(*[l.strip().split('\t') for l in inp])
+        outd = [[d[0]]+[str(i+1)+":"+dd for i,dd in enumerate(d[1:])] for d in data[1:]]
+        with utils.openw(args['ls']) as out:
+            for o in outd:
+                out.write( "\t".join(o) +"\n" )
diff --git a/conv_uc2txt.py b/conv_uc2txt.py
new file mode 100644
index 0000000..bd00e84
--- /dev/null
+++ b/conv_uc2txt.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Convert Usearch ".uc" files in tab-delimited'
+            ' files with the seed as first field followed by the other IDs\n')
+
+    p.add_argument( 'uc', nargs='?', default=None, type=str,
+            help=   "the input uc file [stdin if not present]")
+    p.add_argument('txt', nargs='?', default=None, type=str,
+            help=   "the output txt file compressed if fiven with bz2 extension\n"
+                    "[stdout if not present]")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+
+    #with (open(args['uc']) if args['uc'] else sys.stdin) as inp:
+    with utils.openr(args['uc']) as inp:
+        for type,cln,seql,pid,strand,ing1,ign2,aln,query,target in (l.split('\t') for l in inp):
+            if type == 'H':
+                uc2cl[target.strip()].add( query )
+            elif type == 'S' and  query not in uc2cl:
+                uc2cl[query] = set()
+
+    #openw = bz2.BZ2File if args['txt'].endswith(".bz2") else open
+    with utils.openw(args['txt']) as out:
+        for k,v in sorted(uc2cl.items(),key=lambda x:-len(x[1])):
+            out.write( "\t".join([k]+list(v)) +"\n" )
diff --git a/excel.py b/excel.py
new file mode 100644
index 0000000..61ce37f
--- /dev/null
+++ b/excel.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+#Author: Duy Tin Truong (duytin.truong at unitn.it)
+#        at CIBIO, University of Trento, Italy
+
+
+from openpyxl import Workbook
+from openpyxl import load_workbook
+import os
+
+def table2xlsx(table, ofn, mode = 'write'):
+	if mode == 'append' and os.path.exists(ofn):
+		workbook = load_workbook(ofn)
+		worksheet = workbook.create_sheet()
+	else:
+		workbook = Workbook()
+		worksheet = workbook.active
+	for r, row in enumerate(table): 
+		for c, col in enumerate(row):
+			worksheet.cell(row = r+1, column = c+1).value = col
+	workbook.save(ofn)
+
+
diff --git a/fastq2fasta.py b/fastq2fasta.py
new file mode 100644
index 0000000..1a3c3fe
--- /dev/null
+++ b/fastq2fasta.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import argparse as ap
+import sys
+
+def read_params():
+    p = ap.ArgumentParser(description = 'fastq2fasta.py Parameters\n')
+    p.add_argument('--ifn', required = False, default = None, type = str)
+    p.add_argument('--ofn', required = False, default = None, type = str)
+    return vars(p.parse_args())
+
+    
+if __name__ == '__main__':
+    args = read_params()
+    if args['ifn'] == None:
+        ifile = sys.stdin
+    else:
+        ifile = open(args['ifn'], 'r')
+    if args['ofn'] == None:
+        ofile = sys.stdout
+    else:
+        ofile = open(args['ofn'], 'w')
+    
+    for r in SeqIO.parse(ifile, "fastq"):
+        SeqIO.write(r, ofile, "fasta")
diff --git a/fastq_trim.py b/fastq_trim.py
new file mode 100644
index 0000000..3df6f58
--- /dev/null
+++ b/fastq_trim.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import bz2 
+    import random
+    from Bio import SeqIO 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='')
+
+    p.add_argument('-q', metavar="Subsampling rate",
+            default=10, type=int )
+    """
+    p.add_argument('--subsample', metavar="Subsampling rate",
+            default=1.0, type=float )
+    p.add_argument('-n', metavar="Minimum number of matching taxa",
+            default=0, type=int )
+    p.add_argument('-p', metavar="Prefix for taxon names",
+            default="", type=str )
+    """
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+
+    with utils.openw( None ) as outf:
+        for seq in SeqIO.parse( utils.openr( None ),'fastq' ):
+            minp, maxp = None, None
+            for i,v in enumerate(seq.letter_annotations["phred_quality"]):
+                if minp is None and v >= args['q']:
+                    minp = i
+            for i,v in enumerate(seq.letter_annotations["phred_quality"][::-1]):
+                if maxp is None and v >= args['q']:
+                    maxp = len(seq.letter_annotations["phred_quality"]) - i
+            SeqIO.write(seq[minp:maxp], outf, 'fastq') 
diff --git a/fastx_len_filter.py b/fastx_len_filter.py
new file mode 100644
index 0000000..27656fa
--- /dev/null
+++ b/fastx_len_filter.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import argparse as ap
+import sys
+
+def read_params(args):
+    p = ap.ArgumentParser(description = 'fastax_len_filter.py Parameters\n')
+    p.add_argument('--min_len', required = True, default = None, type = int)
+    p.add_argument('--max_len', required = False, default = None, type = int)
+    p.add_argument('-t', required = False, default = 'fastq', type = str)
+    return vars(p.parse_args()) 
+
+if __name__ == '__main__':
+    args = read_params(sys.argv)
+    min_len = args['min_len']
+    max_len = args['max_len']
+    with sys.stdout as outf:
+        for r in SeqIO.parse(sys.stdin, args['t']):
+            lenr = len(r)
+            if ( max_len is not None ) and ( lenr > max_len ):
+                continue
+            if lenr >= min_len:
+                SeqIO.write(r, outf, args['t'])
diff --git a/fna_exact_matches.py b/fna_exact_matches.py
new file mode 100644
index 0000000..5573e21
--- /dev/null
+++ b/fna_exact_matches.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio.Seq import Seq
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+    arg( '-a', default=None, type=int,
+         help="number of char after the match to report")
+    arg( '-n', default=None, type=int,
+         help="number of matching primers")
+
+    parser.add_argument('-s', metavar='Subsequene to look for', required = True, type = str )
+
+    return vars(parser.parse_args())
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    ss = par['s'].lower()
+    ssr = Seq(par['s']).reverse_complement().lower()
+    f = os.path.basename(par['inp_f']).split(".")[0]
+    with utils.openw( par['out_f'] ) as outf:
+        for r in SeqIO.parse( utils.openr(par['inp_f']), "fasta"):
+            rl = r.seq.lower()
+            if ss in rl or ssr in rl:
+                if par['a']:
+                    if ss in rl:
+                        i = str(rl).index(str(ss))
+                        subs = rl[i:i+len(ss)+par['a']] if i+len(ss)+par['a'] < len(rl) else rl[i:]
+                    else:
+                        i = str(rl).index(str(ssr))
+                        subs = rl[i:i+len(ssr)+par['a']] if i+len(ssr)+par['a'] < len(rl) else rl[i:]
+                    outf.write( f + "\t" + str(r.id) + "\t" + str(subs) + "\n" )
+                else:
+                    if par['n']:
+                        n = str(rl).count(str(ss)) + str(rl).count(str(ssr))
+                        outf.write( f + "\t" + str(r.id) + "\t" + str(n) + "\n" )
+                    else:
+                        outf.write( f + "\t" + str(r.id) + "\n" )
diff --git a/fna_extract_from_bo6.py b/fna_extract_from_bo6.py
new file mode 100644
index 0000000..1d22eaa
--- /dev/null
+++ b/fna_extract_from_bo6.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Split/Select/Randomize/Subsample a multi fasta file')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+
+    parser.add_argument('--extract_targets', action='store_true', help="Select fna entries\n")
+    parser.add_argument('-i', action='store_true', help="Add hit stats to fna entries\n")
+    
+    parser.add_argument('--bo6', metavar='Bo6 file', required=True, type = str )
+
+    return vars(parser.parse_args())
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    inp_mat = (l.rstrip('\n').split("\t") for l in (utils.openr(par['bo6'])))
+
+    if par['extract_targets']:
+        toextr = ((l[1], l[2], l[3], l[11], int(l[8]), int(l[9])) for l in inp_mat)
+    else:
+        toextr = ((l[0], l[2],  l[3], l[11], int(l[6]), int(l[7])) for l in inp_mat)
+   
+    inpfasta = SeqIO.to_dict(SeqIO.parse( utils.openr(par['inp_f']), "fasta"))
+
+    out_seqs = []
+    for n,pid,l,bit,fr,to in toextr:
+        n = inpfasta[n][min(fr,to):max(fr,to)]
+        if par['i']:
+            p = "_pid"+pid.strip()+"_l"+l.strip()+"_bs"+bit.strip()
+        else:
+            p = ""
+        n.id = n.id+"_"+str(fr)+"_"+str(to)+p
+        out_seqs.append( n )
+
+    SeqIO.write(out_seqs, utils.openw(par['out_f']), "fasta") 
+
+
+
diff --git a/fna_extract_from_primersearch.py b/fna_extract_from_primersearch.py
new file mode 100644
index 0000000..6e325d8
--- /dev/null
+++ b/fna_extract_from_primersearch.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+
+def read_params( args ):
+    parser = argparse.ArgumentParser(description='Amplify DNA regions using primersearch output\n')
+
+    arg = parser.add_argument 
+    arg( '--ps', required = True, default=None, type=str )
+    arg( 'fna', nargs='?', default=None, type=str,
+            help=   "the input fna file [stdin if not present]")
+    arg('out', nargs='?', default=None, type=str,
+            help=   "the output fna file\n"
+                    "[stdout if not present]")
+
+    return vars( parser.parse_args() )
+
+
+def parse_primersearch( fn ):
+    seqs = {}
+    cur,seq,fs,rs,al,rseq,fseq = None, None, None, None, None, None, None
+    with utils.openr( fn, "U" ) as inpf:
+        for l in inpf:
+            line = l.strip()
+            if line.startswith("Amplimer") and 'Amplimer length' not in line:
+                cur = line
+            elif line.startswith("Sequence"):
+                seq = line.split("Sequence:")[1].strip()
+            elif 'hits forward strand at ' in line:
+                fseq = line.split()[0]
+                fs = int(line.split("hits forward strand at ")[1].split("with")[0])
+            elif 'hits reverse strand at ' in line:
+                rseq = line.split()[0]
+                rs = int(line.split("hits reverse strand at ")[1].split("with")[0].strip()[1:-1])
+            elif 'Amplimer length' in line:
+                al = int(line.split("Amplimer length: ")[1].split("bp")[0])
+                seqs[cur] = { 'seq' : seq, 'fs' : fs, 'rs' : rs, 'al' : al, 'rseq' : rseq, 'fseq' : fseq }
+                cur,seq,fs,rs,al,rseq,fseq = None, None, None, None, None, None, None
+    return seqs
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    extr = parse_primersearch( args['ps']  )
+    
+    seqs2extr = {}
+    for k,v in extr.items():
+        if v['seq'] in seqs2extr:
+            seqs2extr[v['seq']][k] = v
+        else:
+            seqs2extr[v['seq']] = { k: v }
+
+    with utils.openw( args['out'] ) as outf:
+        for r in SeqIO.parse( utils.openr(args['fna']), "fasta"):
+            if r.id in seqs2extr:
+                for pn,ext in seqs2extr[r.id].items():
+                    sq = SeqRecord( r.id )
+                    sq.id = r.id + " " + pn
+                    sq.description = r.description + " " + pn
+                    sq.seq = r.seq[ ext['fs']+len(ext['fseq']):len(r.seq)-ext['rs']-len(ext['rseq'])]
+                    SeqIO.write(sq, outf, "fasta") 
+
+
+
+
+
diff --git a/fna_extract_with_primers.py b/fna_extract_with_primers.py
new file mode 100644
index 0000000..d026212
--- /dev/null
+++ b/fna_extract_with_primers.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio.Seq import Seq
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature
+import regex
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+    parser.add_argument('-l', metavar='', required = True, type = str )
+    parser.add_argument('-r', metavar='', required = True, type = str )
+    parser.add_argument('-e', metavar='', required = False, type = int, default = 0 )
+
+    return vars(parser.parse_args())
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    rn,ln = "", ""
+    if par['r'].count(":"):
+        rn,par['r'] = par['r'].split(":")
+        rn+=":"
+    if par['l'].count(":"):
+        ln,par['l'] = par['l'].split(":")
+        ln+=":"
+
+    ne = str(par['e'])
+    c_r = par['r'].lower()
+    c_r_rev = Seq(par['r']).reverse_complement().lower()
+    c_l = par['l'].lower()
+    c_l_rev = Seq(par['l']).reverse_complement().lower()
+    f = os.path.basename(par['inp_f']).split(".")[0]
+
+    with utils.openw( par['out_f'] ) as outf:
+        for seq in SeqIO.parse( utils.openr(par['inp_f']), "fasta"):
+            seql = str(seq.seq.lower())
+            r_rev,l_rev,r,l = c_r_rev,c_l_rev,c_r,c_l      
+           
+            rr = regex.findall( "("+r+"){e<="+ne+"}", seql )
+            lr = regex.findall( "("+l+"){e<="+ne+"}", seql )
+            r_revr = regex.findall( "("+str(r_rev)+"){e<="+ne+"}", seql )
+            l_revr = regex.findall( "("+str(l_rev)+"){e<="+ne+"}", seql )
+
+            if len(rr) > 1:
+                outf.write( str(rr) +" unspecific 1\n" )
+            if len(lr) > 1:
+                outf.write( str(lr) +" unspecific 2\n" )
+            if len(r_revr) > 1:
+                outf.write( str(r_revr) +" unspecific 3\n" )
+            if len(l_revr) > 1:
+                outf.write( str(l_revr) +" unspecific 4\n" )
+            if len(rr) == 1 and len(l_revr) == 1:
+                lr,r_revr = l_revr, rr
+            if len(lr) == 1 and len(r_revr) == 1:
+                i = str(seql).index(str(lr[0])) + len(l)
+                j = str(seql).index(str(r_revr[0])) - len(r_rev)
+                if i > j:
+                    i,j = j,i
+                outf.write( ln+str(l) + "\t" + rn+str(r_rev) + "\t" + str(len(seql[i:j])) + "\t" + str(len(seql[i:j])+len(l)+len(r_rev)) +"\t" + str(seql[i:j]) +   "\n" )
+
+
+    
+    """
+    with utils.openw( par['out_f'] ) as outf:
+        for seq in SeqIO.parse( utils.openr(par['inp_f']), "fasta"):
+            r_rev,l_rev,r,l = c_r_rev,c_l_rev,c_r,c_l      
+            while True:
+                seql = seq.seq.lower()
+                if r in seql and l_rev in seql:
+                    r_rev,l = r,l_rev
+                if r_rev in seql and l in seql:
+                    if seql.count( r_rev ) > 1 or seql.count( l ) > 1:
+                        outf.write( "ambiguous primers\n" )
+                        break
+                    i = str(seql).index(str(l)) + len(l)
+                    j = str(seql).index(str(r_rev)) - len(r_rev)
+                    outf.write( ln+str(l) + "\t" + rn+str(r_rev) + "\t" + str(len(seql[i:j])) + "\t" + str(len(seql[i:j])+len(l)+len(r_rev)) +"\t" + str(seql[i:j]) +   "\n" )
+                    break
+                r_rev,l_rev = r_rev[:-1],l_rev[1:]
+                l,r = l[1:],l[:1]
+    """
diff --git a/fna_g2r.py b/fna_g2r.py
new file mode 100644
index 0000000..39e1d55
--- /dev/null
+++ b/fna_g2r.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Split/Select/Randomize/Subsample a multi fasta file')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+
+    parser.add_argument('-l', metavar='Length of the produced reads', default=100, type = int )
+    parser.add_argument('-s', metavar='Step in sampling reads', default=20, type = int )
+
+    return vars(parser.parse_args())
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    step = par['s']
+    rlen = par['l']
+
+    with utils.openw( par['out_f'] ) as outf:
+        for r in SeqIO.parse( utils.openr(par['inp_f']), "fasta"):
+            gene = r.seq
+            oid = r.id
+            reads = []
+            for i in xrange(0, len(gene), step ):
+                if len(gene) - i < rlen - step:
+                    break
+                nid = "_".join([oid,str(i)])
+                reads.append( SeqRecord( gene[i:i+rlen], id = nid, description = "") )
+
+            SeqIO.write(reads, outf, "fasta")
diff --git a/fna_len.py b/fna_len.py
new file mode 100644
index 0000000..091fc82
--- /dev/null
+++ b/fna_len.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+import numpy as np
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Display the length of each fasta entry')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output txt file [stdout if not present]")
+    arg( '-q', action='store_true', 
+         help="set this for fastq")
+    arg( '-t','--total', action='store_true', help="Print only the sum of the length of all sequences\n")
+    arg( '-s','--stat', action='store_true', help="Print only the statistics about the length of sequences\n")
+    return vars(parser.parse_args())
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    ltot = 0
+    lvec= []
+    with utils.openw( par['out_f'] ) as outf:
+        for r in SeqIO.parse( utils.openr(par['inp_f']), "fastq" if par['q'] else "fasta"):
+            l = len(r.seq)
+            if par['total']:
+                ltot += l
+            elif par['stat']:
+                lvec.append(l);
+            else:
+                outf.write( "\t".join( [r.id,str(l)] ) + "\n" ) 
+        if par['total']:
+            outf.write( str(ltot) + "\n" )
+        if par['stat']:
+            outf.write("number_of_reads\t" + str(len(lvec))  + "\n")
+            outf.write("minimum_read_length\t" + str(np.min(lvec))  + "\n")
+            outf.write("median_read_length\t" + str(np.median(lvec))  + "\n")
+            outf.write("mean_read_length\t" + str(np.mean(lvec)) + "\n")
\ No newline at end of file
diff --git a/fna_random.py b/fna_random.py
new file mode 100644
index 0000000..640e500
--- /dev/null
+++ b/fna_random.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+
+bin2nuc = { "00":"A", "01":"T", "10":"C", "11":"T"}
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Display the length of each fasta entry')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+    arg( '-t','--total', action='store_true', help="Print only the sum of the length of all sequences\n") 
+    return vars(parser.parse_args())
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    ltot = 0
+    with utils.openw( par['out_f'] ) as outf:
+        dna = ""
+        for r in utils.openr(par['inp_f']):
+            for l in r.strip():
+                sn = "{0:b}".format(ord(l))
+                if len(sn) % 2:
+                    sn += "0"
+                for i,s in enumerate( sn ):
+                    if not i % 2:
+                        dna += bin2nuc[sn[i:i+2]]
+        outf.write( dna + "\n" ) 
diff --git a/fna_revcomp.py b/fna_revcomp.py
new file mode 100644
index 0000000..5a8ddf0
--- /dev/null
+++ b/fna_revcomp.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Split/Select/Randomize/Subsample a multi fasta file')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+
+    parser.add_argument('-r','--reverse', action='store_true', help="Invert selection\n")
+    parser.add_argument('-c','--complement', action='store_true', help="Invert selection\n")
+    parser.add_argument('-p', action='store_true', help="Only for even samples\n")
+
+    return vars(parser.parse_args())
+
+def revcomp( par ):
+    p = par['p']
+    with utils.openw( par['out_f'] ) as outf:
+        if par['complement'] and par['reverse']:
+            res = ((SeqRecord(r.seq.reverse_complement(),id=r.id,description="RC") if not p or i % 2 == 1 else r) for i,r in enumerate(SeqIO.parse( utils.openr(par['inp_f']), "fasta")))
+        elif par['reverse']:
+            res = ((SeqRecord(r.seq[::-1],id=r.id,description="R") if not p or i % 2 == 1 else r) for i,r in enumerate(SeqIO.parse( utils.openr(par['inp_f']), "fasta")))
+        elif par['complement']:
+            res = ((SeqRecord(r.seq.complement(),id=r.id,description="C") if not p or i % 2 == 1 else r) for i,r in enumerate(SeqIO.parse( utils.openr(par['inp_f']), "fasta")))
+        else:
+            res = []
+
+        SeqIO.write(res, outf, "fasta")
+
+if __name__ == '__main__':
+    params = read_params(sys.argv)
+    revcomp( params )
+
diff --git a/fna_sss.py b/fna_sss.py
new file mode 100644
index 0000000..e6400d8
--- /dev/null
+++ b/fna_sss.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Split/Select/Randomize/Subsample a multi fasta file')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output fna file [stdout if not present]")
+
+    parser.add_argument('--select', action='store_true', help="Select fna entries\n")
+    
+    parser.add_argument('--subsample', metavar='Random subsample probability', 
+        default=None, type = float )
+    parser.add_argument('--randomize', help='Randomize output order. Not very memory efficient right now!', action='store_true' )
+    parser.add_argument('--split', metavar='nsplits', default=1, type = int )
+    
+    parser.add_argument('--min_len', metavar='Minimum length of the genes to keep', default=None, type = int )
+    parser.add_argument('--max_len', metavar='Maximum length of the genes to keep', default=None, type = int )
+    parser.add_argument('--reverse', action='store_true', help="Invert selection\n")
+    parser.add_argument('-q', action='store_true', help="Fastq format\n")
+
+    parser.add_argument('--ids', metavar='s', default="", type=str, 
+        help="the list of entries to select (separated by ::: if given as string otherwise as \\n if a file is given)")
+
+    return vars(parser.parse_args())
+
+class read:
+    def __init__( self, n = None, seq = None ):
+        self.n = n
+        self.seq = "" if seq is None else seq
+
+    def __str__( self ):
+        return "\n".join( [">"+self.n] + textwrap.wrap(self.seq, 120) ) +"\n" 
+
+    def __repr__( self ):
+        return str(self)
+
+class reader(object):
+    def __init__( self, fn, min_len = None, max_len = None ):
+        self.ret = False
+        self.min_len, self.max_len = min_len, max_len
+        #openr = bz2.BZ2File if bool(fn) and fn.endswith(".bz2") else open
+        self.inp = utils.openr(fn) if bool(fn) else sys.stdin
+        self.cr = read( )
+
+    def __iter__(self):
+        return self
+
+    def next( self ):
+        while True:
+            try:
+                l = self.inp.next().strip()
+                if l and l[0] == '>':
+                    ret = None
+                    if self.cr.n and len(self.cr.seq) > 0:
+                        ret = read(self.cr.n,self.cr.seq)
+                    self.cr.n = l[1:]# hash(l[1:])
+                    self.cr.seq = ""
+                    if ret:
+                        if self.min_len and len(ret.seq) < self.min_len:
+                            pass
+                        elif self.max_len and len(ret.seq) > self.max_len:
+                            pass
+                        else:
+                            return ret
+                else:
+                    self.cr.seq += l
+            except StopIteration:
+                if self.cr.n and len(self.cr.seq) > 0:
+                    ret = read(self.cr.n,self.cr.seq)
+                    self.cr.n = None
+                    return ret
+                raise StopIteration 
+
+def sss( par ):
+    subsample = bool(par['subsample']) 
+    select = bool(par['select'])
+    randomize = bool(par['randomize'])
+    if bool(par['out_f']):
+        n = par['split']
+        #openw = bz2.BZ2File if par['out_f'].endswith(".bz2") else open
+        if n == 1:
+            out_stream = [utils.openw( par['out_f'])]
+        else:
+            out_stream = [utils.openw( par['out_f']+str(r).zfill(len(str(n)))+".fna"+(".bz2" if par['out_f'].endswith(".bz2") else "")) for r in range(n)]
+    else:
+        out_stream = [sys.stdout] # larger buffer?
+
+    if select:
+        if os.path.exists(par['ids']):
+            #openr = bz2.BZ2File if par['ids'].endswith(".bz2") else open 
+            es = [s.strip().split('\t')[0] for s in utils.openr(par['ids'])]
+        else:
+            es = [(s.split("$")[1] if s.count("$") else s) for s in  par['ids'].split(":::")]
+        es = set(es)
+
+    all_reads = []
+    nstreams = len( out_stream )
+
+    p = par['subsample']
+    #reads = reader( par['inp_f'], par['min_len'], par['max_len'] )
+    cind = 0
+    lmin,lmax = par['min_len'], par['max_len'] 
+    for r in SeqIO.parse( utils.openr(par['inp_f']), "fasta" if not par['q'] else "fastq"):
+        if lmin and len(r.seq) < lmin:
+            continue
+        if lmax and len(r.seq) > lmax:
+            continue
+        if select:
+            if par['reverse']:
+                if r.id in es:
+                    continue
+            elif r.id not in es:
+                continue
+        if subsample and rnd.random() > p:
+            continue
+        if randomize:
+            all_reads.append( r )
+            continue
+        SeqIO.write(r, out_stream[cind], "fasta" if not par['q'] else "fastq")
+        cind = (cind + 1) % nstreams
+    
+    """
+    for r in reads:
+        if select and r.n not in es:
+            continue
+        if subsample and rnd.random() > p:
+            continue
+        if randomize:
+            all_reads.append( r )
+            continue
+        out_stream[cind].write(  str(r)  )
+        cind = (cind + 1) % nstreams
+    """
+
+    if randomize:
+        rnd.shuffle(all_reads)
+        step = len(all_reads) / nstreams 
+        for i,r in enumerate(all_reads):
+            #out_stream[cind].write( str(r) )
+            SeqIO.write(r, out_stream[cind], "fasta" if not par['q'] else "fastq" )
+            if not i % step:
+                cind = (cind + 1) % nstreams
+
+    for o in out_stream:
+        o.close()
+
+if __name__ == '__main__':
+    params = read_params(sys.argv)
+    sss( params )
+
diff --git a/fna_t2g.py b/fna_t2g.py
new file mode 100644
index 0000000..2ce34ec
--- /dev/null
+++ b/fna_t2g.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python 
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='List the genes in the genome file')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', default=None, type=str,
+         help="the input fna file")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output txt file [stdout if not present]")
+
+    return vars(parser.parse_args())
+
+def genome_id( fn ):
+    #return str(-int(os.path.basename(fn).split(".")[0]))
+    return os.path.basename(fn).split(".")[0]
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+  
+    
+    ids = [r.id for r in SeqIO.parse( utils.openr(par['inp_f']), "fasta")]
+
+    with utils.openw( par['out_f']) as out:
+        out.write( "\t".join( [genome_id(par['inp_f'])]+ids ) + "\n" )
+
diff --git a/gen_info.py b/gen_info.py
new file mode 100644
index 0000000..75b4c0c
--- /dev/null
+++ b/gen_info.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python 
+
+from __future__ import with_statement
+
+import sys
+import argparse
+import os
+import textwrap
+from collections import namedtuple as nt
+import random as rnd
+rnd.seed(1982)
+import utils
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+import numpy as np
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Display the length of each fasta entry')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
+         help="the input fna file [stdin if not present]")
+    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
+         help="the output txt file [stdout if not present]")
+    arg( '-q', action='store_true', 
+         help="set this for fastq")
+    arg( '-n', default=None, type=str ) 
+    return vars(parser.parse_args())
+
+
+def N50( lens ):
+    ll = sorted(lens,reverse=True)
+    val =sum(lens)*0.5
+    suml, i = 0, 0
+    n50 = 0
+    while suml < val:
+        suml += ll[i]
+        n50 = ll[i]
+        i += 1
+    return n50
+
+if __name__ == '__main__':
+    par = read_params(sys.argv)
+
+    ltot = 0
+    lvec= []
+
+
+    with utils.openw( par['out_f'] ) as outf:
+        lens = []
+        cn, gn, tn, an = 0, 0, 0, 0
+        for r in SeqIO.parse( utils.openr(par['inp_f']), "fastq" if par['q'] else "fasta"):
+            l = len(r.seq)
+            lens.append(l)
+            an += r.seq.count('a') + r.seq.count('A')
+            tn += r.seq.count('t') + r.seq.count('T')
+            gn += r.seq.count('g') + r.seq.count('G')
+            cn += r.seq.count('c') + r.seq.count('C')
+            
+        
+        info = ["total_length","n_contigs","N50","median_length","mean_length","CG_perc"]
+        if par['n']:
+            info = ['name'] + info
+
+        outf.write( "#"+"\t".join(info) +"\n")
+
+        res = {}
+        res['name'] = par['n']
+        res['total_length'] = sum( lens )
+        res['n_contigs'] = len( lens )
+        res['N50'] = N50( lens )
+        res['median_length'] = np.median( lens )
+        res['mean_length'] = round(np.mean( lens ), 0)
+        res['CG_perc'] = round( 100.0 * float(cn+gn) / float(an+tn+gn+cn), 2)
+
+        outf.write( "\t".join([str(res[v]) for v in info]) +"\n")
+
diff --git a/get_genome_length.py b/get_genome_length.py
new file mode 100644
index 0000000..14ad8d0
--- /dev/null
+++ b/get_genome_length.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+import sys
+import Bio
+from Bio import SeqIO	
+import argparse as ap
+
+
+def get_genome_length(ifn):
+	length = 0
+	for contig in SeqIO.parse(ifn, 'fasta'):
+		length += len(contig.seq)
+	return length
+
+def read_params(args):
+	p = ap.ArgumentParser(description = 'get_genome_length.py Parameters\n')
+	p.add_argument('--ifn', required = True, default = None, type = str)
+	return vars(p.parse_args())
+	
+if __name__ == '__main__':
+	args = read_params(sys.argv)
+	print get_genome_length(args['ifn'])
diff --git a/libsvm_cv.py b/libsvm_cv.py
new file mode 100644
index 0000000..a97c48d
--- /dev/null
+++ b/libsvm_cv.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+
+from svm import *
+from svmutil import *
+import croc
+import argparse
+import utils
+
+import random
+random.seed(1982)
+
+
+def ROC( ty, pv ):
+    if len(ty) != len(pv):
+        raise ValueError("len(ty) must equal to len(pv)")
+    SD = croc.ScoredData()
+    for t,p in zip(ty,pv):
+        #print t,p
+        SD.add( p,t )
+    it = SD.sweep_threshold()
+    #print it
+    curve = croc.ROC( it )
+    return (curve.area(),curve)
+
+
+def get_best_param( res_cv ):
+    cmax, gmax, rmax = None, None, -1.0
+    for c,gdict in res_cv.items():
+        for g,r in gdict.items():
+            if rmax < r:
+                rmax = r
+                cmax = c
+                gmax = g
+    return cmax,gmax,rmax
+
+def ms_cv( prob, param, nfolds = 10, metric = 'auc', c_list = None, g_list = None ):
+
+    prob = zip(*prob)
+    folds = get_folds( prob, nfolds )
+    labels,vals,targets = [],[],[]
+    c_orig, g_orig = param.C, param.gamma
+
+    for nf,fold in enumerate(folds):
+        training_folds = []
+        for j,f in enumerate(folds):
+            if nf != j:
+                training_folds += f
+    
+        y, x = zip(*training_folds)
+   
+        c_list = c_list if c_list else [c_orig]
+        g_list = g_list if g_list else [g_orig]
+        
+        cv_res = dict([(c,{}) for c in c_list])
+        for c in c_list:
+            for g in g_list:
+                param.C = c
+                param.gamma = g
+                cv_res[c][g] = cv( (y, x), param, nfolds = nfolds, metric = metric )[0] 
+
+        param.C, param.gamma, rmax = get_best_param(cv_res) 
+        #sys.stderr.write( str(param.C) + str(param.gamma) + str(rmax) )
+
+        m = svm_train( svm_problem( y, x ), param )
+        yp, xp = zip(*fold)
+        labs,acc,vs = svm_predict( yp, xp, m, options="-b 1" if param.probability else "" ) 
+        
+        labels += labs
+        vals += vs
+        targets += yp
+    
+    if metric == 'acc':
+        ACC, MSE, SCC = evaluations(labels, targets)
+        return ACC, None
+    if metric == 'auc':
+        auc,curve = ROC( targets, [-v[0] for v in vals]) 
+        return auc,curve
+
+
+def get_folds( prob, nfolds ):
+    
+    folds = [[] for i in range(nfolds)]
+    random.shuffle( list(prob) )
+    prob = sorted( prob, key = lambda x:x[0])
+    
+    for i,v in enumerate( prob ):
+        folds[i%nfolds].append(v)
+    
+    return folds
+
+
+def cv( prob, param, nfolds = 10, metric = 'acc' ):
+    assert len(prob[0]) > nfolds
+
+    prob = zip(*prob)
+
+    folds = get_folds( prob, nfolds )
+  
+    labels,vals,targets = [],[],[]
+
+    for nf,fold in enumerate(folds):
+        training_folds = []
+        for j,f in enumerate(folds):
+            if nf != j:
+                training_folds += f
+       
+        y, x = zip(*training_folds)
+        m = svm_train( svm_problem( y, x ), param )
+        yp, xp = zip(*fold)
+        labs,acc,vs = svm_predict( yp, xp, m, options="-b 1" if param.probability else "" ) 
+        labels += labs
+        vals += vs
+        targets += yp
+    
+    if metric == 'acc':
+        ACC, MSE, SCC = evaluations(labels, targets)
+        print "Acc",ACC
+        return ACC,None
+    if metric == 'auc':
+        auc,curve = ROC( targets, [-v[0] for v in vals]) 
+        print "AUC",auc
+        return auc,curve
+    
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='LibSVM extensions')
+    arg = parser.add_argument
+    arg( 'inp_f', metavar='INPUT_FILE', default=None, type=str )
+    arg( '--libsvm', default=[], nargs='+', type =str )
+    arg( '-n', default=10, type = int )
+    arg( '-m', default='acc', type = str )
+    arg( '-c', default=None, nargs='+', type =str )
+    arg( '-g', default=None, nargs='+', type =str )
+    arg( '--roc', default=None, type =str )
+    return vars(parser.parse_args())
+        
+args = read_params(sys.argv)
+
+p = svm_read_problem(args['inp_f'])
+print " ".join(args['libsvm']).replace("_","-")
+param = svm_parameter( " ".join(args['libsvm']).replace("_","-")  )
+
+
+c_list = [float(f) for f in args['c']] if args['c'] else [1.0]
+g_list = [float(f) for f in args['g']] if args['g'] else [1.0]
+
+v,g = ms_cv( p, param, nfolds = args['n'], metric = args['m'], c_list = c_list, g_list = g_list )
+
+if args['m'] == 'auc':
+    print "CV AUC\t"+str(v)
+elif args['m'] == 'acc':
+    print "CV acc\t"+str(v)
+
+if args['roc'] and args['m'] == 'auc':
+    with open( args['roc'], "w" ) as out:
+        for v in g:
+            out.write( str(v[0]) + "\t" + str(v[1]) + "\n" )
+
+
diff --git a/license.txt b/license.txt
new file mode 100644
index 0000000..f2eaf7c
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,8 @@
+Copyright (c) 2012, Nicola Segata and Curtis Huttenhower
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/merg_ctxt.py b/merg_ctxt.py
new file mode 100644
index 0000000..3785036
--- /dev/null
+++ b/merg_ctxt.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+
+try:
+    import argparse as ap
+    import bz2 
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description="Merge two core txt files"
+            "where the second are subclusters of the first" )
+
+    p.add_argument( 'ctxt', default=None, type=str,
+            help=   "the main ctxt")
+    p.add_argument('--out_ctxt', default=None, type=str,
+            help=   "the output txt file compressef if given with bz2 extension")
+    p.add_argument('sctxt', nargs='*', default=None, type=str,
+            help=   "the list of subclustere ctxt")
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    uc2cl = collections.defaultdict( set )
+
+    openr = bz2.BZ2File if args['ctxt'].endswith(".bz2") else open
+    mtxt = dict([([v[0],(v[1:] if len(v) > 1 else [])]) for v in (l.strip().split('\t') for l in openr(args['ctxt']))])
+    
+    for f in args['sctxt']:
+        openr = bz2.BZ2File if f.endswith(".bz2") else open
+        ctxt = dict([([v[0],(v[1:] if len(v) > 1 else [])]) for v in (l.strip().split('\t') for l in openr(f))])
+
+        for k,v in mtxt.items():
+            nv = v
+            for kk in [k]+v:
+                if kk in ctxt:
+                    nv += ctxt[kk]
+            mtxt[k] = nv
+
+    openw = bz2.BZ2File if args['out_ctxt'].endswith(".bz2") else open
+    with (openw(args['out_ctxt'],"w") if args['out_ctxt'] else sys.stdout) as out:
+        for k,v in sorted(mtxt.items(),key=lambda x:-len(x[1])):
+            out.write( "\t".join([k]+list(v)) +"\n" )
diff --git a/mlst_table2fna.py b/mlst_table2fna.py
new file mode 100644
index 0000000..986bf1d
--- /dev/null
+++ b/mlst_table2fna.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+import sys
+import collections
+import utils
+try:
+    import argparse as ap
+    import random
+    from Bio import SeqIO 
+    from Bio.SeqRecord import SeqRecord
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Create a fasta file with the'
+            'concatenated mlst sequence from a mlst table and the single sequences')
+
+    p.add_argument( '--fna', required=True, default=None, type=str,
+            help=   "the file with all the MLST profiles [in the format >profilineName_profileID")
+    p.add_argument( '--txt', required=True, default=None, type=str,
+            help=   "the table of the samples to profiles [tab-delimited, columns ID are profileName]")
+    p.add_argument( '--nmiss', default = 0, type = int )
+
+    return vars( p.parse_args() )
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+
+    fna = SeqIO.to_dict(SeqIO.parse( utils.openr(args['fna']), "fasta"))
+    fna_out = [] 
+
+    profiles = {}
+    mlst_names = []
+    with utils.openr(args['txt']) as inp:
+        for i,line in enumerate(inp):
+            if i == 0:
+                mlst_names = line.strip().split('\t')[1:]
+                continue
+            l = line.strip().split('\t')
+            profiles[l[0]] = dict([(na,l[n+1]) for n,na in enumerate(mlst_names)])
+    for s,p in profiles.items():
+        seq = ""
+        skip = 0 
+        for n in mlst_names:
+            name = p[n]
+            if name not in fna:
+                name = n+"_"+p[n]
+            if name in fna:
+                seq += fna[name].seq
+            else:
+                skip += 1
+                continue
+        if skip > args['nmiss']:
+            continue
+        sample = SeqRecord( "sample_"+s  )
+        sample.id = "t"+s if s[0] in "0123456789" else s
+        sample.description = ""
+        sample.seq = seq
+        fna_out.append( sample )
+    
+    SeqIO.write(fna_out, sys.stdout, "fasta")
diff --git a/msg.py b/msg.py
new file mode 100644
index 0000000..49a7820
--- /dev/null
+++ b/msg.py
@@ -0,0 +1,12 @@
+import sys
+
+def info( s ):
+    sys.stdout.write( s )
+    sys.stdout.flush()
+
+def exit( s ):
+    sys.stderr.write(s+"\n")
+    sys.stderr.write("Exiting ... \n")
+    sys.exit(1)
+
+
diff --git a/ooSubprocess.py b/ooSubprocess.py
new file mode 100644
index 0000000..8d8ad71
--- /dev/null
+++ b/ooSubprocess.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python
+# Author: Duy Tin Truong (duytin.truong at unitn.it)
+#        at CIBIO, University of Trento, Italy
+
+
+import subprocess
+import os
+import multiprocessing
+from multiprocessing.pool import ThreadPool
+import sys
+import cStringIO
+
+
+class ooSubprocess:
+
+    def __init__(self, tmp_dir='tmp/'):
+        self.chain_cmds = []
+        self.tmp_dir = tmp_dir
+        mkdir(tmp_dir)
+
+    def ex(
+            self,
+            prog,
+            args=[],
+            get_output=False,
+            get_out_pipe=False,
+            out_fn=None,
+            in_pipe=None,
+            error_pipe=None,
+            current_dir=None,
+            verbose=True):
+
+        if isinstance(args, str):
+            args = args.split()
+
+        if not isinstance(args, list):
+            args = [args]
+
+        cmd = [prog] + args
+        print_cmd = 'ooSubprocess: ' + ' '.join(cmd)
+        if verbose and out_fn and (not get_output):
+            print_stderr(print_cmd + ' > ' + out_fn)
+        elif verbose:
+            print_stderr(print_cmd)
+
+        if get_output:
+            result = subprocess.check_output(
+                                                cmd,
+                                                cwd=current_dir,
+                                                stdin=in_pipe,
+                                                stderr=error_pipe)
+        elif get_out_pipe:
+            result = subprocess.Popen(
+                                        cmd, 
+                                        stdin=in_pipe, 
+                                        stderr=error_pipe,
+                                        stdout=subprocess.PIPE, 
+                                        cwd=current_dir).stdout
+        elif out_fn:
+            ofile = open(out_fn, 'w') if out_fn else None
+            result = subprocess.check_call(
+                                            cmd, 
+                                            stdin=in_pipe, 
+                                            stderr=error_pipe,
+                                            stdout=ofile, 
+                                            cwd=current_dir)
+            ofile.close()
+        else:
+            result = subprocess.check_call(
+                                            cmd, 
+                                            stdin=in_pipe, 
+                                            stderr=error_pipe,
+                                            cwd=current_dir)
+        return result
+
+    def chain(
+            self,
+            prog,
+            args=[],
+            stop=False,
+            in_pipe=None,
+            error_pipe=None,
+            get_output=False,
+            get_out_pipe=False,
+            out_fn=None,
+            current_dir=None,
+            verbose=True):
+
+        if in_pipe is None and self.chain_cmds != []:
+            print_stderr(
+                'Error: the pipeline was not stopped before creating a new one!')
+            print_stderr('In cach: %s' % (' | '.join(self.chain_cmds)))
+            exit(1)
+        if out_fn and stop == False:
+            print_stderr(
+                'Error: out_fn (output_file_name) is only specified when stop = True!')
+            exit(1)
+
+        if isinstance(args, str):
+            args = args.split()
+
+        if not isinstance(args, list):
+            args = [args]
+        cmd = [prog] + args
+
+        print_cmd = ' '.join(cmd)
+        if out_fn and (not get_output):
+            print_cmd += ' > ' + out_fn
+        self.chain_cmds.append(print_cmd)
+
+        if stop:
+            if in_pipe is None:
+                print_stderr('Error: No input process to create a pipeline!')
+                exit(1)
+
+            if verbose:
+                print_stderr('ooSubprocess: ' + ' | '.join(self.chain_cmds))
+
+            self.chain_cmds = []
+            if get_output:
+                result = subprocess.check_output(
+                                                    cmd,
+                                                    stdin=in_pipe,
+                                                    stderr=error_pipe,
+                                                    cwd=current_dir)
+            elif get_out_pipe:
+                result = subprocess.Popen(
+                                            cmd,
+                                            stdin=in_pipe,
+                                            stderr=error_pipe,
+                                            stdout=subprocess.PIPE,
+                                            cwd=current_dir).stdout
+            elif out_fn:
+                ofile = open(out_fn, 'w')
+                result = subprocess.check_call(
+                                                cmd,
+                                                stdin=in_pipe,
+                                                stderr=error_pipe,
+                                                stdout=ofile,
+                                                cwd=current_dir)
+                ofile.close()
+            else:
+                result = subprocess.check_call(
+                                                cmd,
+                                                stdin=in_pipe,
+                                                stderr=error_pipe,
+                                                cwd=current_dir)
+        else:
+            result = subprocess.Popen(
+                                        cmd,
+                                        stdin=in_pipe,
+                                        stderr=error_pipe,
+                                        stdout=subprocess.PIPE,
+                                        cwd=current_dir).stdout
+        return result
+
+    def ftmp(self, ifn):
+        return os.path.join(self.tmp_dir, os.path.basename(ifn))
+
+
+def fdir(dir, ifn):
+    return os.path.join(dir, os.path.basename(ifn))
+
+
+def mkdir(dir):
+    if not os.path.exists(dir):
+        os.makedirs(dir)
+    elif not os.path.isdir(dir):
+        print_stderr('Error: %s is not a directory!' % dir)
+        exit(1)
+
+
+def replace_ext(ifn, old_ext, new_ext):
+    # if not os.path.isfile(ifn):
+    #    print_stderr('Error: file %s does not exist!'%(ifn))
+    #    exit(1)
+    if ifn[len(ifn) - len(old_ext):] != old_ext:
+        # print_stderr('Error: the old file extension %s does not match!'%old_ext)
+        # exit(1)
+        new_ifn = ifn + new_ext
+    else:
+        new_ifn = ifn[:len(ifn) - len(old_ext)] + new_ext
+    return new_ifn
+
+
+def splitext(ifn):
+    base, ext = os.path.splitext(ifn)
+    base1, ext1 = os.path.splitext(base)
+    if ext1 in ['.tar', '.fastq', '.fasta', '.sam']:
+        base = base1
+        ext = ext1 + ext
+    return base, ext
+
+
+def splitext2(ifn):
+    basename = os.path.basename(ifn)
+    base, ext = splitext(basename)
+    return base, ext
+
+
+def parallelize(func, args, nprocs=1, use_threads=False):
+    if nprocs > 1:
+        if use_threads:
+            pool = ThreadPool(nprocs)
+        else:
+            pool = multiprocessing.Pool(nprocs)
+        results = pool.map(func, args)
+        pool.close()
+        pool.join()
+    else:
+        results = serialize(func, args)
+    return results
+
+
+def serialize(func, args):
+    results = []
+    for arg in args:
+        results.append(func(arg))
+    return results
+
+
+def print_stderr(*args):
+    sys.stderr.write(' '.join(map(str, args)) + '\n')
+    sys.stderr.flush()
+
+
+def print_stdout(*args):
+    sys.stdout.write(' '.join(map(str, args)) + '\n')
+    sys.stdout.flush()
diff --git a/pyphlan.py b/pyphlan.py
new file mode 100644
index 0000000..6e37ff9
--- /dev/null
+++ b/pyphlan.py
@@ -0,0 +1,748 @@
+from Bio import Phylo
+from Bio.Phylo import PhyloXML
+from Bio.Phylo import PhyloXMLIO
+from collections import defaultdict as ddict
+from Bio.Phylo.PhyloXML import Property as Prop
+from Bio.Phylo.PhyloXML import Clade as PClade
+from Bio.Phylo.BaseTree import Tree as BTree
+from Bio.Phylo.BaseTree import Clade as BClade
+import string
+from numpy import pi as rpi
+rpi2 = 2.0*rpi
+import numpy as np 
+import array as arr
+import collections as colls
+import sys
+#core_test = lambda ok,tot,pr: 1.0-st.binom.sf(ok,tot,pr)
+
+#lev_sep = "."
+
+uncl = "?" 
+
+# Here are three functions that I'd love to see in Biopython but they
+# are not there (yet).
+def partial_branch_length(clade, selective_targets):
+    def _partial_branch_length_( clade, selective_targets ):
+        if clade.is_terminal() and clade.name in selective_targets:
+            return [clade.branch_length]
+        if not any([c.name in selective_targets for c in clade.get_terminals()]):
+            return [0.0]
+        ret = [0.0]
+        for c in clade.clades:
+            ret += [partial_branch_length( c, selective_targets)]
+        ret += [clade.branch_length]
+        return ret
+    return sum( _partial_branch_length_( clade,selective_targets  )  )
+
+def reroot(tree, new_root):
+    outgroup = new_root
+    outgroup_path = tree.get_path(outgroup)
+    if len(outgroup_path) == 0:
+        # Outgroup is the current root -- no change
+        return
+    prev_blen = outgroup.branch_length
+    if outgroup.is_terminal():
+        # Create a new root with a 0-length branch to the outgroup
+        outgroup.branch_length = 0.0
+        new_root = tree.root.__class__(
+                branch_length=tree.root.branch_length, clades=[outgroup])
+        # The first branch reversal (see the upcoming loop) is modified
+        if len(outgroup_path) == 1:
+            # Trivial tree like '(A,B);
+            new_parent = new_root
+        else:
+            parent = outgroup_path.pop(-2)
+            parent.clades.pop(parent.clades.index(outgroup))
+            prev_blen, parent.branch_length = parent.branch_length, prev_blen
+            new_root.clades.insert(0, parent)
+            new_parent = parent
+    else:
+        # Use the given outgroup node as the new (trifurcating) root
+        new_root = outgroup
+        new_root.branch_length = tree.root.branch_length
+        new_parent = new_root
+
+    # Tracing the outgroup lineage backwards, reattach the subclades under a
+    # new root clade. Reverse the branches directly above the outgroup in
+    # the tree, but keep the descendants of those clades as they are.
+    for parent in outgroup_path[-2::-1]:
+        parent.clades.pop(parent.clades.index(new_parent))
+        prev_blen, parent.branch_length = parent.branch_length, prev_blen
+        new_parent.clades.insert(0, parent)
+        new_parent = parent
+
+    # Finally, handle the original root according to number of descendents
+    old_root = tree.root
+    if outgroup in old_root.clades:
+        assert len(outgroup_path) == 1
+        old_root.clades.pop(old_root.clades.index(outgroup))
+    else:
+        old_root.clades.pop(old_root.clades.index(new_parent))
+    if len(old_root) == 1:
+        # Delete the old bifurcating root & add branch lengths
+        ingroup = old_root.clades[0]
+        if ingroup.branch_length:
+            ingroup.branch_length += prev_blen
+        else:
+            ingroup.branch_length = prev_blen
+        new_parent.clades.insert(0, ingroup)
+        # ENH: If annotations are attached to old_root, do... something.
+    else:
+        # Keep the old trifurcating/polytomous root as an internal node
+        old_root.branch_length = prev_blen
+        new_parent.clades.insert(0, old_root)
+
+    tree.root = new_root
+    tree.rooted = True
+    return
+
+def get_parent(tree, child_clade):
+    node_path = tree.get_path(child_clade)
+    return node_path[-2] if len(node_path) > 1 else None
+
+def reroot_mid_fat_edge( tree, node ):
+    if tree.root == node:
+        return
+    fat = get_parent( tree, node )
+    bl = node.branch_length
+    node.branch_length = bl*0.5
+    new_clade = PClade(branch_length=bl*0.5, clades = [node])
+    if fat:
+        fat.clades = [c for c in fat.clades if c != node] + [new_clade]
+        reroot( tree, new_clade )
+    else:
+        tree.root.clades = [new_clade] + [c for c in tree.root.clades if c != node]
+        reroot( tree, new_clade)
+
+def clades2terms( tree, startswith = None ):
+    c2t = {}
+    def clades2terms_rec( c ):
+        if startswith: 
+            if c.name and c.name.startswith( startswith ):
+                c2t[c] = c.get_terminals()
+        else:
+            c2t[c] = c.get_terminals()
+        for cc in c.clades:
+            clades2terms_rec(cc)
+    clades2terms_rec( tree.root )
+    return c2t
+
+def dist_matrix( tree ):
+    terminals = list(tree.get_terminals())
+    term_names = [t.name for t in terminals] 
+    # can be made faster with recursion
+    for n in tree.get_nonterminals():
+        n.ids = set( [nn.name for nn in n.get_terminals()]  )
+    
+    dists = dict([(n,dict([(nn,0.0) for nn in term_names])) for n in term_names])
+
+    def dist_matrix_rec( clade ):
+        bl = clade.branch_length
+        if clade.is_terminal():
+            for t in term_names:
+                if t!=clade.name:
+                    dists[clade.name][t] += bl
+                    dists[t][clade.name] += bl
+            return
+        for t1 in clade.ids:
+            for t2 in terminals:
+                if t2.name not in clade.ids:
+                    dists[t1][t2.name] += bl
+                    dists[t2.name][t1] += bl
+        for c in clade.clades:
+            dist_matrix_rec( c )
+
+    dist_matrix_rec( tree.root )
+    return dists
+
+
+
+class PpaTree:
+
+    def __load_tree_txt__( self, fn ):
+        tree = Phylo.BaseTree.Tree()
+        try:
+            rows = [l.decode('utf-8').rstrip().split("\t")[0] for l in 
+                        open(fn, 'rb')]
+        except IOError:
+            raise IOError()
+      
+        clades = [r.split(self.lev_sep) for r in rows]
+
+        tree = BTree()
+        tree.root = BClade()
+       
+        def add_clade_rec( father, txt_tree ):
+            fl = set([t[0] for t in txt_tree])
+            father.clades = []
+            for c in fl:
+                nclade = BClade( branch_length = 1.0,
+                                 name = c )
+                father.clades.append( nclade )
+                children = [t[1:] for t in txt_tree if len(t)>1 and t[0] == c]
+                if children:
+                    add_clade_rec( nclade, children )
+
+        add_clade_rec( tree.root, clades )
+        self.ignore_branch_len = 1
+        return tree.as_phyloxml()
+        
+
+    def __read_tree__( self, fn ):
+        for ff in ['phyloxml','newick','nexus',"txt"]:
+            try:
+                if ff in ['txt']:
+                    tree = self.__load_tree_txt__( fn )
+                else:
+                    tree = Phylo.read(fn, ff)
+                    if len(tree.root.get_terminals()) == 1:
+                        raise ValueError
+            except ValueError:
+                continue
+            except IOError:
+                sys.stderr.write("Error: No tree file found: "+fn+"\n")
+                raise IOError 
+            except Exception:
+                continue
+            else:
+                return tree.as_phyloxml()
+        sys.stderr.write("Error: unrecognized input format "+fn+"\n")
+        raise ValueError 
+
+
+    def __init__( self, filename, warnings = False, lev_sep = "." ):
+        self.lev_sep = lev_sep
+        self.warnings = warnings
+        if filename is None:
+            self.tree = None
+            return 
+        try:
+            self.tree = self.__read_tree__(filename) 
+            self.add_full_paths()
+        except:
+            sys.exit(0) 
+
+
+    def core_test( self, ok, tot, pr ):
+        # scipy included here for non-compatibility with scons
+        import scipy.stats as st
+        if pr in self.ctc and tot in self.ctc[pr] and ok in self.ctc[pr][tot]:
+            return self.ctc[pr][tot][ok]
+        ret = 1.0-st.binom.sf(ok,tot,pr)
+        if not pr in self.ctc: self.ctc[pr] = {}
+        if not tot in self.ctc[pr]: self.ctc[pr][tot] = {}
+        if not ok in self.ctc[pr][tot]: self.ctc[pr][tot][ok] = ret
+        return ret
+
+    def is_core( self, clade, targs, er = 0.95 ):
+        intersection = clade.imgids & targs
+
+        len_intersection = len(intersection)
+
+        if len(clade.imgids) >= 2 and len_intersection < 2:
+           return False, 0.0, None
+
+        add = 0
+        for subclade in clade.clades:
+            if uncl in subclade.name:
+                out = subclade.imgids - intersection # targs  
+                add += len(out)
+        if add and len_intersection >= add:
+            len_intersection += int(round(add/1.99))
+
+        core = self.core_test( len_intersection, clade.nterminals, er )
+        if core < 0.05 or len_intersection == 0:
+            return False, core, None
+        nsubclades, nsubclades_absent = 0, 0
+        for subclade in set(clade.get_nonterminals()) - set([clade]):
+            if uncl in subclade.full_name: # full??/
+                continue
+            if subclade.nterminals == 1:
+                nsubclades += 1 # !!!
+                if len(subclade.imgids & targs) == 0:
+                    nsubclades_absent += 1
+                continue
+            
+            sc_intersection = subclade.imgids & targs
+            sc_len_intersection = len(sc_intersection)
+           
+            sc_add = 0
+            for sc_subclade in subclade.clades:
+                if uncl in sc_subclade.name:
+                    sc_out = sc_subclade.imgids - sc_intersection
+                    sc_add += len(sc_out)
+            if add and sc_len_intersection >= sc_add:
+                sc_len_intersection += int(round(sc_add/1.99))
+
+            subcore = self.core_test( sc_len_intersection, subclade.nterminals, er )    
+            if subcore < 0.05:
+                return False, core, None
+        if nsubclades > 0 and nsubclades == nsubclades_absent:
+            return False, core, None
+        return True, core, intersection
+
+    def _find_core( self, terminals, er = 0.95, root_name = None, skip_qm = True ):
+        #terminals_s = set(terminals)
+        def _find_core_rec( clade ):
+            """
+            if root_name:
+                #clname = lev_sep.join( [root_name]+clade.full_name.split(lev_sep)[1:] )
+                #clname = lev_sep.join( clade.full_name[1:] )
+                clname = clade.full_name
+            else:
+                clname = clade.full_name
+            """
+            clname = clade.full_name
+            if clade.is_terminal():
+                if clade.imgid in terminals:
+                    #n = terminals[clade.imgid]
+                    return [(clname,1,1,
+                                #n,n,n,
+                                1.0)]
+                return []
+            if skip_qm and clade.name and uncl in clade.name:
+                return [] 
+            if len(clade.imgids) == 1:
+                cimg = list(clade.imgids)[0]
+                if cimg in terminals:
+                    #n = terminals[cimg]
+                    return [(clname,1,1,
+                                #n,n,n,
+                                1.0)]
+                return []
+
+            core,pv,intersection = self.is_core( clade, terminals, er = er )
+
+            if core:
+                #ns = [terminals[ii] for ii in terminals_s if ii in clade.imgids]
+                return [( clname,
+                          len(intersection),len(clade.imgids),
+                          #len(clade.imgids&terminals_s),len(clade.imgids),
+                          #min(ns),max(ns),np.mean(ns),
+                          pv)]
+            rets = []
+            for c in clade.clades:
+                rets += _find_core_rec(c)
+            return rets
+        return  _find_core_rec( self.tree.root )
+
+
+    def add_full_paths( self ):
+        
+        def _add_full_paths_( clade, path ):
+            lpath = path + ([clade.name] if clade.name else [])
+            clade.full_name = self.lev_sep.join( lpath )
+            for c in clade.clades:
+                _add_full_paths_( c, lpath )
+        _add_full_paths_( self.tree.root, [] )
+
+    def find_cores( self, cl_taxa_file, min_core_size = 1, error_rate = 0.95, subtree = None, skip_qm = True ):
+        if subtree:
+            self.subtree( 'name', subtree )
+        self.ctc = {}
+        imgids2terminals = {}
+        for t in self.tree.get_terminals():
+            #t.imgid = t.name # if "t__"in t.name else t.name
+            t.imgid = t.name[3:] if "t__"in t.name else t.name # C2
+            #t.imgid = int(t.name[3:] if "t__"in t.name else t.name) # C2
+            t.nterminals = 1
+            imgids2terminals[t.imgid] = t
+
+
+        # can be made faster with recursion
+        for n in self.tree.get_nonterminals():
+            n.imgids = set( [nn.imgid for nn in n.get_terminals()]  )
+            n.nterminals = len( n.imgids )
+
+        self.add_full_paths() # unnecessary 
+
+        ret = {}
+        for vec in (l.strip().split('\t') for l in open(cl_taxa_file)):
+            sid = vec[0]
+            #sid = int(vec[0]) # C2
+            #tgts_l = [int(s) for s in vec[1:]]
+            #tgts = dict([(s,tgts_l.count(s)) for s in set(tgts_l)])
+            tgts = set([s for s in vec[1:]])
+            #tgts = set([int(s) for s in vec[1:]]) C2
+
+            if len(tgts) >= min_core_size:
+                subtree_name = self.lev_sep.join(subtree.split(self.lev_sep)[:-1] ) if subtree else None
+                ret[sid] = self._find_core( tgts, er = error_rate, root_name = subtree, skip_qm = skip_qm )
+                #print sid #, ret[sid]
+        return ret
+   
+    def markerness( self, coreness, uniqueness, cn_min, cn_max, cn_avg ):
+        return coreness * uniqueness * (1.0 / float(cn_max-cn_min+1)) * 1.0 / cn_avg
+
+    def find_markers( self, cu_file, hitmap_file, core_file ):
+        self.ctc = {}
+        imgids2terminals = {}
+        ids2clades = {}
+        for t in self.tree.get_terminals():
+            t.imgid = t.name
+            # t.imgid = int(t.name) # C2
+            t.nterminals = 1
+            imgids2terminals[t.imgid] = t
+            ids2clades[t.name] = t
+
+        # can be made faster with recursion (but it is not a bottleneck)
+        for n in self.tree.get_nonterminals():
+            n.imgids = set( [nn.imgid for nn in n.get_terminals()]  )
+            n.nterminals = len( n.imgids )
+
+        self.add_full_paths() # unnecessary 
+        
+        #cus = dict([(int(l[0]),[int(ll) for ll in l[1:]]) for l in # C2
+        cus = dict([(l[0],[ll for ll in l[1:]]) for l in 
+                        (line.strip().split('\t') for line in open(cu_file))])
+        #cinfo = dict([(int(v[0]),[v[1]] + [int(vv) for vv in v[2:6]] + [float(vv) for vv in v[6:]]) # C2
+        cinfo = dict([(v[0],[v[1]] + [vv for vv in v[2:6]] + [float(vv) for vv in v[6:]])
+                        for v in (line.strip().split('\t') for line in open(core_file))])
+
+        ret = {}
+        for vec in (l.strip().split('\t') for l in open(hitmap_file)):
+            sid = vec[0]
+            #sid = int(vec[0]) # C2
+            tgts_l = set([s for s in vec[1:]])
+            # tgts_l = set([int(s) for s in vec[1:]]) # C2
+            lca = self.lca( cus[sid], ids2clades )
+            if lca.is_terminal():
+                tin = set([lca.imgid])
+                tout = tgts_l - tin 
+            else:
+                tout = tgts_l - lca.imgids
+                tin = lca.imgids & tgts_l
+            ci = cinfo[sid]
+            ltin = len(tin)
+            ltout = len(tout)
+            uniqueness = float(ltin)/float(ltin+ltout)
+            coreness = float( ci[-1] )
+            cn_min, cp_max, cn_avg = [float(f) for f in ci[-4:-1]]
+            gtax = ci[0]
+            cobs, ctot = ci[1], ci[2]
+            # cobs, ctot = int(ci[1]), int(ci[2]) # C2
+            markerness = self.markerness( coreness, uniqueness, cn_min, cp_max, cn_avg )
+            
+            res_lin = [ gtax, markerness, coreness, uniqueness, cobs, ctot, cn_min, cp_max, cn_avg,
+                        ltin, ltout, "|".join([str(s) for s in tin]), "|".join([str(s) for s in tout]) ]
+            ret[sid] = res_lin 
+        return ret
+       
+
+    def select_markers( self, marker_file, markerness_th = 0.0, max_markers = 200 ):
+        cl2markers = colls.defaultdict( list ) 
+        for line in (l.strip().split('\t') for l in open( marker_file )):
+            gid = line[1]
+            markerness = float(line[2])
+            if markerness < markerness_th:
+                continue
+            cl2markers[gid].append( line )
+        for k,v in cl2markers.items():
+            cl2markers[k] = sorted(v,key=lambda x:float(x[2]),reverse=True)[:max_markers]
+        return cl2markers.values()
+
+    def get_c2t( self ):
+        tc2t = {}
+
+        def _get_c2t_( clade ):
+            lterms = clade.get_terminals()
+            tc2t[clade] = set([l.name for l in lterms])
+            if clade.is_terminal():
+                return
+            for c in clade.clades:
+                _get_c2t_( c )
+        _get_c2t_( self.tree.root )
+        return tc2t 
+
+    def ltcs( self, terminals, tc2t = None, terminals2clades = None, lca_precomputed = None ):
+        set_terminals = set( terminals )
+        lca = lca_precomputed if lca_precomputed else self.lca( terminals, terminals2clades )
+        def _ltcs_rec_( clade, cur_max ):
+            if clade.is_terminal() and clade.name in set_terminals:
+                return clade,1
+            terms = tc2t[clade] if tc2t else set([cc.name for cc in clade.get_terminals()])
+            if len(terms) < cur_max:
+                return None,0
+            if terms <= set_terminals: 
+                return clade,len(terms)
+            rets = []
+            for c in clade.clades:
+                r,tmax = _ltcs_rec_( c, cur_max )
+                if tmax >= cur_max:
+                    cur_max = tmax
+                    if r:
+                        rets.append((r,tmax))
+            if rets:
+                return sorted(rets,key=lambda x:x[1])[-1][0],cur_max
+            else:
+                return None,None 
+        return _ltcs_rec_( lca, cur_max = 0 )[0]
+
+    def lca( self, terminals, terminals2clades = None ):
+        clade_targets = []
+        if terminals2clades:
+            clade_targets = [terminals2clades[str(t)] for t in terminals]
+        else:
+            clade_targets = [t for t in self.tree.get_terminals() if t.name in terminals]
+            """
+            for t in terminals:
+                ct = list(self.tree.find_clades( {"name": str(t)} ))
+                if len( ct ) > 1:
+                    sys.stderr.write( "Error: non-unique target specified." )
+                    sys.exit(-1)
+                clade_targets.append( ct[0] )
+            """
+        lca = self.tree.common_ancestor( clade_targets )
+        return lca
+
+    
+    def lcca( self, t, t2c ):
+        node_path = list(self.tree.get_path(t))
+        if not node_path or len(node_path) < 2:
+            return None,None,None
+        tlevs = t2c[t].split(self.lev_sep)[2:-1]
+        for p in node_path[-15:]:
+            terms = list(p.get_terminals())
+            descn = [t2c[l.name].split(self.lev_sep)[2:-1] for l in  terms if l.name!=t]
+            if not descn or len(descn) < 2:
+                continue
+
+            l = tlevs[-1]
+            descr_l = [d[-1] for d in descn]
+            if len(set(descr_l)) == 1 and descr_l[0] != l and \
+                l != "s__sp_" and not l.endswith("unclassified") and \
+                descr_l[0] != "s__sp_" and not descr_l[0].endswith("unclassified"):
+                return p,terms,self.lev_sep.join(tlevs)
+        return None,None,None
+
+
+    def tax_precision( self, c2t_f, strategy = 'lca' ):
+        c2t = self.read_tax_clades( c2t_f )
+        res = []
+        for c,terms in c2t.items():
+            lca = self.lca( terms )
+            num = partial_branch_length(lca,terms)
+            den = lca.total_branch_length()
+            prec = num / den
+            res.append([c,str(prec)])
+        return res
+
+    def tax_recall( self, c2t_f ):
+        c2t = self.read_tax_clades( c2t_f )
+        res = []
+        for c,terms in c2t.items():
+            lca = self.lca( terms )
+            ltcs = self.ltcs( terms )
+            lca_terms = set(lca.get_terminals())
+            ltcs_terms = set(ltcs.get_terminals())
+            out_terms = lca_terms - ltcs_terms
+            outs = [c]
+            if len(out_terms):
+                diam = sum(sorted(ltcs.depths().values())[-2:])
+                outs += [":".join([t.name,str( self.tree.distance(ltcs,t)/diam )]) 
+                             for t in out_terms]
+            res.append( outs )
+        return res
+
+    def tax_resolution( self, terminals ):
+        pass
+
+    
+    def prune( self, strategy = 'lca', n = None, fn = None, name = None, newname = None ):
+        prune = None
+        if strategy == 'root_name':
+            ct = list(self.tree.find_clades( {"name": name} ))
+            if len( ct ) > 1:
+                sys.stderr.write( "Error: non-unique target specified." )
+                sys.exit(-1)
+            prune = ct[0]
+        elif strategy == 'lca':
+            terms = self.read_targets( fn ) if isinstance(fn,str) else fn
+            prune = self.lca( terms )
+        elif strategy == 'ltcs':
+            terms = self.read_targets( fn ) if isinstance(fn,str) else fn
+            prune = self.ltcs( terms )
+        elif strategy == 'n_anc':
+            if n is None:
+                n = 1
+            ct = list(self.tree.find_clades( {"name": name} ))
+            if len( ct ) > 1:
+                sys.stderr.write( "Error: non-unique target specified.\n" )
+                sys.exit(-1)
+            node_path = list(self.tree.get_path(name))
+            if not node_path or len(node_path) < n:
+                sys.stderr.write( "Error: no anchestors or number of anchestors < n." )
+                sys.exit(-1)
+            toprune = node_path[-n]
+            fat = node_path[-n-1]
+            fat.clades = [cc for cc in fat.clades if cc != toprune]
+            prune = None 
+        else:
+            sys.stderr.write( strategy + " not supported yet." )
+            sys.exit(-1)
+        if prune:
+            prune.clades = []
+            if newname:
+                prune.name = newname
+
+    def subtree( self, strategy, n = None, fn = None ):
+        newroot = None
+        if strategy == 'name':
+            ct = list(self.tree.find_clades( {"name": fn} ))
+            if len( ct ) != 1:
+                int_clades = self.tree.get_nonterminals()
+                for cl in int_clades:
+                    if n == cl.full_name:
+                        ct = [cl]
+                        break
+                if not ct:
+                    sys.stderr.write( "Error: target not found." )
+                    sys.exit(-1)
+            newroot = ct[0]
+        elif strategy == 'lca':
+            terms = self.read_targets( fn ) if isinstance(fn,str) else fn
+            newroot = self.lca( terms )
+        elif strategy == 'ltcs':
+            terms = self.read_targets( fn ) if isinstance(fn,str) else fn
+            newroot = self.ltcs( terms )
+        if newroot:
+            self.tree.root = newroot
+
+    def rename( self, strategy, n = None, terms = None ):
+        newroot = None
+        if strategy == 'root_name':
+            ct = list(self.tree.find_clades( {"name": n} ))
+            if len( ct ) > 1:
+                sys.stderr.write( "Error: non-unique target specified.\n" )
+                sys.exit(-1)
+            newroot = ct[0]
+        elif strategy == 'lca':
+            newroot = self.lca( terms )
+        elif strategy == 'ltcs':
+            newroot = self.ltcs( terms )
+        if newroot:
+            newroot.name = n
+
+    def export( self, out_file ):
+        self.tree = self.tree.as_phyloxml()
+        Phylo.write( self.tree, out_file, "phyloxml")
+
+    def read_tax_clades( self, tf ):
+        with open( tf ) as inpf:
+            return dict([(ll[0],ll[1:]) for ll in [l.strip().split('\t') for l in inpf]])
+
+    def read_targets( self, tf ):
+        if tf.count(":"):
+            return tf.split(":")
+        with open( tf ) as inpf:
+            return [l.strip() for l in inpf]
+
+    def reroot( self, strategy = 'lca', tf = None, n = None ):
+        if strategy in [ 'lca', 'ltcs' ]:
+            targets = self.read_targets( tf ) 
+          
+            lca = self.lca( targets ) if strategy == 'lca' else self.ltcs( targets )
+            reroot_mid_fat_edge( self.tree, lca)
+
+            #lca_f = get_parent( self.tree, lca )
+            #
+            #bl = lca.branch_length
+            #new_clade = PClade(branch_length=bl*0.5, clades = [lca])
+            #lca.branch_length = bl*0.5
+            #if lca_f:
+            #    lca_f.clades = [c for c in lca_f.clades if c != lca] + [new_clade]
+            #    reroot( self.tree, new_clade )
+            #else:
+            #    self.tree.root = new_clade
+
+        elif strategy == 'midpoint':
+            pass
+            #self.tree.reroot_at_midpoint(update_splits=True)
+        elif strategy == 'longest_edge':
+            nodes = list(self.tree.get_nonterminals()) + list(self.tree.get_terminals())
+            longest = max( nodes, key=lambda x:x.branch_length ) 
+            reroot_mid_fat_edge( self.tree, longest )
+            #longest_edge = max( self.ntree.get_edge_set(),
+            #                    key=lambda x:x.length)
+            #self.tree.reroot_at_edge(longest_edge, update_splits=True)
+        elif strategy == 'longest_internal_edge':
+            nodes = list(self.tree.get_nonterminals())
+            longest = max( nodes, key=lambda x:x.branch_length ) 
+            if self.tree.root != longest:
+                reroot_mid_fat_edge( self.tree, longest )
+            #longest = get_lensorted_int_edges( self.tree )
+            #self.tree.reroot_at_edge( list(longest)[-1], update_splits=True )
+        elif strategy == 'longest_internal_edge_n':
+            nodes = list(self.tree.get_nonterminals())
+            longest = max( nodes, key=lambda x:x.branch_length 
+                    if len(x.get_terminals()) >= n else -1.0) 
+            reroot_mid_fat_edge( self.tree, longest )
+            #longest = get_lensorted_int_edges( self.tree, n )
+            #self.tree.reroot_at_edge( list(longest)[-1], update_splits=True )
+
+
+    def reorder_tree( self ):
+        self._ord_terms = []
+        def reorder_tree_rec( clade ):
+            if clade.is_terminal():
+                self._ord_terms.append( clade )
+                return clade,clade
+            clade.clades.sort( key=lambda x:len(x.get_terminals()), reverse = True)
+            for c in clade.clades:
+                c.fc,c.lc = reorder_tree_rec( c ) 
+            return clade.clades[0].fc,clade.clades[-1].lc 
+            #clade.fc, clade.lc = clade.clades[0], clade.clades[-1]
+        
+        reorder_tree_rec( self.tree.root )
+        last = None # self._ord_terms[-1]
+        for c in self._ord_terms:
+            c.pc = last
+            if last:
+                last.nc = c
+            last = c
+        c.nc = None
+        #self._ord_terms[-1].nc = None # self._ord_terms[0]
+
+
+    def get_subtree_leaves( self, full_names = False ):
+
+        subtrees = []
+        def rec_subtree_leaves( clade ):
+            if not len(clade.clades):
+                return [clade.name]
+            leaves = []
+            for c in clade.clades:
+                leaves += rec_subtree_leaves( c )
+            leaves = [l for l in leaves if l]
+            subtrees.append( (clade.name if clade.name else "",leaves)  )
+            return leaves 
+
+        rec_subtree_leaves( self.tree.root )
+        return subtrees
+
+    def get_clade_names( self, full_names = False, leaves = True, internals = True ):
+        clades = []
+        if leaves:
+            clades += self.tree.get_terminals()
+        if internals:
+            clades += self.tree.get_nonterminals()
+        if full_names:
+            def rec_name( clade, nam = "" ):
+                ret = []
+                if not nam and not clade.name:
+                    lnam = ""
+                elif not nam:
+                    lnam = clade.name 
+                elif not clade.name:
+                    lnam = nam
+                else:
+                    lnam = self.lev_sep.join( [nam, clade.name if clade.name else ""] ) 
+                ret += [lnam] if lnam else [] 
+                for c in clade.clades:
+                    ret += rec_name(c,lnam) 
+                return ret
+            names = set(rec_name(self.tree.root))
+        else:
+            names = set([c.name for c in clades])
+        return sorted(names)
+
diff --git a/tab.py b/tab.py
new file mode 100644
index 0000000..f267023
--- /dev/null
+++ b/tab.py
@@ -0,0 +1,106 @@
+from pandas import *
+import sys
+import re
+import msg
+
+class Tab:
+
+    def __init__( self, fin = sys.stdin, sep = "\t" ):
+        self.data = read_csv( fin, sep=sep, index_col=0 ) 
+
+    def sel_rows( self, col = None, val = None ):
+        #self.data = self.data[self.data[col] == val]
+        pass     
+
+    def eq( self, x, y ):
+        t = type(y)
+        if x == y:
+            return True
+        try:
+            return t(x) == y
+        except Exception:
+            return False
+
+    def get_cols( self, row = None, val = None, regex = None ):
+        cols = []
+        if row is None:
+            if regex:
+                for c,cs in self.data.iteritems():
+                    if any((re.search(regex,str(v)) for v in cs)):
+                        #del self.data[c]
+                        cols.append(c)
+            elif val:
+                for c,cs in self.data.iteritems():
+                    if any((val == str(v) for v in cs)):
+                    #if val not in cs:
+                        #del self.data[c]
+                        cols.append(c)
+            else:
+                msg.exit("Error") 
+        else:
+            if regex:
+                for c,cs in self.data.iteritems():
+                    if re.search(regex,str(cs[row])):
+                        #del self.data[c]
+                        cols.append(c)
+            elif val:
+                for c,cs in self.data.iteritems():
+                    if self.eq(val, cs[row]):
+                        #del self.data[c]
+                        cols.append(c)
+            else:
+                msg.exit("Error") 
+        return cols
+
+    def sel_columns( self, row = None, val = None, regex = None ):
+        cols = set(self.get_cols( row, val, regex ))
+        for c in set(self.data.columns) - cols:
+            del self.data[c]
+
+    def sub( self, row = None, col = None, val = None, regex = None, new_val = None, inverse = False ):
+        if row and col:
+            if regex:
+                if inverse:
+                    if not re.match( regex, str(self.data[col][row]) ):
+                        self.data[col][row] = new_val
+                else:
+                    self.data[col][row] = re.sub( regex, new_val, str(self.data[col][row]) )
+            else:
+                self.data[col][row] = new_val
+        elif col:
+            if regex:
+                if inverse:
+                    for i,r in enumerate(self.data[col]):
+                        if not re.match( regex, str(r) ):
+                            self.data[col][i] = new_val
+                else:
+                    for i,r in enumerate(self.data[col]):
+                        self.data[col][i] = re.sub( regex, new_val, str(r) )
+            elif val:
+                for i,r in enumerate(self.data[col]):
+                    if str(r) == val:
+                        self.data[col][i] = new_val
+            else:
+                msg.exit("Error")
+        elif row:
+            if regex:
+                if inverse:
+                    for i,c in enumerate(self.data.ix[row]):
+                        if not re.match( regex, str(c) ):
+                            self.data.ix[row][i] = new_val
+                else:
+                    for i,c in enumerate(self.data.ix[row]):
+                        self.data.ix[row][i] = re.sub( regex, new_val, str(c) )
+            elif val:
+                for i,c in enumerate(self.data.ix[row]):
+                    if str(c) == val:
+                        self.data.ix[row][i] = new_val
+            else:
+                msg.exit("Error")
+        
+        #cols = self.get_cols( row, val, regex )
+         
+
+    def save( self, outf = sys.stdout, sep = "\t" ):
+        self.data.to_csv(outf,sep="\t")
+
diff --git a/tab_sel_cols.py b/tab_sel_cols.py
new file mode 100644
index 0000000..da46a8a
--- /dev/null
+++ b/tab_sel_cols.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+import util_argparse as ap
+from tab import *
+
+def read_params():
+    p = ap.util_argparse() 
+    p.inp_0()
+    p.out_0()
+    p.arg( '-r','--row',type=str, default = None)  
+    p.arg( '-v','--value',type=str, default = None)  
+    p.arg( '--regex',type=str, default = None)  
+    p.arg( '-s','--sep',type=str, default = "\t")  
+    p.parse()
+    return p
+
+if __name__ == '__main__':
+    parser = read_params()
+    args = parser.args
+    
+    tab = Tab( fin = parser.get_inp_0(), sep = args['sep'] ) 
+    tab.sel_columns( args['row'], args['value'], args['regex'] )
+    tab.save( parser.get_out_0() )
+
diff --git a/tab_sub.py b/tab_sub.py
new file mode 100644
index 0000000..f1d7c16
--- /dev/null
+++ b/tab_sub.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+import util_argparse as ap
+from tab import *
+
+def read_params():
+    p = ap.util_argparse() 
+    p.inp_0()
+    p.out_0()
+    p.arg( '-r','--row',type=str, default = None)  
+    p.arg( '-c','--col',type=str, default = None)  
+    p.arg( '-v','--value',type=str, default = None)  
+    p.arg( '-i','--inverse', action='store_true', default = False)  
+    p.arg( '--regex',type=str, default = None)  
+    p.arg( '--new_val',type=str, default = None)  
+    p.arg( '-s','--sep',type=str, default = "\t")  
+    p.parse()
+    return p
+
+if __name__ == '__main__':
+    parser = read_params()
+    args = parser.args
+    
+    tab = Tab( fin = parser.get_inp_0(), sep = args['sep'] ) 
+    tab.sub( args['row'], args['col'], args['value'], args['regex'], args['new_val'], args['inverse'] )
+    tab.save( parser.get_out_0() )
+
diff --git a/tree_cores.py b/tree_cores.py
new file mode 100644
index 0000000..e51bb67
--- /dev/null
+++ b/tree_cores.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+import sys
+import utils 
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+
+def read_params( args ):
+    p = ap.ArgumentParser(
+            description='Screen clusters of taxa for core genes')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('outfile', nargs='?', default=None, type=str,
+            help=   "the output core file [stdout if not present]")
+    p.add_argument('-f', metavar="File containing sets of taxa",
+            default=None, type=str )
+    p.add_argument('-e', metavar="Error rate [def 0.95]",
+            default=0.95, type=float )
+    p.add_argument('-s', metavar="Subtree of interest",
+            default=None, type=str )
+    p.add_argument('--lev_sep', metavar="Taxonomic level separator",
+            default=".", type=str )
+    p.add_argument('--skip_qm', metavar="Whether to skip question mark clades or not",
+            default=1, type=int )
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'], lev_sep = args['lev_sep'] )
+    cores = tree.find_cores(args['f'], error_rate = args['e'], subtree = args['s'], skip_qm = args['skip_qm'])
+
+    with utils.openw( args['outfile'] ) as outf:
+        for k,v in sorted(cores.items(),key=lambda x:x[0]):
+            for vv in v:
+                outf.write( "\t".join( [str(s) for s in [k]+list(vv)]) +"\n" )
+
+    #ctree.export_cores( args['outfile']  )
+    """
+    ctree.reroot( strategy = args['s'], tf = args['f'] )
+    ctree.export( args['outtree'] )
+    """
diff --git a/tree_get_leaves.py b/tree_get_leaves.py
new file mode 100644
index 0000000..3b3867b
--- /dev/null
+++ b/tree_get_leaves.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Get the leaves of a tree')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('out_file', nargs='?', default=None, type=str,
+            help=   "the out file [stdout if not present]")
+
+    """
+    p.add_argument('-f', default=None, type=str,
+            help=   "file containing the list of taxonomic clades to test " 
+                    "end the corresponding leaf taxa" )
+
+    st = ['lca','ltcs']
+    p.add_argument('-s', choices=st, default='lca', type=str,
+            help=  "select the lcs strategy")
+    """
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    ppa = ppa.PpaTree( args['intree'] )
+    res = ppa.get_clade_names( args['intree'], internals = False )
+    with (open( args['out_file'], "w") if args['out_file'] else sys.stdout) as outf:
+        outf.write( "\n".join( res ) + "\n" ) 
+
diff --git a/tree_get_subtrees.py b/tree_get_subtrees.py
new file mode 100644
index 0000000..51aa552
--- /dev/null
+++ b/tree_get_subtrees.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Get the leaves of a tree')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('out_file', nargs='?', default=None, type=str,
+            help=   "the out file [stdout if not present]")
+
+    """
+    p.add_argument('-f', default=None, type=str,
+            help=   "file containing the list of taxonomic clades to test " 
+                    "end the corresponding leaf taxa" )
+
+    st = ['lca','ltcs']
+    p.add_argument('-s', choices=st, default='lca', type=str,
+            help=  "select the lcs strategy")
+    """
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    ppa = ppa.PpaTree( args['intree'] )
+    res = ppa.get_subtree_leaves( args['intree'])
+    with (open( args['out_file'], "w") if args['out_file'] else sys.stdout) as outf:
+        for r in res:
+            #outf.write( "\t".join([r[0]]+r[1]) +  "\n" ) 
+            outf.write( "\t".join(r[1]) +  "\n" ) 
+
diff --git a/tree_markers.py b/tree_markers.py
new file mode 100644
index 0000000..306f42a
--- /dev/null
+++ b/tree_markers.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa 
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found" )
+    sys.exit(-1)
+
+
+def read_params( args ):
+    p = ap.ArgumentParser(
+            description='Screen core genes for markers')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('outfile', nargs='?', default=None, type=str,
+            help=   "the output core file [stdout if not present]")
+    p.add_argument('--hitmap', metavar="File containing global hits",
+            default=None, type=str )
+    p.add_argument('--cores', metavar="File containing unique cores",
+            default=None, type=str )
+    p.add_argument('--core_info', metavar="File containing unique core info",
+            default=None, type=str )
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'] )
+    cores = tree.find_markers(args['cores'], args['hitmap'], args['core_info'])
+
+    with open( args['outfile'], "w" ) as outf:
+        for k,v in sorted(cores.items(),key=lambda x:x[0]):
+            if len(v) > 1:
+                continue
+            outf.write( "\t".join( [str(k)] + [str(s) for s in v]) + "\n" )
diff --git a/tree_pairwisedists.py b/tree_pairwisedists.py
new file mode 100644
index 0000000..406b698
--- /dev/null
+++ b/tree_pairwisedists.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+import sys
+import utils
+import bz2 
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Finds pairwise distances between nodes in the tree based on branch lengths')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree")
+    p.add_argument('out_file', nargs='?', default=None, type=str,
+            help=   "the output file (b2zipped if ending with '.bz2')\n"
+                    "[stdout if not present]")
+    p.add_argument( '-n',  action='store_true', 
+                    help = "Distances normalized with respect to the total branch length" )
+    p.add_argument( '-m',  action='store_true', 
+                    help = "Output a matrix" )
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    ppatree = ppa.PpaTree( args['intree'] )
+
+    dists = ppa.dist_matrix(ppatree.tree) 
+    tbl = ppatree.tree.total_branch_length() if args['n'] else 1.0
+    #tbl = ppatree.tree.total_branch_length()-1.0 if args['n'] else 1.0
+
+    with utils.openw( args['out_file'] ) as out:
+        if args['m']:
+            keys = sorted(dists.keys())
+            out.write( "\t".join(["ID"]+keys) +"\n" )
+            for k1 in keys:
+                out.write( "\t".join([k1]+[str(dists[k1][k2]/tbl) for k2 in keys]) +"\n" )
+        else:
+            for k1,v1 in dists.items():
+                for k2,v2 in v1.items():
+                    if k1 < k2:
+                        out.write( "\t".join([k1,k2,str(v2/tbl)]) +"\n" )    
+
diff --git a/tree_precision.py b/tree_precision.py
new file mode 100644
index 0000000..8c50265
--- /dev/null
+++ b/tree_precision.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Compute taxonomic precision')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('out_file', nargs='?', default=None, type=str,
+            help=   "the out file [stdout if not present]")
+    p.add_argument('-f', default=None, type=str,
+            help=   "file containing the list of taxonomic clades to test " 
+                    "end the corresponding leaf taxa" )
+
+    st = ['lca','ltcs']
+    p.add_argument('-s', choices=st, default='lca', type=str,
+            help=  "select the lcs strategy")
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'] )
+    res = tree.tax_precision( args['f'], strategy = args['s'] )
+    with open( args['out_file'], "w") as outf:
+        for r in res:
+            outf.write( "\t".join( r ) + "\n" ) 
+
diff --git a/tree_prune.py b/tree_prune.py
new file mode 100644
index 0000000..d578b07
--- /dev/null
+++ b/tree_prune.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa 
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Extract sub tree from atree')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('outtree', nargs='?', default=None, type=str,
+            help=   "the output PhyloXML tree [stdout if not present]")
+    p.add_argument('-f', metavar="File containing clade names",
+            default=None, type=str )
+    p.add_argument( '-n', metavar="N leaves for longest_internal_edge_n, or N for n_anc", 
+                    default=1, type=int)
+    p.add_argument( '--name', metavar="Name of the new root (the name must already exists)", 
+                    default=None, type=str)
+    p.add_argument( '--newname', metavar="New name ", 
+                    default=None, type=str)
+
+    reroot_st = ['root_name','lca','ltcs','longest_edge','longest_internal_edge','longest_internal_edge_n',"n_anc"]
+    #reroot_st = ['lca','ltcs','midpoint','longest_edge','longest_internal_edge','longest_internal_edge_n']
+    p.add_argument('-s', choices=reroot_st, default='longest_internal_edge', type=str,
+            help=  "select rerooting strategy")
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'] )
+    tree.prune( strategy = args['s'], fn = args['f'], n = args['n'], name = args['name'], newname = args['newname'] )
+    tree.export( args['outtree'] )
diff --git a/tree_recall.py b/tree_recall.py
new file mode 100644
index 0000000..492d709
--- /dev/null
+++ b/tree_recall.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Compute taxonomic recall')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('out_file', nargs='?', default=None, type=str,
+            help=   "the out file [stdout if not present]")
+    p.add_argument('-f', default=None, type=str,
+            help=   "file containing the list of taxonomic clades to test " 
+                    "end the corresponding leaf taxa" )
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'] )
+    res = tree.tax_recall( args['f'] )
+    with open( args['out_file'], "w") as outf:
+        for r in res:
+            outf.write( "\t".join( r ) + "\n" ) 
+
diff --git a/tree_rename.py b/tree_rename.py
new file mode 100644
index 0000000..79a872f
--- /dev/null
+++ b/tree_rename.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import sys
+
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Extract a subtree from a tree')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('outtree', nargs='?', default=None, type=str,
+            help=   "the out file [stdout if not present]")
+    p.add_argument('-f', default=None, type=str,
+            help=   "file containing the list of clades for lca and ltcs " ) 
+    p.add_argument('-n', default=None, type=str,
+            help=   "name of the new root " )
+
+    st = ['root_name','lca','ltcs']
+    p.add_argument('-s', choices=st, default='root_name', type=str,
+            help=  "set the strategy the select the root of the new tree")
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    ctree = ppa.PpaTree( args['intree'] )
+    if args['s'] in ['lca','ltcs']:
+        with open( args['f'] ) as inp:
+            for l in inp:
+                line = l.strip().split('\t')
+                ctree.rename( strategy = args['s'], n = line[0], terms = line[1:]  )
+    else:
+        ctree.rename( strategy = args['s'], n = args['n'] )
+    ctree.export( args['outtree'] )
+
diff --git a/tree_reroot.py b/tree_reroot.py
new file mode 100644
index 0000000..f4de27d
--- /dev/null
+++ b/tree_reroot.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Reroot trees')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('outtree', nargs='?', default=None, type=str,
+            help=   "the output PhyloXML tree [stdout if not present]")
+    p.add_argument('-f', metavar="File containing clade names",
+            default=None, type=str )
+    p.add_argument( '-n', metavar="N leaves for longest_internal_edge_n", 
+                    default=1, type=int)
+    p.add_argument( '--name', metavar="Name of the new root (the name must already exists)", 
+                    default=None, type=str)
+
+    reroot_st = ['name','lca','ltcs','longest_edge','longest_internal_edge','longest_internal_edge_n']
+    #reroot_st = ['lca','ltcs','midpoint','longest_edge','longest_internal_edge','longest_internal_edge_n']
+    p.add_argument('-s', choices=reroot_st, default='longest_internal_edge', type=str,
+            help=  "select rerooting strategy")
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'] )
+    tree.reroot( strategy = args['s'], tf = args['f'], n = args['n']) # , name = args['name'] )
+    tree.export( args['outtree'] )
+
diff --git a/tree_select_markers.py b/tree_select_markers.py
new file mode 100644
index 0000000..59e0437
--- /dev/null
+++ b/tree_select_markers.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa 
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found" )
+    sys.exit(-1)
+
+
+def read_params( args ):
+    p = ap.ArgumentParser(
+            description='Select top markers')
+
+    p.add_argument( 'infile', nargs='?', default=None, type=str,
+            help=   "the input marker file [stdin if not present]")
+    p.add_argument('outfile', nargs='?', default=None, type=str,
+            help=   "the output core file [stdout if not present]")
+    p.add_argument('-n', metavar="Maximum number of markers to be selected"
+                                 "for each clade",
+            default=100, type=int )
+    p.add_argument('--th', metavar="Threshold on markerness",
+            default=None, type=str )
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( None )
+    markers = tree.select_markers( args['infile'], markerness_th = args['th'], max_markers = args['n'] )
+
+    with open( args['outfile'], "w" ) as outf:
+        for clade in markers:
+            for m in clade:
+                outf.write( "\n".join( m ) + "\n" )
+
+
diff --git a/tree_subtree.py b/tree_subtree.py
new file mode 100644
index 0000000..d70e602
--- /dev/null
+++ b/tree_subtree.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+import sys
+
+try:
+    import argparse as ap
+except ImportError:
+    sys.stderr.write( "argparse not found" )
+    sys.exit(-1)
+
+try:
+    import pyphlan as ppa
+except ImportError:
+    sys.stderr.write( "pyphlan.py not found\n" )
+    sys.exit(-1)
+
+
+def read_params( args ):
+    p = ap.ArgumentParser(description='Extract sub tree from atree')
+
+    p.add_argument( 'intree', nargs='?', default=None, type=str,
+            help=   "the input tree [stdin if not present]")
+    p.add_argument('outtree', nargs='?', default=None, type=str,
+            help=   "the output PhyloXML tree [stdout if not present]")
+    p.add_argument('-f', metavar="File containing clade names",
+            default=None, type=str )
+    p.add_argument( '-n', metavar="N leaves for longest_internal_edge_n, or N for n_anc", 
+                    default=1, type=int)
+
+    reroot_st = ['name','lca','ltcs','longest_edge','longest_internal_edge','longest_internal_edge_n',"n_anc"]
+    #reroot_st = ['lca','ltcs','midpoint','longest_edge','longest_internal_edge','longest_internal_edge_n']
+    p.add_argument('-s', choices=reroot_st, default='longest_internal_edge', type=str,
+            help=  "select rerooting strategy")
+
+    return vars( p.parse_args() )
+
+
+if __name__ == "__main__":
+    args = read_params( sys.argv )
+    tree = ppa.PpaTree( args['intree'] )
+    tree.subtree( strategy = args['s'], fn = args['f'], n = args['n'] )
+    tree.export( args['outtree'] )
diff --git a/util_argparse.py b/util_argparse.py
new file mode 100644
index 0000000..b9c61f5
--- /dev/null
+++ b/util_argparse.py
@@ -0,0 +1,41 @@
+import argparse as ap
+import sys
+
+
+class util_argparse:
+
+    def __init__( self, script_name = None ):
+        self.script_name = script_name
+        self.parser = ap.ArgumentParser()
+        self.arg = self.parser.add_argument
+        self.args = {}
+
+    def inp_0( self ):
+        self.arg( 'inp', metavar='INPUT_FILE', type=str, nargs='?', default=None )
+
+    def out_0( self ):
+        self.arg( 'out', metavar='OUTPUT_FILE', type=str, nargs='?', default=None )
+
+    def version( self ):
+        arg( '-v','--version', action='store_true', 
+             help="Prints the current "+self.script_name+" version and exit\n" if self.script_name else "" ) 
+
+    def get_inp_0( self ):
+        if not 'inp' in self.args:
+            return None
+        if self.args['inp'] is None:
+            return sys.stdin
+        return self.args['inp']
+    
+    def get_out_0( self ):
+        if not 'out' in self.args:
+            return None
+        if self.args['out'] is None:
+            return sys.stdout
+        return self.args['out']
+
+    def parse( self ):
+        self.args = vars(self.parser.parse_args())
+    
+
+
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..b7178e4
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,23 @@
+import bz2
+import sys
+
+def openr( fn, mode = "r" ):
+    if fn is None:
+        return sys.stdin
+    return bz2.BZ2File(fn) if fn.endswith(".bz2") else open(fn,mode)
+
+def openw( fn ):
+    if fn is None:
+        return sys.stdout
+    return bz2.BZ2File(fn,"w") if fn.endswith(".bz2") else open(fn,"w")
+
+
+
+def is_number(s):
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pyphlan.git



More information about the debian-med-commit mailing list