[med-svn] [phybin] 01/02: Imported Upstream version 0.3
Andreas Tille
tille at debian.org
Tue Dec 8 12:53:27 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository phybin.
commit bff5fc5bc3407ef25f24bcf0d46222d635b746ee
Author: Andreas Tille <tille at debian.org>
Date: Tue Dec 8 13:36:07 2015 +0100
Imported Upstream version 0.3
---
.gitignore | 3 +
Bio/Phylogeny/PhyBin.hs | 560 +++++++++++++++++++
Bio/Phylogeny/PhyBin/Binning.hs | 421 ++++++++++++++
Bio/Phylogeny/PhyBin/CoreTypes.hs | 359 ++++++++++++
Bio/Phylogeny/PhyBin/Parser.hs | 252 +++++++++
Bio/Phylogeny/PhyBin/PreProcessor.hs | 95 ++++
Bio/Phylogeny/PhyBin/RFDistance.hs | 444 +++++++++++++++
Bio/Phylogeny/PhyBin/Util.hs | 139 +++++
Bio/Phylogeny/PhyBin/Visualize.hs | 449 +++++++++++++++
DEVLOG.md | 140 +++++
LICENSE | 31 ++
Main.hs | 617 +++++++++++++++++++++
Makefile | 121 ++++
README.md | 166 ++++++
Setup.hs | 12 +
TestAll.hs | 41 ++
old_script/cheztrace.ss | 62 +++
old_script/find_tree_matches.ss | 420 ++++++++++++++
old_script/iu-exptime.ss | 67 +++
old_script/iu-match.ss | 698 +++++++++++++++++++++++
old_script/match.ss | 800 +++++++++++++++++++++++++++
old_script/parser2.ss | 45 ++
old_script/parser_fullnames.ss | 187 +++++++
phybin.cabal | 149 +++++
point_check.sh | 6 +
stripTable.hs | 50 ++
tests/t1/112.tr | 1 +
tests/t1/13.tr | 1 +
tests/t2/116.tr | 1 +
tests/t2/233.tr | 1 +
tests/t30_mismatched/mismatched.tr | 3 +
tests/t3_consensus/cluster1_284_alltrees.tr | 284 ++++++++++
tests/t3_consensus/cluster1_284_consensus.tr | 1 +
tests/t4_consensus/cluster1_16_alltrees.tr | 16 +
tests/t4_consensus/cluster1_16_consensus.tr | 1 +
tests/t5_consensus/cluster1_35_alltrees.tr | 35 ++
tests/t5_consensus/cluster1_35_consensus.tr | 1 +
vary_branchlen.sh | 47 ++
website/trees.jpg | Bin 0 -> 40409 bytes
website/website.css | 74 +++
40 files changed, 6800 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7431621
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+Rickettsia
+Rickettsiales
+Wolbachia
\ No newline at end of file
diff --git a/Bio/Phylogeny/PhyBin.hs b/Bio/Phylogeny/PhyBin.hs
new file mode 100644
index 0000000..722d5d3
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin.hs
@@ -0,0 +1,560 @@
+{-# LANGUAGE ScopedTypeVariables, RecordWildCards, TypeSynonymInstances, CPP #-}
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
+{-# OPTIONS_GHC -fwarn-unused-imports #-}
+
+-- | This module contains the code that does the tree normalization and binning.
+-- It's the heart of the program.
+
+module Bio.Phylogeny.PhyBin
+ ( driver, binthem, normalize, annotateWLabLists, unitTests, acquireTreeFiles,
+ deAnnotate, retrieveHighlights, matchAnyHighlight )
+ where
+
+import qualified Data.Foldable as F
+import Data.Function (on)
+import Data.List (delete, minimumBy, sortBy, foldl1', foldl', intersperse, isPrefixOf)
+import Data.Maybe (fromMaybe, catMaybes)
+import Data.Either (partitionEithers)
+import Data.Time.Clock
+import Data.Tuple (swap)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.Map as M
+import qualified Data.Set as S
+import qualified Data.Text.Lazy.IO as T
+import qualified Data.Vector as V
+import qualified Data.Vector.Unboxed as U
+import Control.Monad (forM, forM_, filterM, when, unless)
+import qualified Control.Concurrent.Async as Async
+import Control.Exception (evaluate)
+import Control.Applicative ((<$>),(<*>))
+import Control.Concurrent (Chan)
+import System.FilePath (combine)
+import System.Directory (doesFileExist, doesDirectoryExist, createDirectoryIfMissing,
+ getDirectoryContents, getCurrentDirectory)
+import System.IO (openFile, hClose, IOMode(..), stdout, withFile)
+import System.Info (os)
+import System.Process (system)
+import System.Exit (ExitCode(..))
+import Test.HUnit ((~:),(~=?),Test,test)
+import qualified Data.Clustering.Hierarchical as C
+import qualified Data.GraphViz as Gv
+
+-- For visualization:
+import Text.PrettyPrint.HughesPJClass hiding (char, Style)
+import Data.GraphViz.Printing (renderDot)
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Bio.Phylogeny.PhyBin.Parser (parseNewick, parseNewicks)
+import Bio.Phylogeny.PhyBin.PreProcessor (collapseBranchLenThresh,
+ collapseBranchBootStrapThresh, pruneTreeLeaves)
+import Bio.Phylogeny.PhyBin.Visualize (dotToPDF, dotNewickTree, viewNewickTree, dotDendrogram)
+import Bio.Phylogeny.PhyBin.RFDistance
+import Bio.Phylogeny.PhyBin.Binning
+import Bio.Phylogeny.PhyBin.Util
+
+
+import Debug.Trace
+----------------------------------------------------------------------------------------------------
+
+-- Turn on for extra invariant checking:
+debug :: Bool
+debug = True
+
+#ifdef SEQUENTIALIZE
+#warning "SEQUENTIALIZING execution. Disabling all parallelism."
+type Async t = t
+async a = a
+wait x = return x
+#else
+type Async t = Async.Async t
+async = Async.async
+wait = Async.wait
+#endif
+
+-- | A dendrogram PLUS consensus trees at the intermediate points.
+data DendroPlus a = DPLeaf (FullTree a)
+ | DPBranch !Double (FullTree ()) (DendroPlus a) (DendroPlus a)
+
+----------------------------------------------------------------------------------------------------
+
+-- | Driver to put all the pieces together (parse, normalize, bin)
+driver :: PhyBinConfig -> IO ()
+driver cfg at PBC{ verbose, num_taxa, name_hack, output_dir, inputs=files,
+ do_graph, branch_collapse_thresh, bootstrap_collapse_thresh, highlights,
+ dist_thresh, clust_mode, rfmode, preprune_labels,
+ print_rfmatrix } =
+ -- Unused: do_draw
+ do
+ --------------------------------------------------------------------------------
+ -- First, find out where we are and open the files:
+ --------------------------------------------------------------------------------
+ cd <- getCurrentDirectory
+ --putStrLn$ "PHYBIN RUNNING IN DIRECTORY: "++ cd
+
+ bl <- doesDirectoryExist output_dir
+ unless bl $ createDirectoryIfMissing True output_dir
+
+ if isPrefixOf "mingw" os then
+ -- TODO: Need a portable version of this. 'filemanip' would do:
+ putStrLn$ "Not cleaning away previous phybin outputs (TODO: port this to Windows)."
+ else do
+ putStrLn$ "Cleaning away previous phybin outputs..."
+ system$ "rm -f "++output_dir++"/dendrogram.*"
+ system$ "rm -f "++output_dir++"/cluster*"
+ system$ "rm -f "++output_dir++"/distance_matrix.txt"
+ system$ "rm -f "++output_dir++"/WARNINGS.txt"
+ return ()
+
+ putStrLn$ "Parsing "++show (length files)++" Newick tree files."
+ --putStrLn$ "\nFirst ten \n"++ concat (map (++"\n") $ map show $ take 10 files)
+
+ --------------------------------------------------------------------------------
+ -- Next, parse the files and do error checking and annotation.
+ --------------------------------------------------------------------------------
+
+ (goodFiles,warnings1) <- fmap partitionEithers $
+ forM files $ \ file -> do
+ reg <- is_regular_file file
+ if reg
+ then return$ Left file
+ else return$ Right file
+ let num_files = length goodFiles
+ bstrs <- mapM B.readFile goodFiles
+ let (labelTab, fulltrees) = parseNewicks name_hack (zip files bstrs)
+ nameToLabel = M.fromList $ map swap $ M.toList labelTab
+
+ highlightTrs <- retrieveHighlights name_hack labelTab highlights
+
+ let allTaxaLs = M.elems labelTab
+ totalTaxa = length allTaxaLs
+ putStrLn$ "\nTotal unique taxa ("++ show (M.size labelTab) ++"):\n "++ (unwords allTaxaLs)
+
+ expected_num_taxa <-
+ case preprune_labels of
+ Nothing -> return totalTaxa
+ Just ls -> return (length ls)
+ case rfmode of
+ TolerantNaive -> return ()
+ HashRF -> putStrLn$ "Note: defaulting to expecting ALL "++show expected_num_taxa++" taxa to be present."
+ --------------------------------------------------------------------------------
+
+ case bootstrap_collapse_thresh of
+ Just thr -> putStrLn$" !+ Collapsing branches of bootstrap value less than "++show thr
+ Nothing -> return ()
+
+ case branch_collapse_thresh of
+ Just thr -> putStrLn$" !+ Collapsing branches of length less than "++show thr
+ Nothing -> return ()
+
+ let do_one :: FullTree DefDecor -> IO (Int, [FullTree DefDecor], Maybe (Int, String))
+ do_one (FullTree treename lblAcc parsed) = do
+ let
+ pruned0 = parsed
+
+ -- Important: MUST collapse bootstrap first if we're doing both:
+ pruned1 = case bootstrap_collapse_thresh of
+ Nothing -> pruned0
+ Just thr -> collapseBranchBootStrapThresh thr pruned0
+ pruned2 = case branch_collapse_thresh of
+ Nothing -> pruned1
+ Just thr -> collapseBranchLenThresh thr pruned1
+ pruned3 = case preprune_labels of
+ Nothing -> Just pruned2
+ Just lst -> pruneTreeLeaves (S.fromList (map lkup lst)) pruned2
+
+ lkup trnm =
+ case M.lookup trnm nameToLabel of
+ Nothing -> error$ "Tree name "++show trnm++" did not occur ANYWHERE in the input set."
+ Just lab -> lab
+
+ -- TEMPTOGGLE
+ -- when False $ do putStrLn$ "DRAWING TREE"
+ -- viewNewickTree "Annotated" (FullTree file lblAcc' annot)
+ -- viewNewickTree "Normalized" (FullTree file lblAcc' normal)
+ -- putStrLn$ "WEIGHTS OF NORMALIZED' CHILDREN: "++
+ -- show (map get_weight$ get_children normal)
+ when verbose$ putStr "."
+ case pruned3 of
+ Nothing -> do when verbose$ putStrLn$ "\n WARNING: tree empty after preprocessing, discarding: "++ treename
+ return (0, [], Just (0, treename))
+ Just pruned3' ->
+ let numL = numLeaves pruned3'
+ looksOK = return (numL, [FullTree treename lblAcc pruned3'], Nothing)
+ discardIt = return (0, [], Just (numL, treename))
+ in case rfmode of
+ TolerantNaive -> looksOK
+ HashRF | numL == expected_num_taxa -> looksOK
+ | otherwise -> do
+ when verbose$
+ putStrLn$ "\n WARNING: tree contained unexpected number of leaves ("
+ ++ show numL ++"): "++ treename
+ discardIt
+
+ results <- mapM do_one fulltrees
+ let (counts::[Int], validtreess, pairs:: [Maybe (Int, String)]) = unzip3 results
+ let validtrees = concat validtreess
+ warnings2 = catMaybes pairs
+
+ putStrLn$ "\nNumber of input tree files: " ++ show num_files
+ -- mapM_ (print . displayStrippedTree) validtrees -- Debugging.
+ case preprune_labels of
+ Nothing -> return ()
+ Just lst -> putStrLn$ "PRUNING trees to just these taxa: "++show lst
+ when (length warnings2 > 0) $
+ putStrLn$ "Number of bad/unreadable input tree files: " ++ show (length warnings2)
+ putStrLn$ "Number of VALID trees (correct # of leaves/taxa): " ++ show (length validtrees)
+ putStrLn$ "Total tree nodes contained in valid trees: "++ show (sum counts)
+ let all_branches = concatMap (F.foldr' (\x ls -> getBranchLen x:ls) []) validtrees
+ unless (null all_branches) $ do
+ putStrLn$ "Average branch len over valid trees: "++ show (avg all_branches)
+ putStrLn$ "Max/Min branch lengths: "++ show (foldl1' max all_branches,
+ foldl1' min all_branches)
+
+ --------------------------------------------------------------------------------
+ -- Next, dispatch on the mode and do the actual clustering or binning.
+ --------------------------------------------------------------------------------
+
+ (classes,binlist,asyncs) <- case clust_mode of
+ BinThem -> do x <- doBins validtrees
+ -- A list of bins, sorted by size:
+ let binlist = reverse $ sortBy (compare `on` fst) $
+ map (\ (tr,OneCluster ls) -> (length ls, OneCluster ls)) $
+ M.toList x
+ return (x,binlist,[])
+ ClusterThem{linkage} -> do
+ (mat, dendro) <- doCluster (rfmode==HashRF) expected_num_taxa linkage validtrees
+ --------------------
+ when print_rfmatrix $ printDistMat stdout mat
+ withFile (combine output_dir ("distance_matrix.txt")) WriteMode $ \ hnd ->
+ printDistMat hnd mat
+ --------------------
+ writeFile (combine output_dir ("dendrogram.txt"))
+ (show$ fmap treename dendro)
+ putStrLn " [finished] Wrote full dendrogram to file dendrogram.txt"
+ sanityCheck dendro
+ --------------------
+ let plotIt mnameMap =
+ if True -- do_graph
+ then async (do
+ t0 <- getCurrentTime
+ let dot = dotDendrogram cfg "dendrogram" 1.0 dendro mnameMap highlightTrs
+ -- Be wary of cycles, something appears to be buggy:
+ putStrLn$ " [async] writing dendrogram as a graph to dendrogram.dot"
+ -- writeFile (combine output_dir "dendrogram.dot")
+ -- (show $ renderDot $ Gv.toDot dot)
+ mtxt <- safePrintDendro dot
+ case mtxt of
+ Nothing ->
+ putStrLn "WARNING: because we couldn't print it, we're not drawing the dendrogram either."
+ Just txt -> do
+ T.writeFile (combine output_dir "dendrogram.dot") txt
+ putStrLn$ " [async] Next, plot dendrogram.pdf"
+ _ <- dotToPDF dot (combine output_dir "dendrogram.pdf")
+ t1 <- getCurrentTime
+ putStrLn$ " [finished] Writing dendrogram diagram ("
+ ++show(diffUTCTime t1 t0)++")")
+ else async (return ())
+ case dist_thresh of
+ Nothing -> do a <- plotIt Nothing
+ return (M.empty,[],[a])
+ -- error "Fully hierarchical cluster output is not finished! Use --editdist."
+ Just dstThresh -> do
+ putStrLn$ "Combining all clusters at distance less than or equal to "++show dstThresh
+ let clusts = sliceDendro (fromIntegral dstThresh) dendro
+ classes = clustsToMap clusts
+ -- Cache the lengths:
+ wlens = [ (length ls, OneCluster ls) | OneCluster ls <- clusts ]
+ sorted0 = reverse$ sortBy (compare `on` fst) wlens -- TODO: Parallel sorting?
+ nameMap = clustsToNameMap (map snd sorted0)
+ gvizAsync <- plotIt (Just nameMap)
+ putStrLn$ "After flattening, cluster sizes are: "++show (map fst sorted0)
+ -- Flatten out the dendrogram:
+ return (classes, sorted0, [gvizAsync])
+
+ unless (null binlist)$ reportClusts clust_mode binlist
+
+ ----------------------------------------
+ -- TEST, TEMPTOGGLE: print out edge weights :
+ -- forM_ (map snd3 results) $ \parsed -> do
+ -- let weights = all_edge_weights (head$ S.toList taxa) parsed
+ -- trace ("weights of "++ show parsed ++" "++ show weights) $
+ -- return ()
+ -- exitSuccess
+ ----------------------------------------
+
+ --------------------------------------------------------------------------------
+ -- Finally, produce all the required outputs.
+ --------------------------------------------------------------------------------
+
+-- putStrLn$ "Final number of tree bins: "++ show (M.size classes)
+
+ unless (null warnings1 && null warnings2) $
+ writeFile (combine output_dir "WARNINGS.txt")
+ ("This file was generated to record all of the files which WERE NOT incorporated successfully into the results.\n" ++
+ "Each of these files had some kind of problem, likely one of the following:\n"++
+ " (1) a mismatched number of taxa (leaves) in the tree relative to the rest of the dataset\n"++
+ " (2) a file that could not be read.\n"++
+ " (3) a file that could not be parsed.\n\n"++
+ concat (map (\ file -> "Not a regular/readable file: "++ file++"\n")
+ warnings1) ++
+ concat (map (\ (n,file) ->
+ "Wrong number of taxa ("++ show n ++"): "++ file++"\n")
+ warnings2))
+
+ async2 <- case clust_mode of
+ BinThem -> outputBins binlist output_dir do_graph
+ ClusterThem{} -> outputClusters expected_num_taxa binlist output_dir do_graph
+
+ -- Wait on parallel tasks:
+ putStrLn$ "Waiting for "++show (length$ async2:asyncs)++" asynchronous tasks to finish..."
+ mapM_ wait (async2:asyncs)
+ putStrLn$ "Phybin completed."
+ --------------------------------------------------------------------------------
+ -- End driver
+ --------------------------------------------------------------------------------
+
+-- filePrefix = "bin"
+filePrefix = "cluster"
+
+--------------------------------------------------------------------------------
+-- Driver helpers:
+--------------------------------------------------------------------------------
+
+-- doBins :: [FullTree DefDecor] -> t1 -> t2 -> IO BinResults
+doBins :: [FullTree DefDecor] -> IO (BinResults StandardDecor)
+doBins validtrees = do
+ putStrLn$ "Creating equivalence classes (bins)..."
+ let classes = --binthem_normed$ zip files $ concat$ map snd3 results
+ binthem validtrees
+ return (classes)
+
+doCluster :: Bool -> Int -> C.Linkage -> [FullTree DefDecor] -> IO (DistanceMatrix, C.Dendrogram (FullTree DefDecor))
+doCluster use_hashrf expected_num_taxa linkage validtrees = do
+ t0 <- getCurrentTime
+ when use_hashrf$ putStrLn " Using HashRF-style algorithm..."
+ let nwtrees = map nwtree validtrees
+ numtrees = length validtrees
+ mat = if use_hashrf
+ then hashRF expected_num_taxa nwtrees
+ else fst (naiveDistMatrix nwtrees)
+ ixtrees = zip [0..] validtrees
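+ -- The distance matrix is stored lower-triangular: row i holds the distances
+ -- to trees 0..i-1, so we index with the larger of the two tree indices first.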
+ dist (i,t1) (j,t2) | j == i = 0
+-- | i == numtrees-1 = 0
+ | j < i = fromIntegral ((mat V.! i) U.! j)
+ | otherwise = fromIntegral ((mat V.! j) U.! i)
+ dist1 a b = trace ("Taking distance between "++show (fst a, fst b)) $ dist a b
+ dendro = fmap snd $ C.dendrogram linkage ixtrees dist
+ -- Force the distance matrix:
+ V.mapM_ evaluate mat
+ t1 <- getCurrentTime
+ putStrLn$ "Time to compute distance matrix: "++show(diffUTCTime t1 t0)
+ putStrLn$ "Clustering using method "++show linkage
+ return (mat,dendro)
+
+
+reportClusts :: ClustMode -> [(Int, OneCluster StandardDecor)] -> IO ()
+reportClusts mode binlist = do
+ let binsizes = map fst binlist
+
+ putStrLn$ " Outcome: "++show (length binlist)++" clusters found, "++show (length$ takeWhile (>1) binsizes)
+ ++" non-singleton, top bin sizes: "++show(take 10 binsizes)
+ putStrLn$" Up to first 30 bin sizes, excluding singletons:"
+ forM_ (zip [1..30] binlist) $ \ (ind, (len, OneCluster ftrees)) -> do
+ when (len > 1) $ -- Omit that long tail of single element classes...
+ putStrLn$show$
+ hcat [text (" * cluster#"++show ind++", members "++ show len ++", "),
+ case mode of
+ BinThem -> vcat [text ("avg bootstraps "++show (get_bootstraps$ avg_trees$ map nwtree ftrees)++", "),
+ text "all: " <> pPrint (filter (not . null) $ map (get_bootstraps . nwtree) ftrees)]
+ ClusterThem{} -> hcat [] ]
+
+-- | Convert a flat list of clusters into a map from individual trees to clusters.
+clustsToMap :: [OneCluster StandardDecor] -> BinResults StandardDecor
+clustsToMap clusts = F.foldl' fn M.empty clusts
+ where
+ fn acc theclust@(OneCluster ftrs) =
+ F.foldl' (fn2 theclust) acc ftrs
+ fn2 theclust acc (FullTree{nwtree}) =
+ M.insert (anonymize_annotated nwtree) theclust acc
+
+-- | Map each tree NAME onto the one-based index (in sorted order) of the cluster it
+-- comes from.
+clustsToNameMap :: [OneCluster StandardDecor] -> M.Map TreeName Int
+clustsToNameMap clusts = foldl' fn M.empty (zip [1..] clusts)
+ where
+ fn acc (ix,(OneCluster ftrs)) =
+ foldl' (\acc' t -> M.insert (treename t) ix acc') acc ftrs
+
+flattenDendro :: (C.Dendrogram (FullTree DefDecor)) -> OneCluster StandardDecor
+flattenDendro dendro =
+ case dendro of
+ C.Leaf (FullTree{treename,labelTable,nwtree}) ->
+ OneCluster [FullTree treename labelTable (annotateWLabLists nwtree)]
+ C.Branch _ left right ->
+ -- TODO: fix quadratic append
+ flattenDendro left `appendClusts` flattenDendro right
+ where
+ appendClusts (OneCluster ls1) (OneCluster ls2) = OneCluster (ls1++ls2)
+
+-- | Turn a hierarchical clustering into a flat clustering.
+sliceDendro :: Double -> (C.Dendrogram (FullTree DefDecor)) -> [OneCluster StandardDecor]
+sliceDendro dstThresh den = loop den
+ where
+ loop br@(C.Branch dist left right)
+ -- Too far apart to combine:
+ | dist > dstThresh = loop left ++ loop right
+ | otherwise = [flattenDendro br]
+ loop br@(C.Leaf _) = [flattenDendro br]
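+
+-- An illustrative sketch (not part of the original module; tA, tB and tC are
+-- hypothetical FullTree values named "treeA"/"treeB"/"treeC"). Slicing at 0.5
+-- keeps the tightly-joined pair together and splits off the distant tree:
+--
+-- > map (map treename . clustMembers) $
+-- >   sliceDendro 0.5 (C.Branch 0.9 (C.Leaf tC) (C.Branch 0.2 (C.Leaf tA) (C.Leaf tB)))
+-- > -- ==> [["treeC"],["treeA","treeB"]]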
+
+
+--------------------------------------------------------------------------------
+
+-- outputClusters :: (Num a1, Ord a1, Show a1) => Int -> [(a1, t, OneCluster a)] -> String -> Bool -> IO ()
+outputClusters :: Int -> [(Int, OneCluster a)] -> String -> Bool -> IO (Async ())
+outputClusters num_taxa binlist output_dir do_graph = do
+ let numbins = length binlist
+ let base i size = combine output_dir (filePrefix ++ show i ++"_"++ show size)
+ let consTrs = [ FullTree "consensus" (labelTable $ head ftrees) nwtr
+ | (_, OneCluster ftrees) <- binlist
+ , let nwtr :: NewickTree StandardDecor
+ nwtr = annotateWLabLists $ fmap (const (Nothing,0)) $
+ consensusTree num_taxa $ map nwtree ftrees ]
+
+ forM_ (zip3 [1::Int ..] binlist consTrs) $ \ (i, (size, OneCluster ftrees), fullConsTr) -> do
+ writeFile (base i size ++".txt") (concat$ map ((++"\n") . treename) ftrees)
+ writeFile (base i size ++"_consensus.tr") (show (displayStrippedTree fullConsTr) ++ "\n")
+ writeFile (base i size ++"_alltrees.tr")
+ (unlines [ show (displayStrippedTree ft) | ft <- ftrees ])
+
+ putStrLn$ " [finished] Wrote contents of each cluster to cluster<N>_<size>.txt"
+ putStrLn$ " [finished] Wrote representative (consensus) trees to cluster<N>_<size>_consensus.tr"
+ if do_graph then do
+ putStrLn$ "Next start the time consuming operation of writing out graphviz visualizations:"
+ asyncs <- forM (zip3 [1::Int ..] binlist consTrs) $
+ \ (i, (size, OneCluster membs), fullConsTr) -> async$ do
+ when (size > 1 || numbins < 100) $ do
+ let dot = dotNewickTree ("cluster #"++ show i) 1.0 fullConsTr
+ _ <- dotToPDF dot (base i size ++ ".pdf")
+ return ()
+ async $ do
+ mapM_ wait asyncs
+ putStrLn$ " [finished] Wrote visual representations of consensus trees to "++filePrefix++"<N>_<size>.pdf"
+ else do putStrLn$ "NOT creating processes to build per-cluster .pdf visualizations. (Not asked to.)"
+ async (return ())
+
+outputBins :: [(Int, OneCluster StandardDecor)] -> String -> Bool -> IO (Async ())
+outputBins binlist output_dir do_graph = do
+ let numbins = length binlist
+ let base i size = combine output_dir (filePrefix ++ show i ++"_"++ show size)
+ let avgs = map (avg_trees . map nwtree . clustMembers . snd) binlist
+ forM_ (zip3 [1::Int ..] binlist avgs) $ \ (i, (size, OneCluster ftrees), avgTree) -> do
+ let FullTree fstName labs _ = head ftrees
+ fullAvgTr = FullTree fstName labs avgTree
+ writeFile (base i size ++".txt") (concat$ map ((++"\n") . treename) ftrees)
+ when debug$ do
+ writeFile (base i size ++".dbg") (show (pPrint avgTree) ++ "\n")
+ -- Printing the average tree instead of the stripped canonical one:
+ writeFile (base i size ++"_avg.tr") (show (displayDefaultTree$ deAnnotate fullAvgTr) ++ "\n") -- FIXME
+
+ putStrLn$ " [finished] Wrote contents of each bin to "++filePrefix++"<N>_<binsize>.txt"
+ putStrLn$ " Wrote representative trees to "++filePrefix++"<N>_<binsize>_avg.tr"
+ if do_graph then do
+ putStrLn$ "Next start the time consuming operation of writing out graphviz visualizations:"
+ asyncs <- forM (zip3 [1::Int ..] binlist avgs) $ \ (i, (size, OneCluster membs), avgTree) -> async $ do
+ let FullTree fstName labs _ = head membs
+ fullAvgTr = FullTree fstName labs avgTree
+ when (size > 1 || numbins < 100) $ do
+ let dot = dotNewickTree ("cluster #"++ show i) (1.0 / avg_branchlen (map nwtree membs))
+ --(annotateWLabLists$ fmap (const 0) tr)
+ -- TEMP FIXME -- using just ONE representative tree:
+ ( --trace ("WEIGHTED: "++ show (head$ trees bentry)) $
+ --(head$ trees bentry)
+ fullAvgTr)
+ _ <- dotToPDF dot (base i size ++ ".pdf")
+ return ()
+ async $ do
+ mapM_ wait asyncs
+ putStrLn$ " [finished] Wrote visual representations of trees to "++filePrefix++"<N>_<size>.pdf"
+ else async (return ())
+--------------------------------------------------------------------------------
+
+-- Monadic mapAccum
+mapAccumM :: Monad m => (acc -> x -> m (acc,y)) -> acc -> [x] -> m (acc,[y])
+mapAccumM fn acc ls = F.foldrM fn' (acc,[]) ls
+ where
+ fn' x (acc,ls) = do (acc',y) <- fn acc x
+ return (acc',y:ls)
+
+fst3 :: (t, t1, t2) -> t
+snd3 :: (t, t1, t2) -> t1
+thd3 :: (t, t1, t2) -> t2
+fst3 (a,_,_) = a
+snd3 (_,b,_) = b
+thd3 (_,_,c) = c
+
+
+-- Compute an average tree from a list of isomorphic trees,
+-- blending the edge lengths (and bootstrap values) of the inputs.
+avg_trees :: [AnnotatedTree] -> AnnotatedTree
+avg_trees origls =
+ fmap unlift $
+ foldIsomorphicTrees (foldl1 sumthem) $
+ map (fmap lift) origls
+ where
+ totalCount = fromIntegral$ length origls
+
+ -- Here we do the actual averaging:
+ unlift :: TempDecor -> StandardDecor
+ unlift (bl, (bs, bootcount), wt, ls) =
+ (StandardDecor (bl / totalCount)
+ (case bootcount of
+ 0 -> Nothing
+ _ -> Just$ round (fromIntegral bs / fromIntegral bootcount))
+ wt ls)
+
+ lift :: StandardDecor -> TempDecor
+ lift (StandardDecor bl bs wt ls) = (bl, (fromMaybe 0 bs, countMayb bs), wt, ls)
+
+ sumthem :: TempDecor -> TempDecor -> TempDecor
+ sumthem (bl1, (bs1, cnt1), wt, ls)
+ (bl2, (bs2, cnt2), _, _) =
+ (bl1+bl2, (bs1 + bs2, cnt1+cnt2), wt, ls)
+ countMayb Nothing = 0
+ countMayb (Just _) = 1
+
+-- Used only by avg_trees above...
+type TempDecor = (Double, (Int, Int), Int, [Label])
+
+avg ls = sum ls / fromIntegral (length ls)
+
+-- | Parse extra trees in addition to the main inputs (for --highlight).
+retrieveHighlights :: (String->String) -> LabelTable -> [FilePath] -> IO [[NewickTree ()]]
+retrieveHighlights name_hack labelTab ls =
+ mapM parseHighlight ls
+ where
+ parseHighlight file = do
+ bs <- B.readFile file
+ let (lt2,htr) = parseNewick labelTab name_hack file bs
+ unless (lt2 == labelTab) $
+ error$"Tree given as --highlight includes taxa not present in main tree set: "++
+ show(M.keys$ M.difference lt2 labelTab)
+ return (map (fmap (const())) htr)
+
+
+-- | Create a predicate that tests trees for consistency with the set of --highlight
+-- (consensus) trees.
+--
+-- Note, tree consistency is not the same as an exact match. It's
+-- like (<=) rather than (==). All trees are consistent with the
+-- "star topology".
+matchAnyHighlight :: [[NewickTree ()]] -> NewickTree () -> Bool
+-- matchAnyHighlight :: [[NewickTree ()]] -> NewickTree () -> Maybe Int
+-- If there is a match, return the index of the highlight that matched.
+matchAnyHighlight highlightTrs =
+ let matchers = map mkMatcher highlightTrs
+ mkMatcher ls = let fns = map compatibleWith ls -- Multiple predicate functions
+ in \ tr -> -- Does it match any predicate?
+ or$ map (\f -> f tr) fns
+ in \ newtr ->
+ any (\f -> f newtr) matchers
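+
+-- An illustrative usage sketch (not part of the original module): build the
+-- predicate from the --highlight files and keep only the trees it matches.
+-- Here `labelTab`, `files`, and `validtrees` are assumed to be in scope, as in
+-- `driver` above:
+--
+-- > do highlightTrs <- retrieveHighlights id labelTab files
+-- >    let isHighlighted = matchAnyHighlight highlightTrs
+-- >    return [ t | t <- validtrees, isHighlighted (fmap (const ()) (nwtree t)) ]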
diff --git a/Bio/Phylogeny/PhyBin/Binning.hs b/Bio/Phylogeny/PhyBin/Binning.hs
new file mode 100644
index 0000000..9c0369b
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/Binning.hs
@@ -0,0 +1,421 @@
+{-# LANGUAGE ScopedTypeVariables, RecordWildCards, TypeSynonymInstances, CPP #-}
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
+{-# OPTIONS_GHC -fwarn-unused-imports #-}
+
+-- | This module contains the code that does the tree normalization and binning.
+
+module Bio.Phylogeny.PhyBin.Binning
+ ( -- * Binning and normalization
+ binthem, normalize, normalizeFT, annotateWLabLists,
+ deAnnotate, OneCluster(..), BinResults, StrippedTree,
+ -- * Utilities and unit tests
+ get_weight, unitTests, anonymize_annotated
+ )
+ where
+
+import Data.Function (on)
+import Data.List (delete, minimumBy, sortBy, insertBy, intersperse, sort)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.Map as M
+import Control.Applicative ((<$>),(<*>))
+import Control.Concurrent (Chan)
+import Test.HUnit ((~:),(~=?),Test,test)
+
+-- For visualization:
+import Text.PrettyPrint.HughesPJClass hiding (char, Style)
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Bio.Phylogeny.PhyBin.Parser (parseNewicks, parseNewick)
+import Bio.Phylogeny.PhyBin.Visualize (dotNewickTree, viewNewickTree)
+
+-- Turn on for extra invariant checking:
+debug :: Bool
+debug = True
+
+----------------------------------------------------------------------------------------------------
+-- Normal form for unordered, unrooted trees
+----------------------------------------------------------------------------------------------------
+
+-- The basic idea is that what we *want* is the following,
+-- ROOT: most balanced point
+-- ORDER: sorted in increasing subtree weight
+
+-- But that's not quite good enough. There are ties to break. To do
+-- that we fall back on the (totally ordered) leaf labels.
+
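+-- For instance (an illustrative sketch mirroring the unit tests at the bottom of
+-- this module), differently rooted/ordered renderings of the same tree all
+-- normalize to one canonical form:
+--
+-- >>> show (displayDefaultTree (deAnnotate (norm "(A,(C,D,E),B);")))
+-- "(C, D, E, (A, B))"
+-- >>> show (displayDefaultTree (deAnnotate (norm "(D,E,C,(B,A));")))
+-- "(C, D, E, (A, B))"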
+--------------------------------------------------------------------------------
+
+
+-- Our sorting criteria for the children of interior nodes:
+compare_childtrees :: AnnotatedTree -> AnnotatedTree -> Ordering
+compare_childtrees node1 node2 =
+ case (subtreeWeight $ get_dec node1) `compare` (subtreeWeight $ get_dec node2) of
+ -- Comparisons on atoms cause problems with respect to determinism between runs if parallelism is introduced.
+ -- Can consider it an optimization for the serial case perhaps:
+-- EQ -> case map deAtom (get_label_list node1) `compare`
+-- map deAtom (get_label_list node2) of
+ EQ -> case (sortedLabels$ get_dec node1) `compare` (sortedLabels$ get_dec node2) of
+ EQ -> error$ "Internal invariant broken. These two children have equal ordering priority:\n"
+ ++ "Pretty printing:\n "
+ ++ show (pPrint node1) ++ "\n " ++ show (pPrint node2)
+ ++ "\nFull data structures:\n "
+ ++ show (node1) ++ "\n " ++ show (node2)
+ x -> x
+ x -> x
+
+-- | A version lifted to operate over full trees.
+normalizeFT :: FullTree StandardDecor -> FullTree StandardDecor
+normalizeFT (FullTree nm labs tr) = FullTree nm labs (normalize tr)
+
+-- | This is it, here's the routine that transforms a tree into normal form.
+-- This relies HEAVILY on lazy evaluation.
+normalize :: AnnotatedTree -> AnnotatedTree
+normalize tree = snd$ loop tree Nothing
+ where
+
+ add_context dec Nothing = dec
+ add_context dec (Just c) = add_weight dec c
+
+ -- loop: Walk over the tree, turning it inside-out in the process.
+ -- Inputs:
+ -- 1. node: the NewickTree node to process ("us")
+ -- 2. context: all nodes connected through the parent, "flipped" as though *we* were root
+ -- The "flipped" part has ALREADY been normalized.
+ -- Outputs:
+ -- 1. new node
+ -- 2. the best candidate root anywhere under this subtree
+ loop :: AnnotatedTree -> Maybe (AnnotatedTree) -> (AnnotatedTree, AnnotatedTree)
+ loop node ctxt = case node of
+ NTLeaf (StandardDecor _ _ w sorted) _name ->
+ (node,
+ -- If the leaf becomes the root... we could introduce another node:
+ NTInterior (add_context (StandardDecor 0 Nothing w sorted) ctxt) $
+ (verify_sorted "1" id$ maybeInsert compare_childtrees ctxt [node])
+
+ -- It may be reasonable not to support leaves becoming the root; that changes the number of nodes!
+ --error "normalize: leaf becoming root not currently supported."
+ )
+
+ NTInterior dec ls ->
+ let
+ -- If this node becomes the root, the parent becomes one of our children:
+ inverted = NTInterior inverted_dec inverted_children
+ inverted_dec = add_context dec ctxt
+ inverted_children = verify_sorted "2" id$ maybeInsert compare_childtrees ctxt newchildren
+
+ newchildren = --trace ("SORTED "++ show (map (get_label_list . fst) sorted)) $
+ map fst sorted
+ sorted = sortBy (compare_childtrees `on` fst) possibs
+
+ possibs =
+ flip map ls $ \ child ->
+ let
+
+ -- Will this diverge??? Probably depends on how equality (for delete) is defined...
+
+ -- Reconstruct the current node missing one child (because it became a parent):
+ -- Update its metadata appropriately:
+ newinverted = NTInterior (subtract_weight inverted_dec child)
+ (verify_sorted "3" id$ delete newnode inverted_children)
+ (newnode, _) = result
+
+ result = loop child (Just newinverted)
+ in
+ result
+
+ -- Either us or a candidate suggested by one of the children:
+ rootcandidates = inverted : map snd sorted
+
+ -- Who wins? The "most balanced". Minimize max subtree weight.
+ -- The compare operator is NOT allowed to return EQ here. Therefore there will be a unique minimum.
+ winner = --trace ("Candidates: \n"++ show (nest 6$ vcat (map pPrint (zip (map max_subtree_weight rootcandidates) rootcandidates )))) $
+ minimumBy cmpr_subtree_weight rootcandidates
+
+ max_subtree_weight = maximum . map get_weight . get_children
+ fat_id = map get_label_list . get_children
+
+ cmpr_subtree_weight tr1 tr2 =
+ case max_subtree_weight tr1 `compare` max_subtree_weight tr2 of
+ EQ -> -- As a fallback we compare the alphabetic order of the "bignames" of the children:
+ case fat_id tr1 `compare` fat_id tr2 of
+ EQ -> error$ "\nInternal invariant broken. These two were equally good roots:\n"
+ ++ show (pPrint tr1) ++ "\n" ++ show (pPrint tr2)
+ x -> x
+ x -> x
+
+ in (NTInterior dec newchildren, winner)
+
+
+-- Verify that our invariants are met:
+verify_sorted :: (Show a, Pretty a) => String -> (a -> AnnotatedTree) -> [a] -> [a]
+verify_sorted msg =
+ if debug
+ then \ project nodes ->
+ let weights = map (get_weight . project) nodes in
+ if sort weights == weights
+ then nodes
+-- else error$ "Child list failed verification: "++ show (pPrint nodes)
+ else error$ msg ++ ": Child list failed verification, not sorted: "++ show (weights)
+ ++"\n "++ show (sep $ map pPrint nodes) ++
+ "\n\nFull output:\n " ++ (concat$ intersperse "\n " $ map show nodes)
+ else \ _ nodes -> nodes
+
+
+-- TODO: Salvage any of these tests that are worthwhile and get them into the unit tests:
+tt :: AnnotatedTree
+tt = normalize $ annotateWLabLists $ head$ snd$ parseNewick M.empty id "" "(A,(C,D,E),B);"
+
+norm4 :: FullTree StandardDecor
+norm4 = norm "((C,D,E),B,A);"
+
+norm5 :: AnnotatedTree
+norm5 = normalize$ annotateWLabLists$ head$ snd$ parseNewick M.empty id "" "(D,E,C,(B,A));"
+
+
+----------------------------------------------------------------------------------------------------
+-- Equivalence classes on Trees:
+----------------------------------------------------------------------------------------------------
+
+-- | When binning, the members of a OneCluster are isomorphic trees. When clustering
+-- based on robinson-foulds distance they are merely similar trees.
+newtype OneCluster a = OneCluster { clustMembers :: [FullTree a] }
+ deriving Show
+
+-- | Ignore metadata (but keep weights) for the purpose of binning
+type StrippedTree = NewickTree Int
+
+-- | Index the results of binning by topology-only stripped trees
+-- that have their decorations removed.
+type BinResults a = M.Map StrippedTree (OneCluster a)
+
+-- | The binning function.
+-- Takes labeled trees and classifies them into equivalence classes (bins).
+binthem :: [FullTree DefDecor] -> BinResults StandardDecor
+binthem ls = binthem_normed normalized
+ where
+ normalized = map (\ (FullTree n lab tree) ->
+ FullTree n lab (normalize $ annotateWLabLists tree)) ls
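+
+-- An illustrative sketch (mirroring the unit test at the bottom of this module):
+-- three differently written but topologically identical trees land in one bin:
+--
+-- >>> let (_, trs) = parseNewicks id [("one","(A,(C,D,E),B);"),("two","((C,D,E),B,A);"),("three","(D,E,C,(B,A));")]
+-- >>> M.size (binthem trs)
+-- 1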
+
+-- | This version accepts trees that are already normalized:
+binthem_normed :: [FullTree StandardDecor] -> BinResults StandardDecor
+binthem_normed normalized =
+-- foldl (\ acc (lab,tree) -> M.insertWith update tree (OneCluster{ members=[lab] }) acc)
+ foldl (\ acc ft@(FullTree treename _ tree) ->
+ M.insertWith update (anonymize_annotated tree) (OneCluster [ft]) acc)
+ M.empty normalized
+ --(map (mapSnd$ fmap (const ())) normalized) -- still need to STRIP them
+ where
+-- update new old = OneCluster{ members= (members new ++ members old) }
+ update (OneCluster new) (OneCluster old) = OneCluster (new++old)
+ --strip = fmap (const ())
+
+-- | For binning. Remove branch lengths and labels but leave weights.
+anonymize_annotated :: AnnotatedTree -> StrippedTree
+anonymize_annotated = fmap (\ (StandardDecor bl bs w labs) -> w)
+
+
+----------------------------------------------------------------------------------------------------
+-- Other tools and algorithms.
+----------------------------------------------------------------------------------------------------
+
+-- Extract all edges connected to a particular node in every tree. Return branch lengths.
+all_edge_weights lab trees =
+ concat$ map (loop []) trees
+ where
+ loop acc (NTLeaf len name) | lab == name = len:acc
+ loop acc (NTLeaf _ _) = acc
+ loop acc (NTInterior _ ls) = foldl loop acc ls
+
+
+----------------------------------------------------------------------------------------------------
+-- Bitvector based normalization.
+----------------------------------------------------------------------------------------------------
+
+-- TODO: This approach is probably faster. Give it a try.
+
+{-
+int NumberOfSetBits(int i)
+{
+ i = i - ((i >> 1) & 0x55555555);
+ i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+ return ((i + (i >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
+}
+
+int __builtin_popcount (unsigned int x);
+-}
+
+
+----------------------------------------------------------------------------------------------------
+
+
+
+{-
+ ----------------------------------------
+ PARSING TIMING TEST:
+ ----------------------------------------
+
+ Compiling this with GHC 6.12 on my laptop -O2...
+ It takes 0.043s startup to parse ten files.
+ And 0.316 seconds to parse 2648, so we can say that's almost all time spent parsing/building/traversing.
+ (All nodes summed to 14966)
+ (The tested version uses Strings for labels... not Atoms)
+
+ Comparing against the original mzscheme version (with Racket 5.0)
+ with default optimization (there's no obvious -O2), well the
+ generated .exe has a ~0.5 second startup time overhead...
+ 0.881 seconds total to do the parsing, or about 380ms just for parsing.
+ But that doesn't do the counting!
+ Ok, this mzscheme version is in a messed up state at this point, but hacking
+ it to do a count (and it gets a different one: 12319), I get 0.882 seconds real time,
+ which is negligibly more.
+
+ If anything parsec should be at a disadvantage because of the lack of
+ a preprocessing phase to generate the FSM...
+
+ Btw, switching node labels over to Atoms made no difference. (But
+ didn't slow down at least.) We wouldn't expect this to save anything
+ on the construction side... parsec still allocates/builds the strings
+ before we intern them.
+
+ -}
+
+
+
+
+----------------------------------------------------------------------------------------------------
+-- General helper/utility functions:
+----------------------------------------------------------------------------------------------------
+
+merge :: Ord a => [a] -> [a] -> [a]
+merge [] ls = ls
+merge ls [] = ls
+merge l@(a:b) r@(x:y) =
+ if a < x
+ then a : merge b r
+ else x : merge y l
+
+-- Set subtraction for sorted lists:
+demerge :: (Ord a, Show a) => [a] -> [a] -> [a]
+demerge ls [] = ls
+demerge [] ls = error$ "demerge: first list did not contain all of second, remaining: " ++ show ls
+demerge (a:b) r@(x:y) =
+ case a `compare` x of
+ EQ -> demerge b y
+ LT -> a : demerge b r
+ GT -> error$ "demerge: element was missing from first list: "++ show x
+
+-- maybeCons :: Maybe a -> [a] -> [a]
+-- maybeCons Nothing ls = ls
+-- maybeCons (Just x) ls = x : ls
+
+maybeInsert :: (a -> a -> Ordering) -> Maybe a -> [a] -> [a]
+maybeInsert _ Nothing ls = ls
+maybeInsert fn (Just x) ls = insertBy fn x ls
+
+-- | Add the metadata that is used for binning
+annotateWLabLists :: NewickTree DefDecor -> AnnotatedTree
+annotateWLabLists tr = case tr of
+ NTLeaf (bs,bl) n -> NTLeaf (StandardDecor bl bs 1 [n]) n
+ NTInterior (bs,bl) [] -> NTInterior (StandardDecor bl bs 0 []) []
+ -- NTInterior (bs,bl) [] -> error "annotateWLabLists: internal invariant broken. Shouldn't have NTInterior with null children."
+ NTInterior (bs,bl) ls ->
+ let children = map annotateWLabLists ls in
+ NTInterior (StandardDecor bl bs
+ (sum $ map (subtreeWeight . get_dec) children)
+ (foldl1 merge $ map (sortedLabels . get_dec) children))
+ children
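+
+-- A small illustration (not part of the original module; mirrors the
+-- "annotateWLabLists" unit test below). The root's cached decoration records the
+-- leaf count and the sorted label list:
+--
+-- >>> let (tbl,[t]) = parseNewick M.empty id "" "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"
+-- >>> let dec = get_dec (annotateWLabLists t)
+-- >>> (subtreeWeight dec, map (tbl M.!) (sortedLabels dec))
+-- (4,["A","B","C","D"])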
+
+----------------------------------------------------------------------------------------------------
+-- Simple Helper Functions
+----------------------------------------------------------------------------------------------------
+
+-- | Take the extra annotations away. Inverse of `annotateWLabLists`.
+deAnnotate :: FullTree StandardDecor -> FullTree DefDecor
+deAnnotate (FullTree a b tr) = FullTree a b (fmap (\ (StandardDecor bl bs _ _) -> (bs,bl)) tr)
+
+-- Number of LEAVES contained in subtree:
+get_weight :: AnnotatedTree -> Int
+get_weight = subtreeWeight . get_dec
+
+-- Sorted list of leaf labels contained in subtree
+get_label_list :: AnnotatedTree -> [Label]
+get_label_list = sortedLabels . get_dec
+
+add_weight :: StandardDecor -> AnnotatedTree -> StandardDecor
+add_weight (StandardDecor l1 bs1 w1 sorted1) node =
+ let (StandardDecor _ bs2 w2 sorted2) = get_dec node in
+ StandardDecor l1 ((+) <$> bs1 <*> bs2) (w1+w2) (merge sorted1 sorted2)
+
+-- Remove the influence of one subtree from the metadata of another.
+subtract_weight :: StandardDecor -> AnnotatedTree -> StandardDecor
+subtract_weight (StandardDecor l1 bs1 w1 sorted1) node =
+ let (StandardDecor _ bs2 w2 sorted2) = get_dec node in
+ StandardDecor l1 ((-) <$> bs1 <*> bs2) (w1-w2) (demerge sorted1 sorted2)
+
+
+----------------------------------------------------------------------------------------------------
+-- UNIT TESTING
+----------------------------------------------------------------------------------------------------
+
+tre1 :: (LabelTable, NewickTree DefDecor)
+tre1 = (x,head y)
+ where
+ (x,y) = parseNewick M.empty id "" "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"
+
+tre1draw :: IO (Chan (), FullTree StandardDecor)
+tre1draw = viewNewickTree "tre1"$ (FullTree "" (fst tre1) (annotateWLabLists (snd tre1)))
+
+tre1dot :: IO ()
+tre1dot = print $ dotNewickTree "" 1.0 $ (FullTree "" (fst tre1) (annotateWLabLists$ snd tre1))
+
+norm :: String -> FullTree StandardDecor
+norm = norm2 . B.pack
+
+norm2 :: B.ByteString -> FullTree StandardDecor
+norm2 bstr = FullTree "" tbl (normalize $ annotateWLabLists$ head tr)
+ where
+ (tbl,tr) = parseNewick M.empty id "test" bstr
+
+unitTests :: Test
+unitTests =
+ let ntl s = NTLeaf (Nothing,0.0) s
+ (tbl,tre1_) = tre1
+ in
+ test
+ [
+ "merge" ~: [1,2,3,4,5,6] ~=? merge [1,3,5] [2,4,6::Int]
+ , "demerge" ~: [2,4,6] ~=? demerge [1,2,3,4,5,6] [1,3,5::Int]
+ , "demerge" ~: [1,3,5] ~=? demerge [1,2,3,4,5,6] [2,4,6::Int]
+ , "annotateWLabLists" ~:
+ ["A","B","C","D"] -- ORD on atoms is expensive... it must use the whole string.
+ ~=? map (tbl M.!)
+ (sortedLabels (get_dec (annotateWLabLists tre1_)))
+
+ -- Make sure that all of these normalize to the same thing.
+ , "normalize1" ~: "(C, D, E, (A, B))" ~=? show (displayDefaultTree $ deAnnotate$ norm "(A,(C,D,E),B);")
+ , "normalize2A" ~: "(C, D, E, (A, B))" ~=? show (pPrint$ norm "((C,D,E),B,A);")
+ , "normalize2B" ~: "(C, D, E, (A, B))" ~=? show (pPrint$ norm "(D,E,C,(B,A));")
+
+ -- Here's an example from the rhizobia dataset that caused my branch-sorting to fail.
+ , "normalize3" ~: "(((BB, BJ)), (MB, ML), (RE, (SD, SM)))"
+ ~=? show (pPrint$ norm2 (B.pack "(((ML,MB),(RE,(SD,SM))),(BB,BJ));"))
+
+-- "((BB: 2.691831, BJ: 1.179707): 0.000000, ((ML: 0.952401, MB: 1.020319): 0.000000, (RE: 2.031345, (SD: 0.180786, SM: 0.059988): 0.861187): 0.717913): 0.000000);"
+
+ , "phbin: these 3 trees should fall in the same category" ~:
+ 1 ~=? (length $ M.toList $
+ binthem $ snd $
+ parseNewicks id [("one", "(A,(C,D,E),B);"),
+ ("two", "((C,D,E),B,A);"),
+ ("three","(D,E,C,(B,A));")]
+ -- [("one", snd$parseNewick M.empty id "" "(A,(C,D,E),B);"),
+ -- ("two", snd$parseNewick M.empty id "" "((C,D,E),B,A);"),
+ -- ("three", snd$parseNewick M.empty id "" "(D,E,C,(B,A));")]
+ )
+
+ , "dotConversion" ~: True ~=? 100 < length (show $ dotNewickTree "" 1.0$ norm "(D,E,C,(B,A));") -- 444
+ ]
+
+
+--------------------------------------------------------------------------------
diff --git a/Bio/Phylogeny/PhyBin/CoreTypes.hs b/Bio/Phylogeny/PhyBin/CoreTypes.hs
new file mode 100644
index 0000000..8c4d081
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/CoreTypes.hs
@@ -0,0 +1,359 @@
+{-# LANGUAGE NamedFieldPuns, BangPatterns #-}
+{-# LANGUAGE OverloadedStrings, TypeSynonymInstances, FlexibleInstances #-}
+{-# LANGUAGE CPP #-}
+{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
+{-# OPTIONS_GHC -fwarn-unused-imports #-}
+
+module Bio.Phylogeny.PhyBin.CoreTypes
+ (
+ -- * Tree and tree decoration types
+ NewickTree(..),
+ DefDecor, StandardDecor(..), AnnotatedTree, FullTree(..),
+ ClustMode(..), TreeName, NumTaxa(..),
+
+ -- * Tree operations
+ displayDefaultTree, displayStrippedTree,
+ treeSize, numLeaves, liftFT,
+ get_dec, set_dec, get_children,
+ map_labels, all_labels, foldIsomorphicTrees,
+
+ -- * Utilities specific to StandardDecor:
+ avg_branchlen, get_bootstraps,
+
+ -- * Command line config options
+ PhyBinConfig(..), default_phybin_config,
+ WhichRFMode(..),
+
+ -- * General helpers
+ Label, LabelTable,
+
+ -- * Experimenting with abstracting decoration operations
+ HasBranchLen(..)
+ )
+ where
+
+import qualified Data.Map as M
+import qualified Data.Set as S
+import Data.Foldable (Foldable(..))
+import Data.Maybe (maybeToList)
+import Data.Monoid (mappend, mconcat)
+import Text.PrettyPrint.HughesPJClass hiding (char, Style)
+
+import qualified Data.Clustering.Hierarchical as C
+
+#define NO_ATOMS
+#ifndef NO_ATOMS
+import StringTable.Atom
+#endif
+
+----------------------------------------------------------------------------------------------------
+-- Type definitions
+----------------------------------------------------------------------------------------------------
+
+type BranchLen = Double
+
+-- | Even though the Newick format allows it, here we ignore interior node
+-- labels. (They are not commonly used.)
+--
+-- Note that these trees are rooted. The `normalize` function ensures that a
+-- single, canonical rooted representation is chosen.
+data NewickTree a =
+ NTLeaf a {-# UNPACK #-} !Label
+ | NTInterior a [NewickTree a]
+ deriving (Show, Eq, Ord)
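+
+-- For example (illustrative), the tree (A,B,(C,D)) with unit decorations and
+-- integer leaf labels 0..3 would be represented as:
+--
+-- > NTInterior () [NTLeaf () 0, NTLeaf () 1, NTInterior () [NTLeaf () 2, NTLeaf () 3]]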
+
+-- TODO: Ordering maybe shouldn't need to touch the metadata. At least on the fast
+-- path.
+
+{-
+-- [2010.09.22] Disabling:
+instance NFData Atom where
+ rnf a = rnf (fromAtom a :: Int)
+
+instance NFData a => NFData (NewickTree a) where
+ rnf (NTLeaf l n) = rnf (l,n)
+ rnf (NTInterior l ls) = rnf (l,ls)
+-}
+
+instance Functor NewickTree where
+ fmap fn (NTLeaf dec x) = NTLeaf (fn dec) x
+ fmap fn (NTInterior dec ls) = NTInterior (fn dec) (map (fmap fn) ls)
+
+instance Foldable NewickTree where
+ foldMap f (NTLeaf dec x) = f dec
+ foldMap f (NTInterior dec ls) = mappend (f dec) $
+ mconcat (map (foldMap f) ls)
+
+instance Foldable FullTree where
+ foldMap f (FullTree _ _ tr) = foldMap f tr
+
+instance Functor FullTree where
+ fmap f (FullTree n l tr) = FullTree n l $ fmap f tr
+
+instance Pretty dec => Pretty (NewickTree dec) where
+ -- I'm using displayDefaultTree for the "prettiest" printing and
+ -- replacing pPrint with a whitespace-improved version of show:
+ pPrint (NTLeaf dec name) = "NTLeaf" <+> pPrint dec <+> text (show name)
+ pPrint (NTInterior dec ls) = "NTInterior" <+> pPrint dec <+> pPrint ls
+
+instance Pretty a => Pretty (FullTree a) where
+ pPrint (FullTree name mp tr) =
+ "FullTree " <+> text name <+> loop tr
+ where
+ loop (NTLeaf dec ind) = "NTLeaf" <+> pPrint dec <+> text (mp M.! ind)
+ loop (NTInterior dec ls) = "NTInterior" <+> pPrint dec <+> pPrint ls
+
+instance (Pretty k, Pretty v) => Pretty (M.Map k v) where
+ pPrint mp = pPrint (M.toList mp)
+
+-- | Display a tree WITH the bootstrap and branch lengths.
+-- This prints in NEWICK format.
+displayDefaultTree :: FullTree DefDecor -> Doc
+displayDefaultTree orig = loop tr <> ";"
+ where
+ (FullTree _ mp tr) = orig -- normalize orig
+ loop (NTLeaf (Nothing,_) name) = text (mp M.! name)
+ loop (NTLeaf _ _) = error "WEIRD -- why did a leaf node have a bootstrap value?"
+ loop (NTInterior (bootstrap,_) ls) =
+ case bootstrap of
+ Nothing -> base
+ Just val -> base <> text ":[" <> text (show val) <> text "]"
+ where base = parens$ sep$ map_but_last (<>text",") $ map loop ls
+
+-- | The same, except with no bootstrap or branch lengths. Any tree annotations
+-- are ignored.
+displayStrippedTree :: FullTree a -> Doc
+displayStrippedTree orig = loop tr <> ";"
+ where
+ (FullTree _ mp tr) = orig -- normalize orig
+ loop (NTLeaf _ name) = text (mp M.! name)
+ loop (NTInterior _ ls) = parens$ sep$ map_but_last (<>text",") $ map loop ls
+
+----------------------------------------------------------------------------------------------------
+-- Labels
+----------------------------------------------------------------------------------------------------
+
+-- | Labels are inexpensive unique integers. The table is necessary for converting them back.
+type Label = Int
+
+-- | Map labels back onto meaningful names.
+type LabelTable = M.Map Label String
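+
+-- For instance (illustrative), a two-taxon table mapping labels back to names:
+--
+-- > M.fromList [(0,"A"), (1,"B")] :: LabelTable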
+
+----------------------------------------------------------------------------------------------------
+-- Tree metadata (decorators)
+----------------------------------------------------------------------------------------------------
+
+
+-- | The barebones default decorator for NewickTrees contains BOOTSTRAP and
+-- BRANCHLENGTH. The bootstrap values, if present, will range in [0..100]
+type DefDecor = (Maybe Int, BranchLen)
+
+-- | Additionally includes some scratch data that is used by the binning algorithm.
+type AnnotatedTree = NewickTree StandardDecor
+
+-- | The standard decoration includes everything in `DefDecor` plus
+-- some extra cached data:
+--
+-- (1) branch length from parent to "this" node
+-- (2) bootstrap values for the node
+--
+-- (3) subtree weights for future use
+-- (defined as number of LEAVES, not counting intermediate nodes)
+-- (4) sorted lists of labels for symmetry breaking
+data StandardDecor = StandardDecor {
+ branchLen :: BranchLen,
+ bootStrap :: Maybe Int,
+
+ -- The rest of these are used by the computations below. These are
+ -- cached (memoized) values that could be recomputed:
+ ----------------------------------------
+ subtreeWeight :: Int,
+ sortedLabels :: [Label]
+ }
+ deriving (Show,Read,Eq,Ord)
+
+class HasBranchLen a where
+ getBranchLen :: a -> BranchLen
+
+instance HasBranchLen StandardDecor where
+ getBranchLen = branchLen
+
+-- This one is kind of sloppy:
+instance HasBranchLen DefDecor where
+ getBranchLen = snd
+
+-- | A common type of tree contains the standard decorator and also a table for
+-- restoring the human-readable node names.
+data FullTree a =
+ FullTree { treename :: TreeName
+ , labelTable :: LabelTable
+ , nwtree :: NewickTree a
+ }
+ deriving (Show, Ord, Eq)
+
+liftFT :: (NewickTree t -> NewickTree a) -> FullTree t -> FullTree a
+liftFT fn (FullTree nm labs x) = FullTree nm labs (fn x)
+
+type TreeName = String
+
+instance Pretty StandardDecor where
+ pPrint (StandardDecor bl bs wt ls) = parens$
+ "StandardDecor" <+> hsep [pPrint bl, pPrint bs
+-- , pPrint wt, pPrint ls
+ ]
+
+
+----------------------------------------------------------------------------------------------------
+-- * Configuring and running the command line tool.
+----------------------------------------------------------------------------------------------------
+
+-- | Due to the number of configuration options for the driver, we pack them into a record.
+data PhyBinConfig =
+ PBC { verbose :: Bool
+ , num_taxa :: NumTaxa
+ , name_hack :: String -> String
+ , output_dir :: String
+ , inputs :: [String]
+ , do_graph :: Bool
+ , do_draw :: Bool
+ , clust_mode :: ClustMode
+ , highlights :: [FilePath] -- [FullTree ()]
+ , show_trees_in_dendro :: Bool
+ , show_interior_consensus :: Bool
+ , rfmode :: WhichRFMode
+ , preprune_labels :: Maybe [String]
+ , print_rfmatrix :: Bool
+ , dist_thresh :: Maybe Int
+ , branch_collapse_thresh :: Maybe Double -- ^ Branches less than this length are collapsed.
+ , bootstrap_collapse_thresh :: Maybe Int
+ -- ^ BootStrap values less than this result in the intermediate node being collapsed.
+ }
+
+-- | Supported modes for computing RFDistance.
+data WhichRFMode = HashRF | TolerantNaive
+ deriving (Show, Eq, Ord)
+
+-- | How many taxa should we expect in the incoming dataset?
+data NumTaxa = Expected Int -- ^ Supplied by the user. Committed.
+ | Unknown -- ^ In the future we may automatically pick a behavior. Now this one is usually an error.
+ | Variable -- ^ Explicitly ignore this setting in favor of comparing all trees
+ -- (even if some are missing taxa). This only works with certain modes.
+ deriving (Show, Read, Eq)
+
+-- | The default phybin configuration.
+default_phybin_config :: PhyBinConfig
+default_phybin_config =
+ PBC { verbose = False
+-- , num_taxa = error "must be able to determine the number of taxa expected in the dataset. (Supply it manually with -n.)"
+ , num_taxa = Unknown
+ , name_hack = id -- Default, no transformation of leaf-labels
+ , output_dir = "./phybin_out/"
+ , inputs = []
+ , do_graph = False
+ , do_draw = False
+-- , clust_mode = BinThem
+ , clust_mode = ClusterThem C.UPGMA
+ , rfmode = HashRF
+ , preprune_labels = Nothing
+ , highlights = []
+ , show_trees_in_dendro = False
+ , show_interior_consensus = False
+ , print_rfmatrix = False
+ , dist_thresh = Nothing
+ , branch_collapse_thresh = Nothing
+ , bootstrap_collapse_thresh = Nothing
+ }
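+
+-- A minimal usage sketch (illustrative; `driver` lives in Bio.Phylogeny.PhyBin,
+-- and the input path and values below are hypothetical):
+--
+-- > driver default_phybin_config { inputs      = ["trees/"]
+-- >                              , num_taxa    = Expected 7
+-- >                              , dist_thresh = Just 1 }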
+
+data ClustMode = BinThem | ClusterThem { linkage :: C.Linkage }
+
+----------------------------------------------------------------------------------------------------
+-- * Simple utility functions for the core types:
+----------------------------------------------------------------------------------------------------
+
+-- | How many nodes (leaves and interior) are contained in a NewickTree?
+treeSize :: NewickTree a -> Int
+treeSize (NTLeaf _ _) = 1
+treeSize (NTInterior _ ls) = 1 + sum (map treeSize ls)
+
+-- | This counts only leaf nodes, which should include all taxa.
+numLeaves :: NewickTree a -> Int
+numLeaves (NTLeaf _ _) = 1
+numLeaves (NTInterior _ ls) = sum (map numLeaves ls)
+
+
+map_but_last :: (a -> a) -> [a] -> [a]
+map_but_last _ [] = []
+map_but_last _ [h] = [h]
+map_but_last fn (h:t) = fn h : map_but_last fn t
+
+
+
+get_dec :: NewickTree t -> t
+get_dec (NTLeaf dec _) = dec
+get_dec (NTInterior dec _) = dec
+
+-- Set all the decorations to a constant:
+set_dec :: b -> NewickTree a -> NewickTree b
+set_dec d = fmap (const d)
+--set_dec d (NTLeaf _ x) = NTLeaf d x
+--set_dec d (NTInterior _ ls) = NTInterior d $ map (set_dec d) ls
+
+get_children :: NewickTree t -> [NewickTree t]
+get_children (NTLeaf _ _) = []
+get_children (NTInterior _ ls) = ls
+
+
+-- | Average branch length across all branches in all trees.
+avg_branchlen :: HasBranchLen a => [NewickTree a] -> Double
+avg_branchlen origls = fst total / snd total
+ where
+ total = sum_ls $ map sum_tree origls
+ sum_ls ls = (sum$ map fst ls, sum$ map snd ls)
+ sum_tree (NTLeaf dec _) | getBranchLen dec == 0 = (0,0)
+ | otherwise = (abs (getBranchLen dec),1)
+ sum_tree (NTInterior dec ls) =
+ let branchLen = getBranchLen dec
+ (x,y) = sum_ls$ map sum_tree ls in
+ if branchLen == 0 then (x, y) else ((abs branchLen) + x, 1+y)
+
+-- | Retrieve all the bootstrap values actually present in a tree.
+get_bootstraps :: NewickTree StandardDecor -> [Int]
+get_bootstraps (NTLeaf (StandardDecor{bootStrap}) _) = maybeToList bootStrap
+get_bootstraps (NTInterior (StandardDecor{bootStrap}) ls) =
+ maybeToList bootStrap ++ concatMap get_bootstraps ls
+
+-- | Apply a function to all the *labels* (leaf names) in a tree.
+map_labels :: (Label -> Label) -> NewickTree a -> NewickTree a
+map_labels fn (NTLeaf dec lbl) = NTLeaf dec $ fn lbl
+map_labels fn (NTInterior dec ls) = NTInterior dec$ map (map_labels fn) ls
+
+-- | Return all the labels contained in the tree.
+all_labels :: NewickTree t -> [Label]
+all_labels (NTLeaf _ lbl) = [lbl]
+all_labels (NTInterior _ ls) = concat$ map all_labels ls
+
+
+
+-- | This function allows one to collapse multiple trees while looking
+-- only at the "horizontal slice" of all the annotations *at a given
+-- position* in the tree.
+--
+-- "Isomorphic" must apply both to the shape and the name labels or it
+-- is an error to apply this function.
+foldIsomorphicTrees :: ([a] -> b) -> [NewickTree a] -> NewickTree b
+foldIsomorphicTrees _ [] = error "foldIsomorphicTrees: empty list of input trees"
+foldIsomorphicTrees fn ls@(hd:_) = fmap fn horiztrees
+ where
+ -- Preserve the input order:
+ horiztrees = Prelude.foldr consTrees (fmap (const []) hd) ls
+ -- We use the tree datatype itself as the intermediate data
+ -- structure. This is VERY allocation-expensive; it would be
+ -- possible to trade compute for allocation here:
+ consTrees a b = case (a,b) of
+ (NTLeaf dec nm1, NTLeaf decls nm2) | nm1 /= nm2 -> error$"foldIsomorphicTrees: mismatched names: "++show (nm1,nm2)
+ | otherwise ->
+ NTLeaf (dec : decls) nm1
+ (NTInterior dec ls1, NTInterior decls ls2) ->
+ NTInterior (dec:decls) $ zipWith consTrees ls1 ls2
+ _ -> error "foldIsomorphicTrees: difference in tree shapes"
+
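+-- A tiny illustration (not part of the original module): summing the decorations
+-- position-by-position across two identically shaped trees:
+--
+-- >>> foldIsomorphicTrees sum [ NTInterior 1 [NTLeaf 2 0, NTLeaf 3 1]
+-- ...                         , NTInterior 4 [NTLeaf 5 0, NTLeaf 6 1] ]
+-- NTInterior 5 [NTLeaf 7 0,NTLeaf 9 1]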
diff --git a/Bio/Phylogeny/PhyBin/Parser.hs b/Bio/Phylogeny/PhyBin/Parser.hs
new file mode 100644
index 0000000..a6ad401
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/Parser.hs
@@ -0,0 +1,252 @@
+{-# LANGUAGE ScopedTypeVariables, BangPatterns, ParallelListComp #-}
+{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
+{-# OPTIONS_GHC -fwarn-unused-imports #-}
+
+module Bio.Phylogeny.PhyBin.Parser
+ (newick_parser, parseNewick, parseNewicks, parseNewickFiles, unitTests)
+ where
+import Control.Exception (evaluate, handle, SomeException)
+import qualified Data.ByteString.Char8 as B
+import Data.Char (isSpace)
+import Data.Map as M
+import Data.List as L
+import Text.Parsec
+import Text.Parsec.ByteString
+import Test.HUnit ((~:),(~=?),Test,test,assertFailure)
+import System.FilePath (takeBaseName)
+
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Prelude as P
+
+type NameHack = (String->String)
+
+-- | Parse a bytestring into NewickTrees with branch lengths. The
+-- String argument is the name of the file from which the data came; it is
+-- used only for better error messages.
+--
+-- If the single bytestring contains more than one tree, then a number is appended to
+-- the tree names.
+parseNewick :: LabelTable -> NameHack -> String -> B.ByteString -> (LabelTable, [NewickTree DefDecor])
+parseNewick tbl0 name_hack file input = (lbls,trs)
+ where
+ (lbls,trs) =
+ extractLabelTable tbl0 $
+ runB file (many1$ newick_parser name_hack) $
+ B.filter (not . isSpace) input
+
+-- treeFiles <- acquireTreeFiles files
+ -- let fn f = do raw <- B.readFile f
+ -- let ls = map (`B.append` (B.pack ";")) $
+ -- B.splitWith (== ';') raw
+ -- return (map (f,) ls)
+ -- trees0 <- concat <$> mapM fn treeFiles
+ -- FIXME: no name_hack here:
+ -- let (lbls, trees) = parseNewicks id trees0
+ -- putStrLn$ "Read trees! "++show (length trees)
+ -- putStrLn$ "Taxa: "++show (pPrint lbls)
+ -- putStrLn$ "First tree: "++show (displayDefaultTree (head trees))
+
+-- | Parse a list of tree files, starting with an empty map of labels and accumulating a final map.
+parseNewickFiles :: NameHack -> [String] -> IO (LabelTable, [FullTree DefDecor])
+parseNewickFiles fn nms = do
+ -- WARNING: Lazy-IO here. We need to be careful about leaking file descriptors.
+ bss <- mapM B.readFile nms
+ return $ parseNewicks fn (zip nms bss)
+
+-- | A version which takes in-memory trees as ByteStrings.
+parseNewicks :: NameHack -> [(String,B.ByteString)] -> (LabelTable, [FullTree DefDecor])
+parseNewicks name_hack pairs = (labtbl, fullTrs)
+ where
+ fullTrs = [ FullTree (tweakName file ind) labtbl tr
+ | (file,tr) <- trs
+ | ind <- [(0::Int)..] ]
+ tweakName file ind = -- Here we do the renaming/numbering business:
+ if addSuffix
+ then (takeBaseName file)++"_"++show ind
+ else (takeBaseName file)
+ addSuffix = case trs of
+ [] -> False -- Should this be an error?
+ [_] -> False
+ _ -> True
+ (labtbl, trs) = P.foldr fn (M.empty,[]) pairs
+ fn (file,bstr) (!acc,!ls) =
+ let (acc',trs) = parseNewick acc name_hack file bstr
+ names = if length trs > 1
+ then zipWith (\x y -> x++"_"++show y) (repeat file) [0..]
+ else [file]
+ in (acc', (zip names trs) ++ ls)
+
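+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): parsing two trees held in a single in-memory ByteString under one
+-- shared label table.  The pseudo-file name "pair" is made up; because the
+-- input yields more than one tree, numeric suffixes are appended to the
+-- resulting tree names.
+_exampleParseNewicks :: (LabelTable, [FullTree DefDecor])
+_exampleParseNewicks =
+  parseNewicks id [ ("pair", B.pack "(A,B,(C,D)); (A,(B,C),D);") ]
+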
+runB :: Show a => String -> Parser a -> B.ByteString -> a
+runB file p input = case (parse p "" input) of
+ Left err -> error ("parse error in file "++ show file ++" at "++ show err)
+ Right x -> x
+
+extractLabelTable :: LabelTable -> [TempTree] -> (LabelTable, [NewickTree DefDecor])
+extractLabelTable tbl0 trs = (finMap,finTree)
+ where
+ flipped = M.fromList $ L.map (\(x,y)->(y,x)) $ M.toList tbl0
+ -- (_,finMap,finTree) = loop2 (S.fromList (M.elems tbl0)) tbl0 tr
+ (_,finMap,finTree) = loop1 flipped tbl0 trs
+
+ -- Mere plumbing:
+ loop1 seen acc [] = (seen, acc, [])
+ loop1 seen acc (hd:tl) =
+ let (seen2,acc2,hd') = loop2 seen acc hd
+ (seen3,acc3,tl') = loop1 seen2 acc2 tl in
+ (seen3,acc3, hd':tl')
+
+ loop2 seen acc (NTLeaf (d,Just nm) _)
+ | M.member nm seen = (seen, acc, NTLeaf d (seen M.! nm))
+ | otherwise = let nxt = M.size acc in
+ (M.insert nm nxt seen,
+ M.insert nxt nm acc, NTLeaf d nxt)
+ loop2 seen1 acc1 (NTInterior (d,Nothing) chlds) =
+ let (seen',acc',ls') =
+ P.foldr (\ x (seen2,acc2,ls) ->
+ let (seen3,acc3,x') = loop2 seen2 acc2 x in
+ (seen3, acc3, x':ls))
+ (seen1,acc1,[])
+ chlds
+ in (seen',acc', NTInterior d ls')
+
+----------------------------------------------------------------------------------------------------
+-- Newick file format parser definitions:
+----------------------------------------------------------------------------------------------------
+
+-- | Hack: we store the names in the leaves.
+type TempTree = NewickTree (DefDecor,Maybe String)
+
+-- | This is used to splice metadata into the data structure after the fact.
+tag :: DefDecor -> TempTree -> TempTree
+tag l s =
+ case s of
+ NTLeaf (_,m) n -> NTLeaf (l,m) n
+ NTInterior (_,Nothing) ls -> NTInterior (l,Nothing) ls
+
+-- | This parser ASSUMES that whitespace has been prefiltered from the input.
+newick_parser :: NameHack -> Parser TempTree
+newick_parser name_hack =
+ do x <- subtree name_hack
+ -- Get the top-level metadata:
+ l <- branchMetadat name_hack
+ _ <- char ';'
+ return$ tag l x
+
+subtree :: NameHack -> Parser TempTree
+subtree name_hack = internal name_hack <|> leaf name_hack
+
+defaultMeta :: (Maybe Int, Double)
+defaultMeta = (Nothing,0.0)
+
+leaf :: NameHack -> Parser TempTree
+leaf name_hack =
+ do nm <- name
+ let nm' = name_hack nm
+ return$ NTLeaf (defaultMeta,Just nm') 0
+
+internal :: NameHack -> Parser TempTree
+internal name_hack =
+ do _ <- char '('
+ bs <- branchset name_hack
+ _ <- char ')'
+ _nm <- name -- IGNORED, internal names.
+ return$ NTInterior (defaultMeta,Nothing) bs
+
+branchset :: NameHack -> Parser [TempTree]
+branchset name_hack =
+ do b <- branch name_hack <?> "at least one branch"
+ rest <- option [] $ try$ do char ','; branchset name_hack
+ return (b:rest)
+
+branch :: NameHack -> Parser TempTree
+branch name_hack =
+ do s <- subtree name_hack
+ l <- branchMetadat name_hack
+ return$ tag l s
+
+-- If the length is omitted, it is implicitly zero.
+branchMetadat :: NameHack -> Parser DefDecor
+branchMetadat name_hack = option defaultMeta $ do
+ char ':'
+ n <- (try sciNotation <|> number)
+ -- If parsing the branch length succeeds, we also try for the bracketed bootstrap value:
+ bootstrap <- option Nothing $ do
+ char '['
+ s <- many1 digit
+ char ']'
+ return (Just (read s))
+ return (bootstrap,n)
+
+-- | Parse a normal, decimal number.
+number :: Parser Double
+number =
+ do sign <- option "" $ string "-"
+ first <- many1 digit
+ second <- option "0" $ try$ do char '.'; many1 digit
+ return (read (sign ++ first++"."++second) :: Double)
+
+-- | Parse a number in scientific notation.
+sciNotation :: Parser Double
+sciNotation =
+ do coeff <- do first <- many1 digit
+ second <- option "0" $ try$ do char '.'; many1 digit
+ return $ first++"."++second
+ char 'e'
+ sign <- option "" $ string "-"
+ expon <- many1 digit
+ return (read (coeff++"e"++sign++expon))
+
+-- | Names are a mess; they sometimes seem to contain all kinds of garbage.
+-- Thus we are very permissive: we allow any character that we do not specifically need to reserve.
+name :: Parser String
+name = option "" $ many1 (noneOf "()[]:;, \t\n\r\f\v")
+-- name = option "" $ many1 (letter <|> digit <|> oneOf "_.-")
+
+
+--------------------------------------------------------------------------------
+-- Unit Tests
+--------------------------------------------------------------------------------
+
+tre1 :: NewickTree DefDecor
+tre1 = head $ snd $ parseNewick M.empty id "" $ B.pack "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"
+
+unitTests :: Test
+unitTests = test
+ [ "test name" ~: "foo" ~=? run name "foo"
+ , "test number" ~: 3.3 ~=? run number "3.3"
+ , "test number" ~: 3.0 ~=? run number "3"
+ , "test number" ~: -3.0 ~=? run number "-3"
+
+ , "leaf" ~: ntl "A" ~=? run (leaf id) "A"
+ , "subtree" ~: ntl "A" ~=? run (subtree id) "A"
+
+ -- These are not allowed:
+ , "null branchset" ~: errortest$ run (branchset id) ""
+
+ , "internal" ~: NTInterior nada [ntl "A"] ~=? run (internal id) "(A);"
+ , "example: no nodes are named" ~: NTInterior nada
+ [ntl "", ntl "", NTInterior nada [ntl "", ntl ""]]
+ ~=? run (newick_parser id) "(,,(,));"
+ , "example: leaf nodes are named" ~: 6 ~=? treeSize (run (newick_parser id) "(A,B,(C,D));")
+ , "example: all nodes are named" ~: 6 ~=? treeSize (run (newick_parser id) "(A,B,(C,D)E)F;")
+
+ , "example: all but root node have a distance to parent" ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5);")
+ , "example: all have a distance to parent" ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5):0.6;")
+ , "example: distances and leaf names (popular)" ~: 6 ~=? treeSize tre1
+ , "example: distances and all names" ~: 6 ~=? treeSize (run (newick_parser id) "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
+ , "example: a tree rooted on a leaf node (rare)" ~: 6 ~=? treeSize (run (newick_parser id) "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;")
+ ]
+ where
+ ntl s = NTLeaf ((Nothing,0.0),Just s) 0
+ nada = ((Nothing,0),Nothing)
+
+run :: Show a => Parser a -> String -> a
+run p input = runB "<unknown>" p (B.pack input)
+
+errortest :: t -> IO ()
+errortest x =
+ --() ~=?
+ handle (\ (e::SomeException) -> return ()) $
+ do evaluate x
+ assertFailure "test was expected to throw an error"
+
diff --git a/Bio/Phylogeny/PhyBin/PreProcessor.hs b/Bio/Phylogeny/PhyBin/PreProcessor.hs
new file mode 100644
index 0000000..dda3075
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/PreProcessor.hs
@@ -0,0 +1,95 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+
+-- | This is a preprocessor which can (optionally) be invoked to
+-- collapse short branch lengths.
+
+module Bio.Phylogeny.PhyBin.PreProcessor
+ ( collapseBranches,
+ collapseBranchLenThresh, collapseBranchBootStrapThresh,
+ pruneTreeLeaves
+ )
+ where
+
+import qualified Data.Set as S
+import Data.Maybe (catMaybes)
+import Bio.Phylogeny.PhyBin.CoreTypes
+
+
+-- | Prune the leaves of the tree to only those leaves in the provided set.
+--
+-- If ALL leaves are pruned (i.e. none of the tree's leaves occur in the set), this function returns Nothing.
+pruneTreeLeaves :: S.Set Label -> NewickTree DefDecor -> Maybe (NewickTree DefDecor)
+pruneTreeLeaves set tr = loop tr
+ where
+ loop orig@(NTLeaf _ lab)
+ | S.member lab set = Just orig
+ | otherwise = Nothing
+ loop (NTInterior dec@(_,blen) ls) =
+ case catMaybes $ map loop ls of
+ [] -> Nothing
+ -- Here we ELIMINATE the intermediate node, but we add in its branch length:
+ [one] -> Just$ fmap (\(bs,bl) -> (bs,blen + bl)) one
+ ls' -> Just (NTInterior dec ls')
+
+
+-- | Collapses branches whose metadata satisfies a predicate, leaving a shallower,
+-- "bushier" tree. This does NOT change the set of leaves (taxa); it only removes
+-- interior nodes.
+--
+-- `collapseBranches pred collapser tree` uses `pred` to test the meta-data to see
+-- if collapsing the intermediate node below the branch is necessary, and if it is,
+-- it uses `collapser` to reduce all the metadata for the collapsed branches into a
+-- single piece of metadata.
+collapseBranches :: forall a . (a -> Bool) -> (a -> a -> a) -> NewickTree a -> NewickTree a
+collapseBranches isCollapsable collapse origtr = final
+ where
+ (_,_, final) = loop origtr
+
+ -- This loop returns:
+ -- (1) a list of leaf "floaters" that can still move upwards,
+ -- (2) immovable subtrees that can't
+ -- (3) a final node IF the result is the root.
+ loop :: NewickTree a -> ([(a,Label)], [NewickTree a], NewickTree a)
+ loop lf@(NTLeaf dec lb) | isCollapsable dec = ([(dec,lb)], [], lf)
+ | otherwise = ([], [lf], lf)
+ loop (NTInterior dec children) =
+ let (floats, anchors, _) = unzip3 $ map loop children
+ thenode = NTInterior dec $ concat anchors ++
+ map (uncurry NTLeaf) (concat floats)
+ in
+ if isCollapsable dec then
+ -- If we are collapsable we keep floating upwards.
+ -- We combine our metadata (on the left) into the floatees:
+ (map (\ (a,l) -> (collapse dec a, l))
+ (concat floats),
+ concat anchors,
+ thenode)
+ else
+ -- Otherwise this is the end of the road for these floaters:
+ ([], [thenode], thenode)
+
+
+-- | A common configuration. Collapse branches based on a length threshold.
+collapseBranchLenThresh :: Double -> NewickTree DefDecor -> NewickTree DefDecor
+-- collapseBranchLenThresh :: HasBranchLen a => Double -> NewickTree a -> NewickTree a
+collapseBranchLenThresh thr tr =
+ collapseBranches ((< thr) . getBranchLen) collapser tr
+ where
+ -- We REMOVE bootstraps as part of this process; they are not well-defined after this point.
+ collapser _intermediate@(_bt1, len1) _floatee@(_bt2, len2) =
+ (Nothing, len1 + len2)
+
+-- | A common configuration. Collapse branches based on bootstrap values.
+collapseBranchBootStrapThresh :: Int -> NewickTree DefDecor -> NewickTree DefDecor
+-- collapseBranchLenThresh :: HasBranchLen a => Double -> NewickTree a -> NewickTree a
+collapseBranchBootStrapThresh thr tr =
+ collapseBranches ((< thr) . getBoot) collapser tr
+ where
+ getBoot (Nothing,_) = error$"collapseBranchBootStrapThresh: bootstrap value missing on tree node!\n"++
+ "They must be present if --minbootstrap is used."
+ getBoot (Just boot,_) = boot
+ -- This had better happen BEFORE branch-length based collapsing is done:
+ collapser (_,len1) (_,len2) = (Nothing, len1+len2)
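+
+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): collapse every branch shorter than 0.05 in a tiny hand-built
+-- tree.  DefDecor pairs an optional bootstrap value with a branch length,
+-- and leaf labels are Ints here.  The inner node sits on a 0.01-long branch,
+-- so it is collapsed, leaving a three-way multifurcation at the root.
+_collapseExample :: NewickTree DefDecor
+_collapseExample =
+  collapseBranchLenThresh 0.05 $
+    NTInterior (Nothing, 0.0)
+      [ NTLeaf (Nothing, 0.20) 0
+      , NTInterior (Nothing, 0.01)   -- shorter than the 0.05 threshold
+          [ NTLeaf (Nothing, 0.30) 1
+          , NTLeaf (Nothing, 0.40) 2 ]
+      ]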
+
+
+
diff --git a/Bio/Phylogeny/PhyBin/RFDistance.hs b/Bio/Phylogeny/PhyBin/RFDistance.hs
new file mode 100644
index 0000000..91e0767
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/RFDistance.hs
@@ -0,0 +1,444 @@
+{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns #-}
+
+module Bio.Phylogeny.PhyBin.RFDistance
+ (
+ -- * Types
+ DenseLabelSet, DistanceMatrix,
+
+ -- * Bipartition (Bip) utilities
+ allBips, foldBips, dispBip,
+ consensusTree, bipsToTree, filterCompatible, compatibleWith,
+
+ -- * ADT for dense sets
+ mkSingleDense, mkEmptyDense, bipSize,
+ denseUnions, denseDiff, invertDense, markLabel,
+
+ -- * Methods for computing distance matrices
+ naiveDistMatrix, hashRF,
+
+ -- * Output
+ printDistMat)
+ where
+
+import Control.Monad
+import Control.Monad.ST
+import Data.Function (on)
+import Data.Word
+import qualified Data.Vector as V
+import qualified Data.Vector.Mutable as MV
+import qualified Data.Vector.Unboxed.Mutable as MU
+import qualified Data.Vector.Unboxed as U
+import Text.PrettyPrint.HughesPJClass hiding (char, Style)
+import System.IO (hPutStrLn, hPutStr, Handle)
+import System.IO.Unsafe
+
+-- import Control.LVish
+-- import qualified Data.LVar.Set as IS
+-- import qualified Data.LVar.SLSet as SL
+
+-- import Data.LVar.Map as IM
+-- import Data.LVar.NatArray as NA
+
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Bio.Phylogeny.PhyBin.PreProcessor (pruneTreeLeaves)
+-- import Data.BitList
+import qualified Data.Set as S
+import qualified Data.List as L
+import qualified Data.IntSet as SI
+import qualified Data.Map.Strict as M
+import qualified Data.Foldable as F
+import qualified Data.Traversable as T
+import Data.Monoid
+import Prelude as P
+import Debug.Trace
+
+#ifdef BITVEC_BIPS
+import qualified Data.Vector.Unboxed.Bit as UB
+import qualified Data.Bit as B
+#endif
+
+-- I don't understand WHY, but I seem to get the same answers WITHOUT this.
+-- Normalization and symmetric difference do make things somewhat slower (e.g. 1.8
+-- seconds vs. 2.2 seconds for 150 taxa / 100 trees)
+#define NORMALIZATION
+-- define BITVEC_BIPS
+
+--------------------------------------------------------------------------------
+-- A data structure choice
+--------------------------------------------------------------------------------
+
+-- type DenseLabelSet s = BitList
+
+
+-- | Dense sets of taxa, aka Bipartitions or BiPs
+-- We assume that taxa labels have been mapped onto a dense, contiguous range of integers [0,N).
+--
+-- NORMALIZATION Rule: Bipartitions are really two disjoint sets. But as long as
+-- the parent set (the union of the partitions, aka "all taxa") is known, a bipartition
+-- can be represented by just *one* subset. Yet we must choose WHICH subset for
+-- consistency. We use the rule that we always choose the SMALLER. Thus the
+-- DenseLabelSet should always be half the size or less, compared to the total
+-- number of taxa.
+--
+-- A set that is more than a majority of the taxa can be normalized by "flipping",
+-- i.e. taking the taxa that are NOT in that set.
+#ifdef BITVEC_BIPS
+
+# if 1
+type DenseLabelSet = UB.Vector B.Bit
+markLabel lab = UB.modify (\vec -> MU.write vec lab (B.fromBool True))
+mkEmptyDense size = UB.replicate size (B.fromBool False)
+mkSingleDense size ind = markLabel ind (mkEmptyDense size)
+denseUnions = UB.unions
+bipSize = UB.countBits
+denseDiff = UB.difference
+invertDense size bip = UB.invert bip
+dispBip labs bip = show$ map (\(ix,_) -> (labs M.! ix)) $
+ filter (\(_,bit) -> B.toBool bit) $
+ zip [0..] (UB.toList bip)
+denseIsSubset a b = UB.or (UB.difference b a)
+traverseDense_ fn bip =
+ U.ifoldr' (\ ix bit acc ->
+ (if B.toBool bit
+ then fn ix
+ else return ()) >> acc)
+ (return ()) bip
+
+# else
+-- TODO: try tracking the size:
+data DenseLabelSet = DLS {-# UNPACK #-} !Int (UB.Vector B.Bit)
+markLabel lab (DLS _ vec)= DLS (UB.modify (\vec -> return (MU.write vec lab (B.fromBool True))) ) vec
+-- ....
+# endif
+
+#else
+type DenseLabelSet = SI.IntSet
+markLabel lab set = SI.insert lab set
+mkEmptyDense _size = SI.empty
+mkSingleDense _size = SI.singleton
+denseUnions _size = SI.unions
+bipSize = SI.size
+denseDiff = SI.difference
+denseIsSubset = SI.isSubsetOf
+
+dispBip labs bip = "[" ++ unwords strs ++ "]"
+ where strs = map (labs M.!) $ SI.toList bip
+invertDense size bip = loop SI.empty (size-1)
+ where -- There's nothing for it but to iterate and test for membership:
+ loop !acc ix | ix < 0 = acc
+ | SI.member ix bip = loop acc (ix-1)
+ | otherwise = loop (SI.insert ix acc) (ix-1)
+traverseDense_ fn bip =
+ -- FIXME: need guaranteed non-allocating way to do this.
+ SI.foldr' (\ix acc -> fn ix >> acc) (return ()) bip
+#endif
+
+markLabel :: Label -> DenseLabelSet -> DenseLabelSet
+mkEmptyDense :: Int -> DenseLabelSet
+mkSingleDense :: Int -> Label -> DenseLabelSet
+denseUnions :: Int -> [DenseLabelSet] -> DenseLabelSet
+bipSize :: DenseLabelSet -> Int
+
+-- | Print a BiPartition in a pretty form
+dispBip :: LabelTable -> DenseLabelSet -> String
+
+-- | Assume that total taxa are 0..N-1, invert membership:
+invertDense :: Int -> DenseLabelSet -> DenseLabelSet
+
+traverseDense_ :: Monad m => (Int -> m ()) -> DenseLabelSet -> m ()
+
+
+--------------------------------------------------------------------------------
+-- Dirt-simple reference implementation
+--------------------------------------------------------------------------------
+
+type DistanceMatrix = V.Vector (U.Vector Int)
+
+-- | Returns a triangular distance matrix encoded as a vector.
+-- Also return the set-of-BIPs representation for each tree.
+--
+-- This uses a naive method, directly computing the pairwise
+-- distance between each pair of trees.
+--
+-- This method is TOLERANT of differences in the label/taxa sets between two trees.
+-- It simply prunes to the intersection before doing the distance comparison.
+-- Other scoring methods may be added in the future. (For example, penalizing for
+-- missing taxa.)
+naiveDistMatrix :: [NewickTree DefDecor] -> (DistanceMatrix, V.Vector (S.Set DenseLabelSet))
+naiveDistMatrix lst =
+ let sz = P.length lst
+ treeVect = V.fromList lst
+ labelSets = V.map treeLabels treeVect
+ eachbips = V.map allBips treeVect
+ mat = V.generate sz $ \ i ->
+ U.generate i $ \ j ->
+ let
+ inI = (labelSets V.! i)
+ inJ = (labelSets V.! j)
+ inBoth = S.intersection inI inJ
+
+ -- Match will always succeed due to size==0 test below:
+ Just prI = pruneTreeLeaves inBoth (treeVect V.! i)
+ Just prJ = pruneTreeLeaves inBoth (treeVect V.! j)
+
+ -- Memoization: If we are using it at its full size we can use the cached one:
+ bipsI = if S.size inBoth == S.size inI
+ then (eachbips V.! i)
+ else allBips prI
+ bipsJ = if S.size inBoth == S.size inJ
+ then (eachbips V.! j)
+ else allBips prJ
+
+ diff1 = S.size (S.difference bipsI bipsJ)
+ diff2 = S.size (S.difference bipsJ bipsI) -- symmetric difference
+ in if S.size inBoth == 0
+ then 0 -- This is weird, but what other answer could we give?
+ else diff1 + diff2
+ in (mat, eachbips)
+
+ where
+ treeLabels :: NewickTree a -> S.Set Label
+ treeLabels (NTLeaf _ lab) = S.singleton lab
+ treeLabels (NTInterior _ ls) = S.unions (map treeLabels ls)
+
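+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): compute the naive all-pairs RF matrix for a list of trees and
+-- print it to the given handle.
+_printNaiveRF :: Handle -> [NewickTree DefDecor] -> IO ()
+_printNaiveRF h trs = printDistMat h (fst (naiveDistMatrix trs))
+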
+-- | The number of bipartitions implied by a tree is one per EDGE in the tree. Thus
+-- each interior node carries a list of BiPs the same length as its list of children.
+labelBips :: NewickTree a -> NewickTree (a, [DenseLabelSet])
+labelBips tr =
+-- trace ("labelbips "++show allLeaves++" "++show size) $
+#ifdef NORMALIZATION
+ fmap (\(a,ls) -> (a,map (normBip size) ls)) $
+#endif
+ loop tr
+ where
+ size = numLeaves tr
+ zero = mkEmptyDense size
+ loop (NTLeaf dec lab) = NTLeaf (dec, [markLabel lab zero]) lab
+ loop (NTInterior dec chlds) =
+ let chlds' = map loop chlds
+ sets = map (denseUnions size . snd . get_dec) chlds' in
+ NTInterior (dec, sets) chlds'
+
+ allLeaves = leafSet tr
+ leafSet (NTLeaf _ lab) = mkSingleDense size lab
+ leafSet (NTInterior _ ls) = denseUnions size $ map leafSet ls
+
+-- normBip :: DenseLabelSet -> DenseLabelSet -> DenseLabelSet
+-- normBip allLeaves bip =
+normBip :: Int -> DenseLabelSet -> DenseLabelSet
+normBip totsize bip =
+ let -- size = bipSize allLeaves
+ halfSize = totsize `quot` 2
+-- flipped = denseDiff allLeaves bip
+ flipped = invertDense totsize bip
+ in
+ case compare (bipSize bip) halfSize of
+ LT -> bip
+ GT -> flipped -- Flip it
+ EQ -> min bip flipped -- This is a painful case, we need a tie-breaker
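+
+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): with 6 taxa, a bipartition holding 4 of them is normalized to its
+-- 2-element complement, the smaller side (shown for the default IntSet
+-- representation; the result is the label set {4,5}).
+_normExample :: DenseLabelSet
+_normExample = normBip 6 (denseUnions 6 (map (mkSingleDense 6) [0,1,2,3]))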
+
+
+foldBips :: Monoid m => (DenseLabelSet -> m) -> NewickTree a -> m
+foldBips f tr = F.foldMap f' (labelBips tr)
+ where f' (_,bips) = F.foldMap f bips
+
+-- | Get all non-singleton BiPs implied by a tree.
+allBips :: NewickTree a -> S.Set DenseLabelSet
+allBips tr = S.filter ((> 1) . bipSize) $ foldBips S.insert tr S.empty
+
+--------------------------------------------------------------------------------
+-- Optimized, LVish version
+--------------------------------------------------------------------------------
+-- First, necessary types:
+
+-- UNFINISHED:
+#if 0
+-- | A collection of all observed bipartitions (bips) with a mapping of which trees
+-- contain which Bips.
+type BipTable s = IMap DenseLabelSet s (SparseTreeSet s)
+-- type BipTable = IMap BitList (U.Vector Bool)
+-- type BipTable s = IMap BitList s (NA.NatArray s Word8)
+
+-- | Sets of taxa (BiPs) that are expected to be sparse.
+type SparseTreeSet s = IS.ISet s TreeID
+-- TODO: make this a set of numeric tree IDs...
+-- NA.NatArray s Word8
+
+type TreeID = AnnotatedTree
+-- | Tree's are identified simply by their order within the list of input trees.
+-- type TreeID = Int
+#endif
+
+--------------------------------------------------------------------------------
+-- Alternate way of slicing the problem: HashRF
+--------------------------------------------------------------------------------
+
+-- The distance matrix is an atomically-bumped matrix of numbers.
+-- type DistanceMat s = NA.NatArray s Word32
+-- Except... bump isn't supported by our idempotent impl.
+
+-- | This version slices the problem a different way. A single pass over the trees
+-- populates the table of bipartitions. Then the table can be processed (locally) to
+-- produce (non-localized) increments to a distance matrix.
+hashRF :: Int -> [NewickTree a] -> DistanceMatrix
+hashRF num_taxa trees = build M.empty (zip [0..] trees)
+ where
+ num_trees = length trees
+ -- First build the table:
+ build acc [] = ingest acc
+ build acc ((ix,hd):tl) =
+ let bips = allBips hd
+ acc' = S.foldl' fn acc bips
+ fn acc bip = M.alter fn2 bip acc
+ fn2 (Just membs) = Just (markLabel ix membs)
+ fn2 Nothing = Just (mkSingleDense num_taxa ix)
+ in
+ build acc' tl
+
+ -- Second, ingest the table to construct the distance matrix:
+ ingest :: M.Map DenseLabelSet DenseLabelSet -> DistanceMatrix
+ ingest bipTable = runST theST
+ where
+ theST :: forall s0 . ST s0 DistanceMatrix
+ theST = do
+ -- Triangular matrix, starting narrow and widening:
+ matr <- MV.new num_trees
+ -- Too bad MV.replicateM is insufficient. It should pass index.
+ -- Instead we write this C-style:
+ for_ (0,num_trees) $ \ ix -> do
+ row <- MU.replicate ix (0::Int)
+ MV.write matr ix row
+ return ()
+
+ unsafeIOToST$ putStrLn$" Built matrix for dim "++show num_trees
+
+ let bumpMatr i j | j < i = incr i j
+ | otherwise = incr j i
+ incr :: Int -> Int -> ST s0 ()
+ incr i j = do -- Not concurrency safe yet:
+-- unsafeIOToST$ putStrLn$" Reading at position "++show(i,j)
+ row <- MV.read matr i
+ elm <- MU.read row j
+ MU.write row j (elm+1)
+ return ()
+ fn bipMembs =
+ -- Here we quadratically consider all pairs of trees and ask whether
+ -- their edit distance is increased based on this particular BiP.
+ -- Actually, as an optimization, it is sufficient to consider only the
+ -- cartesian product of those that have and those that don't.
+ let haveIt = bipMembs
+ -- Depending on how invertDense is written, it could be useful to
+ -- fuse this in and deforest "dontHave".
+ dontHave = invertDense num_trees bipMembs
+ fn1 trId = traverseDense_ (fn2 trId) dontHave
+ fn2 trId1 trId2 = bumpMatr trId1 trId2
+ in
+-- trace ("Computed donthave "++ show dontHave) $
+ traverseDense_ fn1 haveIt
+ F.traverse_ fn bipTable
+ v1 <- V.unsafeFreeze matr
+ T.traverse (U.unsafeFreeze) v1
+
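+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): the single-pass variant needs the total number of taxa up front;
+-- for trees over the same taxon set it can be read off any one of them with
+-- numLeaves.
+_hashRFExample :: [NewickTree DefDecor] -> DistanceMatrix
+_hashRFExample []         = V.empty
+_hashRFExample trs@(hd:_) = hashRF (numLeaves hd) trs
+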
+
+--------------------------------------------------------------------------------
+-- Miscellaneous Helpers
+--------------------------------------------------------------------------------
+
+instance Pretty a => Pretty (S.Set a) where
+ pPrint s = pPrint (S.toList s)
+
+
+printDistMat :: Handle -> V.Vector (U.Vector Int) -> IO ()
+printDistMat h mat = do
+ hPutStrLn h "Robinson-Foulds distance (matrix format):"
+ hPutStrLn h "-----------------------------------------"
+ V.forM_ mat $ \row -> do
+ U.forM_ row $ \elem -> do
+ hPutStr h (show elem)
+ hPutStr h " "
+ hPutStr h "0\n"
+ hPutStrLn h "-----------------------------------------"
+
+-- My own forM for numeric ranges (not requiring deforestation optimizations).
+-- Inclusive start, exclusive end.
+{-# INLINE for_ #-}
+for_ :: Monad m => (Int, Int) -> (Int -> m ()) -> m ()
+for_ (start, end) _fn | start > end = error "for_: start is greater than end"
+for_ (start, end) fn = loop start
+ where
+ loop !i | i == end = return ()
+ | otherwise = do fn i; loop (i+1)
+
+-- | Which of a set of trees are compatible with a consensus?
+filterCompatible :: NewickTree a -> [NewickTree b] -> [NewickTree b]
+filterCompatible consensus trees =
+ let cbips = allBips consensus in
+ [ tr | tr <- trees
+ , cbips `S.isSubsetOf` allBips tr ]
+
+-- | `compatibleWith consensus tree` -- Is a tree compatible with a consensus?
+-- This is more efficient if partially applied and then used repeatedly.
+--
+-- Note, tree compatibility is not the same as an exact match. It's
+-- like (<=) rather than (==). The "star topology" is consistent with
+-- all trees, because it induces the empty set of bipartitions.
+compatibleWith :: NewickTree a -> NewickTree b -> Bool
+compatibleWith consensus =
+ let consBips = allBips consensus in
+ \ newTr -> S.isSubsetOf consBips (allBips newTr)
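+
+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): partially applying compatibleWith builds the consensus
+-- bipartition set once, and the resulting predicate can then be mapped
+-- cheaply over many candidate trees.
+_countCompatible :: NewickTree a -> [NewickTree b] -> Int
+_countCompatible consensus trs = length (filter (compatibleWith consensus) trs)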
+
+-- | Consensus between two trees, which may even have different label maps.
+consensusTreeFull (FullTree n1 l1 t1) (FullTree n2 l2 t2) =
+ error "FINISHME - consensusTreeFull"
+
+-- | Take only the bipartitions that are agreed on by all trees.
+consensusTree :: Int -> [NewickTree a] -> NewickTree ()
+consensusTree _ [] = error "Cannot take the consensusTree of the empty list"
+consensusTree num_taxa (hd:tl) = bipsToTree num_taxa intersection
+ where
+ intersection = L.foldl' S.intersection (allBips hd) (map allBips tl)
+-- intersection = loop (allBips hd) tl
+-- loop :: S.Set DenseLabelSet -> [NewickTree a] -> S.Set DenseLabelSet
+-- loop !remain [] = remain
+-- -- Was attempting to use foldBips here as an optimization:
+-- -- loop !remain (hd:tl) = loop (foldBips S.delete hd remain) tl
+-- loop !remain (hd:tl) = loop (S.difference remain (allBips hd)) tl
+
+-- | Convert from bipartitions BACK to a single tree.
+bipsToTree :: Int -> S.Set DenseLabelSet -> NewickTree ()
+bipsToTree num_taxa origbip =
+-- trace ("Doing bips in order: "++show sorted++"\n") $
+ loop lvl0 sorted
+ where
+ -- We consider each subset in increasing size order.
+ -- FIXME: If we tweak the order on BIPs, then we can just use S.toAscList here:
+ sorted = L.sortBy (compare `on` bipSize) (S.toList origbip)
+
+ lvl0 = [ (mkSingleDense num_taxa ix, NTLeaf () ix)
+ | ix <- [0..num_taxa-1] ]
+
+ -- VERY expensive! However, due to normalization issues this is necessary for now:
+ -- TODO: in the future make it possible to definitively denormalize.
+ -- isMatch bip x = denseIsSubset x bip || denseIsSubset x (invertDense num_taxa bip)
+ isMatch bip x = denseIsSubset x bip
+
+ -- We recursively glom together subtrees until we have a complete tree.
+ -- We only process larger subtrees after we have processed all the smaller ones.
+ loop !subtrees [] =
+ case subtrees of
+ [] -> error "bipsToTree: internal error"
+ [(_,one)] -> one
+ lst -> NTInterior () (map snd lst)
+ loop !subtrees (bip:tl) =
+-- trace (" -> looping, subtrees "++show subtrees) $
+ let (in_,out) = L.partition (isMatch bip. fst) subtrees in
+ case in_ of
+ [] -> error $"bipsToTree: Internal error! No match for bip: "++show bip
+ ++" out is\n "++show out++"\n and remaining bips "++show (length tl)
+ ++"\n when processing orig bip set:\n "++show origbip
+ -- loop out tl
+ _ ->
+ -- Here all subtrees that match the current bip get merged:
+ loop ((denseUnions num_taxa (map fst in_),
+ NTInterior () (map snd in_)) : out) tl
+
diff --git a/Bio/Phylogeny/PhyBin/Util.hs b/Bio/Phylogeny/PhyBin/Util.hs
new file mode 100644
index 0000000..5fb107d
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/Util.hs
@@ -0,0 +1,139 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+-- RecordWildCards, TypeSynonymInstances, CPP
+-- {-# LANGUAGE NamedFieldPuns #-}
+-- {-# LANGUAGE OverloadedStrings #-}
+-- {-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
+-- {-# OPTIONS_GHC -fwarn-unused-imports #-}
+
+-- | This module contains misc bits used by (multiple) other modules.
+
+module Bio.Phylogeny.PhyBin.Util
+ (
+ is_regular_file, acquireTreeFiles,
+ safePrintDendro, sanityCheck
+ )
+ where
+
+import qualified Data.Foldable as F
+import Data.Function (on)
+import Data.List (delete, minimumBy, sortBy, insertBy, intersperse, sort)
+import Data.Maybe (fromMaybe, catMaybes)
+import qualified Data.Map as M
+import qualified Data.Set as S
+import qualified Data.Text.Lazy as T
+import Control.Monad (forM, forM_, filterM, when, unless)
+import Control.Exception (evaluate)
+import Control.Applicative ((<$>),(<*>))
+import Control.Concurrent (Chan)
+import System.FilePath (combine)
+import System.Directory (doesFileExist, doesDirectoryExist,
+ getDirectoryContents, getCurrentDirectory)
+import System.IO (openFile, hClose, IOMode(ReadMode), stderr,
+ hPutStr, hPutStrLn)
+import System.Exit (ExitCode(..))
+import System.Timeout (timeout)
+import Test.HUnit ((~:),(~=?),Test,test)
+
+-- For vizualization:
+import Text.PrettyPrint.HughesPJClass hiding (char, Style)
+import Bio.Phylogeny.PhyBin.CoreTypes
+-- import Bio.Phylogeny.PhyBin.Parser (parseNewick)
+-- import Bio.Phylogeny.PhyBin.Visualize (dotToPDF, dotNewickTree, viewNewickTree)
+
+import qualified Data.Clustering.Hierarchical as C
+import qualified Data.Graph.Inductive as G
+import qualified Data.GraphViz as Gv
+import Data.GraphViz.Printing (renderDot)
+import Data.GraphViz.Types.Canonical (nodeStmts, graphStatements)
+
+----------------------------------------------------------------------------------------------------
+-- OS specific bits:
+----------------------------------------------------------------------------------------------------
+-- #ifdef WIN32
+-- is_regular_file = undefined
+-- is_directory path =
+-- getFileAttributes
+-- --getFileInformationByHandle
+-- -- bhfiFileAttributes
+-- file_exists = undefined
+-- #else
+-- is_regular_file :: FilePath -> IO Bool
+-- is_regular_file file =
+-- do stat <- getFileStatus file;
+-- -- Hmm, this is probably bad practice... hard to know its exhaustive:
+-- return$ isRegularFile stat || isNamedPipe stat || isSymbolicLink stat
+-- is_directory :: FilePath -> IO Bool
+-- is_directory path =
+-- do stat <- getFileStatus path
+-- return (isDirectory stat)
+-- file_exists = fileExist
+-- #endif
+
+-- Here we ASSUME the path exists; given that, these functions are good enough:
+is_regular_file :: FilePath -> IO Bool
+is_regular_file = doesFileExist
+
+is_directory :: FilePath -> IO Bool
+is_directory = doesDirectoryExist
+
+file_exists :: FilePath -> IO Bool
+file_exists path =
+ do f <- doesFileExist path
+ d <- doesDirectoryExist path
+ return (f || d)
+
+--------------------------------------------------------------------------------
+
+-- | Expand out directories to find all the tree files.
+acquireTreeFiles :: [String] -> IO [String]
+acquireTreeFiles inputs = do
+ all :: [[String]] <- forM inputs $ \ path -> do
+ exists <- file_exists path
+
+ --stat <- if exists then getFileStatus path else return (error "internal invariant")
+ -- [2010.09.23] This is no longer really necessary:
+ if not exists then do
+ error$ "No file or directory found at this path!: "++path
+ -- hPutStr stderr$ "Input not a file/directory, assuming wildcard, using 'find' for expansion"
+ -- entries <- HSH.run$ "find " ++ path
+ -- hPutStrLn stderr$ "("++show (length entries)++" files found): "++ show path
+ -- return entries
+ else do
+ isdir <- is_directory path
+ reg <- is_regular_file path
+ if isdir then do
+ hPutStr stderr$ "Input is a directory, reading all regular files contained "
+ children <- getDirectoryContents path
+ filtered <- filterM is_regular_file $ map (combine path) children
+ hPutStrLn stderr$ "("++show (length filtered)++" regular files found): "++ show path
+ return$ filtered
+ else if reg then do
+ return [path]
+ else error$ "phybin: Unhandled input path: " ++ path
+
+ return (concat all)
+
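+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): expand a mixture of files and directories and report how many
+-- tree files were found.
+_countTreeFiles :: [String] -> IO ()
+_countTreeFiles paths = do
+  files <- acquireTreeFiles paths
+  hPutStrLn stderr ("tree files found: " ++ show (length files))
+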
+--------------------------------------------------------------------------------
+
+-- | Step carefully in case of cycles (argh).
+safePrintDendro :: Gv.DotGraph G.Node -> IO (Maybe T.Text)
+safePrintDendro dotg= do
+-- putStrLn$ "Dendrogram graph size: "++ show (F.foldl' (\a _ -> a+1) 0 dotg)
+ mx <- timeout (2 * 1000 * 1000) $ do
+-- putStrLn$ "Dendrogram graph, is directed?: "++ show (Gv.directedGraph dotg)
+ putStrLn$ "Dendrogram graph size: "++ show (length $ nodeStmts $ graphStatements dotg)
+ let str = renderDot $ Gv.toDot dotg
+ evaluate (T.length str)
+ return str
+ case mx of
+ Nothing -> do putStrLn "WARNING: DotGraph appears to be a cyclic structure. This is probably a bug."
+ return Nothing
+ _ -> return mx
+
+sanityCheck :: C.Dendrogram (FullTree DefDecor) -> IO ()
+sanityCheck dendro = do
+ let fn seen elm | S.member (treename elm) seen =
+ error$"Dendrogram failed sanity check! Tree name occurs multiple times: "++(treename elm)
+ | otherwise = S.insert (treename elm) seen
+ sz = S.size $ F.foldl' fn S.empty dendro
+ putStrLn$ "Sanity checked dendrogram of size: "++show sz
diff --git a/Bio/Phylogeny/PhyBin/Visualize.hs b/Bio/Phylogeny/PhyBin/Visualize.hs
new file mode 100644
index 0000000..2f72774
--- /dev/null
+++ b/Bio/Phylogeny/PhyBin/Visualize.hs
@@ -0,0 +1,449 @@
+{-# LANGUAGE NamedFieldPuns #-}
+
+module Bio.Phylogeny.PhyBin.Visualize
+ (dotNewickTree, dotToPDF, viewNewickTree,
+ dotNewickTree_debug,
+
+ -- * Dendrogram visualization
+ dendrogramToGraph, dotDendrogram
+ )
+ where
+import Text.Printf (printf)
+import Data.List (elemIndex, isPrefixOf)
+import Data.List.Split (chunksOf)
+import Data.Maybe (fromJust, isJust)
+import qualified Data.Map as M
+import qualified Data.Set as S
+import qualified Data.Vector as V
+import Data.Text.Lazy (pack)
+import Control.Monad (void)
+import Control.Concurrent (Chan, newChan, writeChan, forkIO)
+import qualified Data.Graph.Inductive as G hiding (run)
+import qualified Data.GraphViz as Gv hiding (parse, toLabel)
+import qualified Data.GraphViz.Attributes.Complete as GA
+import qualified Data.GraphViz.Attributes.Colors as GC
+import Data.GraphViz.Attributes.Colors (Color(RGB))
+
+-- import Test.HUnit ((~:),(~=?),Test,test)
+
+import System.Timeout (timeout)
+
+import qualified Data.Clustering.Hierarchical as C
+
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Bio.Phylogeny.PhyBin.RFDistance (filterCompatible, compatibleWith, consensusTree)
+
+import Debug.Trace
+
+----------------------------------------------------------------------------------------------------
+-- Visualization with GraphViz and FGL:
+----------------------------------------------------------------------------------------------------
+
+-- First we need to be able to convert our trees to FGL graphs:
+toGraph :: FullTree StandardDecor -> G.Gr String Double
+toGraph (FullTree _ tbl tree) = G.run_ G.empty $ loop tree
+ where
+ fromLabel ix = tbl M.! ix
+ loop (NTLeaf _ name) =
+ do let str = fromLabel name
+ _ <- G.insMapNodeM str
+ return str
+ loop (NTInterior (StandardDecor{sortedLabels}) ls) =
+ do let bigname = concatMap fromLabel sortedLabels
+ names <- mapM loop ls
+ _ <- G.insMapNodeM bigname
+ mapM_ (\x -> G.insMapEdgeM (bigname, x, 0.0)) names
+ return bigname
+
+-- This version uses the tree nodes themselves as graph labels.
+toGraph2 :: FullTree StandardDecor -> G.Gr (NewickTree StandardDecor) Double
+toGraph2 (FullTree _ tbl tree) = G.run_ G.empty $ loop tree
+ where
+ loop node@(NTLeaf _ _) =
+ do _ <- G.insMapNodeM node
+ return ()
+ loop node@(NTInterior _ ls) =
+ do mapM_ loop ls
+ _ <- G.insMapNodeM node
+ -- Edge weights as just branchLen (not bootstrap):
+ mapM_ (\x -> G.insMapEdgeM (node, x, branchLen$ get_dec x)) ls
+ return ()
+
+----------------------------------------------------------------------------------------------------
+-- Dendrograms
+----------------------------------------------------------------------------------------------------
+
+-- | Convert a dendrogram to a DotGraph. Shares some duplicated code with dotNewickTree.
+dotDendrogram :: PhyBinConfig -> String -> Double -> C.Dendrogram (FullTree a) ->
+ Maybe (M.Map TreeName Int) -> [[NewickTree ()]] -> Gv.DotGraph G.Node
+dotDendrogram PBC{show_trees_in_dendro, show_interior_consensus}
+ title edge_scale origDendro mNameMap highlightTrs =
+ Gv.graphToDot myparams (G.nmap uid graph)
+ where
+ (charsDropped, dendro) = truncateNames origDendro
+ -- This is ugly, but we modify the name map to match:
+ nameMap' = fmap (M.mapKeys (drop charsDropped)) mNameMap
+ graph :: DendroGraph
+ graph = dendrogramToGraph num_taxa dendro
+
+ num_taxa = numLeaves $ nwtree fstLeaf
+ FullTree{labelTable} = fstLeaf
+ fstLeaf = getFstLeaf dendro
+ getFstLeaf (C.Leaf ft) = ft
+ getFstLeaf (C.Branch _ l _) = getFstLeaf l
+
+ uidsToNames = M.fromList $
+ map (\nd@NdLabel{uid} -> (uid,nd)) $
+-- filter (isJust . tre) $
+ map (fromJust . G.lab graph) $
+ G.nodes graph
+
+ matchers = map mkMatcher highlightTrs
+ mkMatcher ls = let fns = map compatibleWith ls -- Multiple predicate functions
+ in \ tr -> -- Does it match any predicate?
+ or$ map (\f -> f tr) fns
+
+ wcolors = zip matchers defaultPalette
+ findColor tr = loop wcolors
+ where loop [] = Nothing
+ loop ((f,col):rst) | f tr = Just col
+ | otherwise = loop rst
+
+ -- Map uids to highlight color
+ highlightMap = M.map fromJust $
+ M.filter isJust $
+ M.map (\ (NdLabel _ _ _ ct) -> findColor ct)
+ uidsToNames
+
+ myparams :: Gv.GraphvizParams G.Node String Double () String -- (Either String String)
+ myparams = Gv.defaultParams { Gv.globalAttributes= [Gv.GraphAttrs [GA.Label$ GA.StrLabel$ pack title]],
+ Gv.fmtNode= nodeAttrs,
+ Gv.fmtEdge= edgeAttrs
+ }
+ nodeAttrs :: (Int, UniqueNodeName) -> [GA.Attribute]
+ nodeAttrs (_num, uid) =
+ let uid' = if show_trees_in_dendro
+ then printed_tree++"\n"++uid
+ else uid
+ printed_tree =
+ case M.lookup uid uidsToNames of
+ Nothing -> ""
+ Just NdLabel{clumpSize,consensus} ->
+ (if clumpSize>1 then "size "++show clumpSize++"\n" else "")
+ ++ show (displayStrippedTree (FullTree "" labelTable consensus))
+ (tag,shp,styl) = -- case eith of
+ if isPrefixOf "DUMMY_" uid
+ then (if show_trees_in_dendro && show_interior_consensus
+ then printed_tree else "",
+ if show_interior_consensus
+ then GA.BoxShape -- GA.PlainText
+ else GA.PointShape,
+ [ GA.Color [weighted$ GA.X11Color Gv.Transparent] ]
+ )
+ else (uid', GA.Ellipse, [ GA.Style [GA.SItem GA.Filled []]])
+ highlightColor =
+ case M.lookup uid highlightMap of
+ Nothing -> []
+ Just col -> [ GA.Color [weighted col ] ]
+ clustColor | not (null highlightTrs) = []
+ | otherwise =
+ case (nameMap', M.lookup uid uidsToNames) of
+ (Just nm, Just NdLabel{tre=Just FullTree{treename}}) ->
+ case M.lookup treename nm of
+ Nothing -> []
+ -- Here we color the top TEN clusters:
+ Just ind | ind <= 10 -> [ GA.Color [weighted$ defaultPaletteV V.! (ind-1) ] ]
+ | otherwise -> []
+ -- TODO: shall we color intermediate nodes?
+ _ -> []
+ in
+ [ GA.Label$ GA.StrLabel$ pack tag
+ , GA.Shape shp
+ ] ++ styl ++ highlightColor ++ clustColor
+
+ edgeAttrs = getEdgeAttrs edge_scale
+
+
+type UniqueNodeName = String
+-- type DendroGraph = G.Gr (Maybe TreeName,UniqueNodeName) Double
+type DendroGraph = G.Gr NdLabel Double
+
+-- | When we first convert to a graph representation, there is a bunch of information
+-- hanging off of each node.
+data NdLabel =
+ NdLabel
+ { uid :: UniqueNodeName
+ , tre :: Maybe (FullTree ())
+ , clumpSize :: !Int
+ , consensus :: NewickTree ()
+ }
+ deriving (Show, Ord, Eq)
+
+-- | Create a graph using TreeNames for node labels and edit-distance for edge labels.
+dendrogramToGraph :: Int -> C.Dendrogram (FullTree a) -> DendroGraph
+dendrogramToGraph num_taxa orig =
+ G.run_ G.empty $ void$
+ loop (fmap (fmap (const())) orig)
+ where
+-- loop node@(C.Leaf ft) = G.insMapNodeM (NdLabel (treename ft) (Just$ fmap (const()) ft) 1 ft)
+ loop node@(C.Leaf ft) =
+ let stripped = fmap (const()) ft in
+ G.insMapNodeM (NdLabel (treename ft) (Just stripped) 1 (nwtree stripped))
+ loop node@(C.Branch 0 left right) = do
+ -- As a preprocessing step we collapse clusters that are separated by zero edit distance.
+ ----------------------------------------
+ let lvs = collapseZeroes left ++ collapseZeroes right
+ nms = map (treename) lvs
+ lens = map length nms
+ total = sum lens
+ avg = total `quot` length nms
+ -- The goal here is to make an approximately square arrangement:
+ -- goal: avg * perline == total / (avg * perline)
+ perline = ceiling$ sqrt (fromIntegral total / ((fromIntegral avg)^2))
+ chunked = chunksOf perline nms
+ fatname = unlines (map unwords chunked)
+ G.insMapNodeM (NdLabel fatname (Just (head lvs))
+ (length lvs) (consensusTree num_taxa (map nwtree lvs)))
+ ----------------------------------------
+ loop node@(C.Branch dist left right) =
+ do (_,ll@(NdLabel lid _ s1 c1)) <- loop left
+ (_,rr@(NdLabel rid _ s2 c2)) <- loop right
+ -- Interior nodes do NOT have their names drawn:
+ let ndname = "DUMMY_"++(lid++"_"++rid) -- HACK!
+ (midN,mid) <- G.insMapNodeM (NdLabel ndname Nothing (s1+s2) (consensusTree num_taxa [c1,c2]))
+ G.insMapEdgeM (ll, mid, dist)
+ G.insMapEdgeM (rr, mid, dist)
+ return (midN,mid)
+
+ collapseZeroes (C.Leaf tr) = [tr]
+ collapseZeroes (C.Branch 0 l r) = collapseZeroes l ++ collapseZeroes r
+ collapseZeroes oth = error "dendrogramToGraph: internal error. Not expecting non-zero branch length here."
+
+
+-- | The plot looks nicer when the names aren't bloated with repeated stuff. This
+-- replaces all tree names with potentially shorter names by removing the common prefix.
+-- Returns how many characters were dropped.
+truncateNames :: C.Dendrogram (FullTree a) -> (Int, C.Dendrogram (FullTree a))
+truncateNames dendro = (prefChars, fmap chopName dendro)
+ where
+ chopName ft = ft{ treename= drop prefChars (treename ft) }
+ prefChars = length$ commonPrefix$ S.toList$ allNames dendro
+ allNames (C.Leaf tr) = S.singleton (treename tr)
+ allNames (C.Branch _ l r) = S.union (allNames l) (allNames r)
+
+----------------------------------------------------------------------------------------------------
+
+
+-- | Open a GUI window to display a tree.
+--
+-- Fork a thread that then runs graphviz.
+-- The channel returned will carry a single message to signal
+-- completion of the subprocess.
+viewNewickTree :: String -> FullTree StandardDecor -> IO (Chan (), FullTree StandardDecor)
+-- TODO: UPDATE THIS TO RETURN AN ASYNC!!
+viewNewickTree title tree@(FullTree{nwtree}) =
+ do chan <- newChan
+ let dot = dotNewickTree title (1.0 / avg_branchlen [nwtree])
+ tree
+ runit = do mx <- -- timeout defaultTimeout $
+ -- This one is interactive, so we don't need a timeout.
+ Gv.runGraphvizCanvas default_cmd dot Gv.Xlib
+ -- case mx of
+ -- Nothing -> putStrLn$ "WARNING: call to graphviz TIMED OUT."++file
+ -- _ -> return ()
+ writeChan chan ()
+ --str <- prettyPrint d
+ --putStrLn$ "Generating the following graphviz tree:\n " ++ str
+ _ <- forkIO runit
+ --do runGraphvizCanvas Dot dot Xlib; return ()
+
+ return (chan, tree)
+
+
+--default_cmd = TwoPi -- Totally ignores edge lengths.
+default_cmd :: Gv.GraphvizCommand
+default_cmd = Gv.Neato
+
+-- Show a float without scientific notation:
+myShowFloat :: Double -> String
+-- showFloat weight = showEFloat (Just 2) weight ""
+myShowFloat fl =
+ let rnd = round fl in
+ if fl == fromIntegral rnd
+ then show rnd
+ else printf "%.4f" fl
+
+-- | Render a Dot graph to a .pdf file.
+dotToPDF :: Gv.DotGraph G.Node -> FilePath -> IO (Maybe FilePath)
+dotToPDF dot file = do
+ -- If we have any problem with graphviz we want to time that out rather than let
+ -- our whole run hang:
+ x <- timeout defaultTimeout $
+ Gv.runGraphvizCommand default_cmd dot Gv.Pdf file
+ case x of
+ Nothing -> do putStrLn$ "WARNING: call to graphviz TIMED OUT. File not plotted: "++file
+ return Nothing
+ _ -> return x
+
+-- Arbitrary: 15 second timeout.
+defaultTimeout :: Int
+defaultTimeout = (15 * 1000 * 1000)
+
+-- | Convert a NewickTree to a graphviz Dot graph representation.
+dotNewickTree :: String -> Double -> FullTree StandardDecor -> Gv.DotGraph G.Node
+dotNewickTree title edge_scale atree@(FullTree _ tbl tree) =
+ --trace ("EDGE SCALE: " ++ show edge_scale) $
+ Gv.graphToDot myparams graph
+ where
+ graph = toGraph2 atree
+ fromLabel ix = tbl M.! ix
+ myparams :: Gv.GraphvizParams G.Node (NewickTree StandardDecor) Double () (NewickTree StandardDecor)
+ myparams = Gv.defaultParams { Gv.globalAttributes= [Gv.GraphAttrs [GA.Label$ GA.StrLabel$ pack title]],
+ Gv.fmtNode= nodeAttrs, Gv.fmtEdge= edgeAttrs }
+ nodeAttrs :: (Int,NewickTree StandardDecor) -> [GA.Attribute]
+ nodeAttrs (_num, node) =
+ let children = get_children node in
+ [ GA.Label$ GA.StrLabel$ pack$
+ concatMap fromLabel $
+ sortedLabels $ get_dec node
+ , GA.Shape (if null children then {-PlainText-} GA.Ellipse else GA.PointShape)
+ , GA.Style [GA.SItem GA.Filled []]
+ ]
+ edgeAttrs = getEdgeAttrs edge_scale
+
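+-- Illustrative sketch (added for this write-up, not part of the upstream
+-- module): render a single annotated tree to a PDF, scaling edge lengths by
+-- the inverse of the average branch length, mirroring viewNewickTree above.
+_renderOneTree :: FilePath -> FullTree StandardDecor -> IO (Maybe FilePath)
+_renderOneTree path ft@(FullTree{nwtree}) =
+  dotToPDF (dotNewickTree (treename ft) (1.0 / avg_branchlen [nwtree]) ft) path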
+
+-- | Some arbitrarily chosen colors from the X11 set:
+defaultPaletteV :: V.Vector GA.Color
+defaultPaletteV = V.fromList defaultPalette
+
+defaultPalette :: [GA.Color]
+defaultPalette = concat$ replicate 4 $ map GA.X11Color
+ [ Gv.Aquamarine
+ , Gv.PaleVioletRed
+ , Gv.MediumPurple
+ , Gv.PaleGreen
+ , Gv.PapayaWhip
+ , Gv.SkyBlue
+ , Gv.Yellow
+ , Gv.Crimson
+ , Gv.Gray
+ , Gv.PaleGoldenrod
+ ]
+
+-- Grabbed the first couple palettes off a website:
+altPalette :: V.Vector GA.Color
+altPalette = V.fromList $ concat $ replicate 3 $
+ -- http://www.colourlovers.com/palette/2962435/Autumn_Roses
+ [ RGB 159 74 81, RGB 217 183 173, RGB 149 91 116, RGB 185 138 148
+ -- , RGB 101 69 82 -- too dark
+ ] ++
+ -- http://www.colourlovers.com/palette/2962434/Earthy_warm
+ [ RGB 108 74 39, RGB 207 179 83, RGB 180 149 60, RGB 244 242 185
+ -- , RGB 61 63 39
+ ]
+
+getEdgeAttrs :: Double -> (t, t1, Double) -> [GA.Attribute]
+getEdgeAttrs edge_scale = edgeAttrs
+ where
+ -- TOGGLE:
+ -- edgeAttrs (_,_,weight) = [ArrowHead noArrow, Len (weight * edge_scale + bump), GA.Label (StrLabel$ show (weight))]
+ edgeAttrs (_,_,weight) =
+ let draw_weight = compute_draw_weight weight edge_scale in
+ --trace ("EDGE WEIGHT "++ show weight ++ " drawn at "++ show draw_weight) $
+ [GA.ArrowHead Gv.noArrow,
+ GA.Label$ GA.StrLabel$ pack$ myShowFloat weight] ++ -- TEMPTOGGLE
+ --[ArrowHead noArrow, GA.Label (StrLabel$ show draw_weight)] ++ -- TEMPTOGGLE
+ if weight == 0.0
+ then [GA.Color [weighted$ GA.X11Color Gv.Red],
+ GA.LWidth 3.0, GA.Len minlen]
+ else [GA.Len draw_weight]
+ minlen = 0.7
+ maxlen = 3.0
+ compute_draw_weight w scale =
+ let scaled = (abs w) * scale + minlen in
+ -- Don't draw them too big or it gets annoying:
+ (min scaled maxlen)
+
+weighted c = GC.WC {GC.wColor=c, GC.weighting=Nothing}
+
+-- | This version shows the ordered/rooted structure of the normalized tree.
+dotNewickTree_debug :: String -> FullTree StandardDecor -> Gv.DotGraph G.Node
+dotNewickTree_debug title atree@(FullTree _ tbl tree) = Gv.graphToDot myparams graph
+ where
+ graph = toGraph2 atree
+ fromLabel ix = tbl M.! ix
+ myparams :: Gv.GraphvizParams G.Node (NewickTree StandardDecor) Double () (NewickTree StandardDecor)
+ myparams = Gv.defaultParams { Gv.globalAttributes= [Gv.GraphAttrs [GA.Label$ GA.StrLabel$ pack title]],
+ Gv.fmtNode= nodeAttrs, Gv.fmtEdge= edgeAttrs }
+ nodeAttrs :: (Int,(NewickTree StandardDecor)) -> [GA.Attribute]
+ nodeAttrs (num,node) =
+ let children = get_children node in
+ [ GA.Label (if null children
+ then GA.StrLabel$ pack$ concatMap fromLabel $ sortedLabels $ get_dec node
+ else GA.RecordLabel$ take (length children) $
+ -- This will leave interior nodes unlabeled:
+ map (GA.PortName . GA.PN . pack) $ map show [1..]
+ -- This version gives some kind of name to interior nodes:
+-- map (\ (i,ls) -> LabelledTarget (PN$ show i) (fromLabel$ head ls)) $
+-- zip [1..] (map (thd3 . get_dec) children)
+ )
+ , GA.Shape GA.Record
+ , GA.Style [GA.SItem GA.Filled []]
+ ]
+
+ edgeAttrs (num1, num2, _weight) =
+ let node1 = fromJust$ G.lab graph num1
+ node2 = fromJust$ G.lab graph num2
+ ind = fromJust$ elemIndex node2 (get_children node1)
+ in [GA.TailPort$ GA.LabelledPort (GA.PN$ pack$ show$ 1+ind) (Just GA.South)]
+
+
+
+
+--------------------------------------------------------------------------------
+-- Unit Testing
+--------------------------------------------------------------------------------
+
+
+-- tt0 :: IO (Chan (), FullTree)
+-- tt0 = drawNewickTree "tt0" $ annotateWLabLists $ parseNewick "" "(A,(C,D,E),B);"
+
+-- tt3 :: IO (Chan (), FullTree)
+-- tt3 = drawNewickTree "tt3" tt
+
+-- tt4 :: IO (Chan (), FullTree)
+-- tt4 = drawNewickTree "tt4"$ trace ("FINAL: "++ show (pPrint norm4)) $ norm4
+
+-- tt5 :: IO (Chan (), FullTree)
+-- tt5 = drawNewickTree "tt5"$ norm5
+
+
+-- tt5' :: String
+-- tt5' = prettyPrint' $ dotNewickTree "norm5" 1.0 norm5
+
+-- ttall :: IO (Chan (), FullTree)
+-- ttall = do tt3; tt4; tt5
+
+-- tt2 :: G.Gr String Double
+-- tt2 = toGraph tt
+
+
+-- unitTests :: Test
+-- unitTests = test
+-- [
+-- ]
+
+
+-- TEMP / HACK:
+prettyPrint' :: Show a => a -> String
+prettyPrint' = show
+
+-- | Common prefix of a list of lists.
+commonPrefix :: Eq a => [[a]] -> [a]
+commonPrefix [] = []
+commonPrefix ls@(hd:tl)
+ | any null ls = []
+ | otherwise =
+ if all ((== (head hd)) . head) tl
+ then head hd : commonPrefix (map tail ls)
+ else []  -- Stop at the first position where the strings disagree.
diff --git a/DEVLOG.md b/DEVLOG.md
new file mode 100644
index 0000000..606d83b
--- /dev/null
+++ b/DEVLOG.md
@@ -0,0 +1,140 @@
+
+
+[2012.11.19]
+------------
+
+Right now the rickettsia dataset is causing problems:
+
+ phybin.exe: Rickettsia/phybin_output/bin1_16.txt: openFile: does not exist (No such file or directory)
+
+[2013.07.22] {Beginning optimization of all-to-all RF distance}
+---------------------------------------------------------------
+
+Making the labels into Ints and switching to IntSet has sped up the
+150-taxa/100-trees example from 3 sec to 1.6 seconds on my macbook
+retina.
+
+
+[2013.07.23] {Debugging}
+------------------------
+
+Ok, clustering on RFDistance with --editdist 0 should give the same
+answers as phybin --bin, but it's not!
+
+On Irene's current Wolbachia dataset, traditional phybin --bin comes
+up with this as its top cluster:
+
+ RAxML_bipartitions.88
+ RAxML_bipartitions.506
+ RAxML_bipartitions.476
+ RAxML_bipartitions.39
+ RAxML_bipartitions.386
+ RAxML_bipartitions.321
+ RAxML_bipartitions.3
+ RAxML_bipartitions.276
+ RAxML_bipartitions.260
+ RAxML_bipartitions.256
+ RAxML_bipartitions.233
+ RAxML_bipartitions.197
+ RAxML_bipartitions.171
+ RAxML_bipartitions.165
+ RAxML_bipartitions.141
+ RAxML_bipartitions.116
+
+By contrast, --UPGMA --editdist0 comes up with this:
+
+ RAxML_bipartitions.120
+ RAxML_bipartitions.167
+ RAxML_bipartitions.250
+ RAxML_bipartitions.38
+ RAxML_bipartitions.444
+ RAxML_bipartitions.494
+
+This is with no branch-collapsing... Ugh, that's no overlap at all.
+If we look at the second bin for --UPGMA though, we see overlap:
+
+ RAxML_bipartitions.233
+ RAxML_bipartitions.3
+ RAxML_bipartitions.321
+ RAxML_bipartitions.39
+ RAxML_bipartitions.476
+ RAxML_bipartitions.88
+
+Ok, at least one is a subset of the other this time... and here's the diff:
+
+ RAxML_bipartitions.116
+ RAxML_bipartitions.141
+ RAxML_bipartitions.165
+ RAxML_bipartitions.171
+ RAxML_bipartitions.197
+ RAxML_bipartitions.256
+ RAxML_bipartitions.260
+ RAxML_bipartitions.276
+ RAxML_bipartitions.386
+ RAxML_bipartitions.506
+
+And here's the tree that phybin --bin produced:
+
+ ((14, 3_), (19, (5_, 13)), ((1_, 2_), (7_, (18, 6_))));
+
+I singled out 116 and 233, and then printed their normalized forms
+using "phybin -p 2 --printnorms tests/t2/*":
+
+ Tree "116"
+ ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_))))
+ Tree "233"
+ ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_))))
+
+And UNNORMALIZED:
+
+ Tree "116"
+ (3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14)
+ Tree "233"
+ (5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13)
+
+Wait.... this makes it look like normalization is WRONG!!!
+Tree 233 starts with 13+5 in a three-way split with a big chunk
+containing everything else.
+
+In fact, the RFDistance is three, which is probably right.
+
+[2013.07.23] {Updating set reps, timing}
+----------------------------------------
+
+Ok, went ahead and switched over to unboxed bitvectors for
+DenseLabelSet. It's 10X slower!
+
+The BITVEC_BIPS option is getting the right answer now though.
+Apparently the normalization bug just fixed was the only bug.
+
+For 100 trees, 150 taxa (from the hashrf suite):
+
+ * IntSet takes 2.2s, 7.2G alloc, 114M copied.
+ * bit vec takes 21s and 69G alloc, 264M copied.
+
+Productivities are high in both cases.
+
+Oops... there was a bug in bipSize. I'm surprised it was getting the
+right answer. Fixing that brought it down to 39G alloc, 12.3s.
+Still, bit vectors are NOT a win for the direct/simple N^2 version.
+
+
+[2013.07.25] {Comparison against HashRF}
+----------------------------------------
+
+Note that HashRF doesn't have very good error messages. For example,
+if given our messy data with some trees missing taxa:
+
+ ./hashrf Wolbachia/all_trees.tr 928
+
+ *** Collecting the taxon labels ***
+ Number of taxa = 10
+
+ *** Reading tree file and collecting bipartitions ***
+ libc++abi.dylib: terminate called throwing an exception
+ Abort trap: 6
+
+In fact... even if I use phybin to prune out the 503 trees with 10
+taxa, I still get the error.
+
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1a44106
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,31 @@
+NOTE: This License applies to all files in the "Haskell CnC"
+distribution, irrespective of whether the license is appended at the
+top of each file.
+
+ BSD STANDARD THREE CLAUSE LICENSE ("New BSD License")
+
+Copyright (c) 2010, Intel Corporation.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the <organization> nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/Main.hs b/Main.hs
new file mode 100644
index 0000000..deaaf0c
--- /dev/null
+++ b/Main.hs
@@ -0,0 +1,617 @@
+{-# LANGUAGE RecordWildCards, TupleSections, NamedFieldPuns #-}
+{-# OPTIONS_GHC -fwarn-unused-imports -fwarn-incomplete-patterns #-}
+
+-- | The MAIN module, of course. This is the script that deals with
+-- command line options and calling into the heart of the beast.
+
+module Main where
+import Data.List (sort, intersperse, foldl')
+import Data.List.Split (splitOneOf)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.Map as M
+import qualified Data.Set as S
+import Control.Monad
+import Control.Concurrent (readChan)
+import System.Environment (getArgs, withArgs)
+import System.Console.GetOpt (OptDescr(Option), ArgDescr(..), ArgOrder(..), usageInfo, getOpt)
+import System.Exit (exitSuccess)
+import System.IO (stdout)
+
+import Control.Exception (catch, SomeException)
+import qualified Data.Vector as V
+import Test.HUnit as HU
+
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Bio.Phylogeny.PhyBin (driver, normalize, annotateWLabLists,
+ unitTests, acquireTreeFiles, deAnnotate,
+ retrieveHighlights, matchAnyHighlight)
+import Bio.Phylogeny.PhyBin.Parser (parseNewick, parseNewickFiles, unitTests)
+import Bio.Phylogeny.PhyBin.Visualize (viewNewickTree) -- dotNewickTree_debug
+import Bio.Phylogeny.PhyBin.RFDistance
+import Bio.Phylogeny.PhyBin.PreProcessor
+
+import qualified Data.Clustering.Hierarchical as C
+
+import Data.Version (showVersion)
+import Paths_phybin (version)
+
+----------------------------------------------------------------------------------------------------
+-- MAIN script: Read command line options and call the program.
+----------------------------------------------------------------------------------------------------
+
+-- Note: ORDER is important here, we process options in this order:
+data Flag
+ = Verbose
+ | Version
+ | Output String
+ | SetNumTaxa Int
+ | BranchThresh Double
+ | BootStrapThresh Int
+ | PruneTaxa [String]
+ | NullOpt
+ | Graph | Draw
+ | Force
+ | View
+ | TabDelimited Int Int
+
+ | Highlight FilePath
+ | ShowTreesInDendro | ShowInterior
+
+ | RFMode WhichRFMode
+ | SelfTest
+ | RFMatrix | LineSetDiffMode
+ | PrintNorms | PrintReg | PrintConsensus | PrintMatching
+ | Cluster C.Linkage
+ | BinningMode
+ | EditDistThresh Int
+ | DendogramOnly
+
+ | NameCutoff String
+ | NamePrefix Int
+ | NameTable String -- Must come after Cutoff/Prefix
+
+ deriving (Show, Eq, Ord) -- ORD is really used.
+
+parseTabDelim :: String -> Flag
+parseTabDelim _str =
+ TabDelimited 8 9
+
+options :: [OptDescr Flag]
+options =
+ [ Option ['v'] ["verbose"] (NoArg Verbose) "print WARNINGS and other information (recommended at first)"
+ , Option ['V'] ["version"] (NoArg Version) "show version number"
+
+ , Option ['o'] ["output"] (ReqArg Output "DIR") "set directory to contain all output files (default \"./phybin_out/\")"
+ , Option [] ["selftest"] (NoArg SelfTest) "run internal unit tests"
+
+{- -- TODO: FIXME: IMPLEMENT THIS:
+ , Option [] [] (NoArg NullOpt) ""
+ , Option ['t'] ["tabbed"] (ReqArg parseTabDelim "NUM1:NUM2")$ "assume the input is a tab-delimited file with gene names \n"++
+ "in column NUM1 and Newick trees in NUM2"
+-}
+ , Option [] [] (NoArg NullOpt) ""
+ , Option [] [] (NoArg$ error "internal problem") "----------------------------- Clustering Options ------------------------------"
+
+ , Option [] ["bin"] (NoArg BinningMode)$ "Use simple binning, the cheapest form of 'clustering'"
+ , Option [] ["single"] (NoArg$ Cluster C.SingleLinkage) $ "Use single-linkage clustering (nearest neighbor)"
+ , Option [] ["complete"] (NoArg$ Cluster C.CompleteLinkage)$ "Use complete-linkage clustering (furthest neighbor)"
+ , Option [] ["UPGMA"] (NoArg$ Cluster C.UPGMA) $ "Use Unweighted Pair Group Method (average linkage) - DEFAULT mode"
+
+ , Option [] ["editdist"] (ReqArg (EditDistThresh . read) "DIST")$
+ "Combine all clusters separated by DIST or less. Report a flat list of clusters.\n"++
+ "Irrespective of whether this is activated, a hierarchical clustering (dendogram.pdf) is produced."
+-- , Option [] ["dendogram"] (NoArg DendogramOnly)$ "Report a hierarchical clustering (default)"
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option [] [] (NoArg$ error "internal problem") "--- Select Robinson-Foulds (symmetric difference) distance algorithm: ---"
+ , Option [] ["hashrf"] (NoArg$ RFMode HashRF)
+ "(default) use a variant of the HashRF algorithm for the distance matrix"
+ , Option [] ["tolerant"] (NoArg$ RFMode TolerantNaive)
+ "use a slower, modified RF metric that tolerates missing taxa"
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option [] [] (NoArg$ error "internal problem") "----------------------------- Visualization --------------------------------"
+ , Option ['g'] ["graphbins"] (NoArg Graph) "use graphviz to produce .dot and .pdf output files"
+ -- TODO: Produce the consensus tree as well as the individual trees.
+ , Option ['d'] ["drawbins"] (NoArg Draw) "like -g, but open GUI windows to show each bin's tree"
+
+ , Option ['w'] ["view"] (NoArg View)$ "for convenience, \"view mode\" simply displays input Newick files without binning"
+
+ , Option [] ["showtrees"] (NoArg ShowTreesInDendro) "Print (textual) tree topology inside the nodes of the dendrogram"
+ , Option [] ["highlight"] (ReqArg Highlight "FILE") $
+       "Highlight nodes in the tree-of-trees (dendrogram) consistent with the\n"++
+ "given tree file. Multiple highlights are permitted and use different colors."
+-- "Highlights may be SUBTREES that use any subset of the taxa."
+ , Option [] ["interior"] (NoArg ShowInterior)
+     "Show the consensus trees for interior nodes in the dendrogram, rather than just points."
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option [] [] (NoArg$ error "internal problem") "---------------------------- Tree pre-processing -----------------------------"
+ , Option [] ["prune"] (ReqArg (PruneTaxa . parseTaxa) "TAXA")
+ ("Prune trees to only TAXA before doing anything else.\n"++
+ "Space and comma separated lists of taxa are allowed. Use quotes.")
+ , Option ['b'] ["minbranchlen"] (ReqArg (BranchThresh . read) "LEN") "collapse branches less than LEN"
+
+ , Option [] ["minbootstrap"] (ReqArg (BootStrapThresh . read) "INT")
+ "collapse branches with bootstrap values less than INT"
+ -- , Option ['n'] ["numtaxa"] (ReqArg (SetNumTaxa . read) "NUM")
+ -- "expect NUM taxa for this dataset, demand trees have all of them"
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option [] [] (NoArg$ error "internal problem") "--------------------------- Extracting taxa names ----------------------------"
+-- , Option ['n'] ["numtaxa"] (ReqArg (SetNumTaxa . read) "NUM") "expect NUM taxa for this dataset (otherwise it will guess)"
+ -- TODO, FIXME: The "guessing" part doesn't actually work yet -- implement it!!
+ -- What's a good algorithm? Insist they all have the same number? Take the mode?
+
+{- -- TODO: FIXME: IMPLEMENT THIS:
+ , Option ['f'] ["force"] (NoArg Force) "force phybin to consume and bin trees with different numbers of taxa"
+-}
+ , Option [] [] (NoArg NullOpt) ""
+ , Option ['p'] ["nameprefix"] (ReqArg (NamePrefix . read) "NUM") $
+ "Leaf names in the input Newick trees can be gene names, not taxa.\n"++
+ "Then it is typical to extract taxa names from genes. This option extracts\n"++
+ "a prefix of NUM characters to serve as the taxa name."
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option ['s'] ["namesep"] (ReqArg NameCutoff "STR") $ --"\n"++
+      "An alternative to --nameprefix, STR provides a set of delimiter characters,\n"++
+ "for example '-' or '0123456789'. The taxa name is then a variable-length\n"++
+ "prefix of each gene name up to but not including any character in STR."
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option ['m'] ["namemap"] (ReqArg NameTable "FILE") $
+ "Even once prefixes are extracted it may be necessary to use a lookup table\n"++
+ "to compute taxa names, e.g. if multiple genes/plasmids map onto one taxa.\n"++
+ "This option specifies a text file with find/replace entries of the form\n"++
+ "\"<string> <taxaname>\", which are applied AFTER -s and -p."
+
+ , Option [] [] (NoArg NullOpt) ""
+ , Option [] [] (NoArg$ error "internal problem") "--------------------------- Utility Modes ----------------------------"
+ , Option [] ["rfdist"] (NoArg RFMatrix) "print a Robinson Foulds distance matrix for the input trees"
+ , Option [] ["setdiff"] (NoArg LineSetDiffMode) "for convenience, print the set difference between cluster*.txt files"
+ , Option [] ["print"] (NoArg PrintReg) "simply print out a concise form of each input tree"
+ , Option [] ["printnorms"] (NoArg PrintNorms) "simply print out a concise and NORMALIZED form of each input tree"
+ , Option [] ["consensus"] (NoArg PrintConsensus) "print a strict consensus tree for the inputs, then exit"
+ , Option [] ["matching"] (NoArg PrintMatching) "print a list of tree names that match any --highlight argument"
+ ]
+
+
+-- | Parse the list of taxa provided on the command line.
+parseTaxa :: String -> [String]
+parseTaxa str = filter (not . null) $
+ splitOneOf ",; " str
+
+usage :: String
+usage = "\nUsage: phybin [OPTION...] files or directories...\n\n"++
+
+-- "MODE must be one of 'bin' or 'cluster'.\n\n" ++
+
+ "PhyBin takes Newick tree files as input. Paths of Newick files can\n"++
+ "be passed directly on the command line. Or, if directories are provided,\n"++
+ "all files in those directories will be read. Taxa are named based on the files\n"++
+ "containing them. If a file contains multiple trees, all are read by phybin, and\n"++
+ "the taxa name then includes a suffix indicating the position in the file:\n"++
+ " e.g. FILENAME_0, FILENAME_1, etc.\n"++
+ "\n"++
+
+ -- "In binning mode, Phybin output contains, at minimum, files of the form binXX_YY.tr,\n"++
+ -- "each containing a list of input file paths that fall into that bin. XX signifies\n"++
+ -- "the rank of the bin (biggest first), and YY is how many trees it contains.\n"++
+ -- "\n"++
+
+ "When clustering trees, Phybin computes a complete all-to-all Robinson-Foulds distance matrix.\n"++
+ "If a threshold distance (tree edit distance) is given, then a flat set of clusters\n"++
+  "will be produced in files clusterXX_YY.tr. Otherwise it produces a full dendrogram.\n"++
+ "\n"++
+
+ "Binning mode provides an especially quick-and-dirty form of clustering.\n"++
+ "When running with the --bin option, only exactly equal trees are put in the same cluster.\n"++
+ "Tree pre-processing still applies, however: for example collapsing short branches.\n"++
+ "\n"++
+
+ "USAGE NOTES: \n"++
+ " * Currently phybin ignores input trees with the wrong number of taxa.\n"++
+ " * If given a directory as input phybin will assume all contained files are Newick trees.\n\n"++
+
+ "\nOptions include:\n"
+
+defaultErr :: [String] -> t
+defaultErr errs = error $ "ERROR!\n" ++ (concat errs ++ usageInfo usage options)
+
+--------------------------------------------------------------------------------
+
+main :: IO ()
+main =
+ do argv <- getArgs
+
+ (opts,files) <-
+ case getOpt Permute options argv of
+ (o,n,[] ) -> return (o,n)
+ (_,_,errs) -> defaultErr errs
+
+ all_inputs <- acquireTreeFiles files
+ let process_opt cfg opt = case opt of
+ NullOpt -> return cfg
+ Verbose -> return cfg { verbose= True }
+ Version -> do putStrLn$ "phybin version "++ showVersion version; exitSuccess
+
+ SelfTest -> do _ <- runTestTT allUnitTests; exitSuccess
+
+ RFMatrix -> return cfg { print_rfmatrix= True }
+
+ ShowTreesInDendro -> return cfg { show_trees_in_dendro = True }
+ ShowInterior -> return cfg { show_interior_consensus = True }
+ Highlight path -> return cfg { highlights = path : highlights cfg }
+
+ LineSetDiffMode -> do
+ bss <- mapM B.readFile files
+ case map (S.fromList . B.lines) bss of
+ [set1,set2] -> do let [f1,f2] = files
+ let diff = S.difference set1 set2
+                          putStrLn$" Found "++show(S.size diff)++" lines occurring in "++f1++" but not "++f2
+ mapM_ B.putStrLn $ S.toList diff
+ oth -> error $"Line set difference mode expects two files as input, got "++show(length oth)
+ exitSuccess
+
+ PrintNorms -> return cfg
+ PrintReg -> return cfg
+ PrintConsensus -> return cfg
+ PrintMatching -> return cfg
+
+ Cluster lnk -> return cfg { clust_mode = ClusterThem lnk }
+ RFMode TolerantNaive
+ | Expected _ <- num_taxa cfg -> error "should not set --numtaxa AND --tolerant"
+ | otherwise -> return cfg { rfmode = TolerantNaive }
+ RFMode HashRF -> return cfg { rfmode = HashRF }
+ BinningMode -> return cfg { clust_mode = BinThem }
+ EditDistThresh n -> return cfg { dist_thresh = Just n }
+ DendogramOnly -> return cfg { dist_thresh = Nothing }
+
+ Output s -> return cfg { output_dir= s }
+
+ SetNumTaxa n
+ | rfmode cfg == TolerantNaive -> error "should not set --tolerant AND --numtaxa"
+ | otherwise -> return cfg { num_taxa= Expected n }
+ BranchThresh n -> return cfg { branch_collapse_thresh = Just n }
+ BootStrapThresh n -> return cfg { bootstrap_collapse_thresh = Just n }
+
+ PruneTaxa ls -> return cfg { preprune_labels = Just ls }
+
+ Graph -> return cfg { do_graph= True }
+ Draw -> return cfg { do_draw = True }
+ View -> return cfg -- Handled below
+
+ TabDelimited _ _ -> error "tabbed option not handled yet"
+ Force -> error "force option not handled yet"
+
+
+ NameCutoff str -> let chopper = takeWhile (not . flip S.member (S.fromList str))
+ in return cfg { name_hack = chopper . name_hack cfg }
+ NamePrefix n -> return cfg { name_hack = (take n) . name_hack cfg }
+
+ -- This should always be after cutoff/prefix:
+ NameTable file -> do reader <- name_table_reader file
+ return cfg { name_hack = reader . name_hack cfg }
+
+ config <- foldM process_opt default_phybin_config{ inputs= all_inputs }
+ (sort opts) -- NOTE: options processed in sorted order.
+
+ when (null files) $ do
+ defaultErr ["No file arguments!\n"]
+
+ --------------------------------------------------------------------
+ ----- Handle SPECIAL modes before going into normal processing -----
+ --------------------------------------------------------------------
+ -- This mode kicks in AFTER config options are processed.
+ when (elem PrintReg opts) $ do
+ (_,fts) <- parseNewickFiles (name_hack config) all_inputs
+ forM_ fts $ \ ft@(FullTree name _ _) -> do
+ putStrLn $ "Tree "++show name
+ putStrLn$ show$ displayDefaultTree ft
+ exitSuccess
+ ------------------------------------------------------------
+ when (elem PrintNorms opts) $ do
+ (_,fts) <- parseNewickFiles (name_hack config) all_inputs
+ forM_ fts $ \ ft@(FullTree name _ _) -> do
+ putStrLn $ "Tree , NORMALIZED:"++show name
+ putStrLn$ show$ displayDefaultTree$ deAnnotate $
+ liftFT (normalize . annotateWLabLists) ft
+ exitSuccess
+ ------------------------------------------------------------
+ when (elem PrintConsensus opts) $
+ case (num_taxa config) of
+ Expected numtax -> do
+ (_,fts) <- parseNewickFiles (name_hack config) all_inputs
+ putStrLn $ "Strict Consensus Tree of "++show (length fts)++" trees:"
+ when (null fts) $ error "No trees provided!"
+ let ctree = consensusTree numtax (map nwtree fts)
+ FullTree{labelTable} = head fts
+ print$ displayStrippedTree$ FullTree "" labelTable ctree
+ exitSuccess
+ _ -> error "--consensus: cannot compute consensus when number of taxa is unknown or variable"
+ ------------------------------------------------------------
+ when (elem PrintMatching opts) $ do
+ (labs,fts) <- parseNewickFiles (name_hack config) all_inputs
+ when (null$ highlights config) $ error "No --highlight given, so --matching makes no sense. Matching what?"
+ htrs <- retrieveHighlights (name_hack config) labs (highlights config)
+ let isMatch = matchAnyHighlight htrs
+ forM_ fts $ \ FullTree{treename,nwtree} ->
+ when (isMatch$ fmap (const()) nwtree) $
+ putStrLn treename
+ exitSuccess
+ ------------------------------------------------------------
+ when (View `elem` opts) $ do
+ view_graphs config
+ exitSuccess
+ ------------------------------------------------------------
+ -- Otherwise do the main, normal thing:
+ driver config
+
+-- | Completely optional visualization step. If this fails, we don't
+-- sweat it.
+view_graphs :: PhyBinConfig -> IO ()
+view_graphs PBC{..} = catch act hndl
+ where
+ hndl :: SomeException -> IO ()
+ hndl exn = putStrLn$ "Warning: Viewing graph failed, ignoring. Error was: "++show exn
+ act = do chans <- forM inputs $ \ file -> do
+ putStrLn$ "Drawing "++ file ++"...\n"
+ str <- B.readFile file
+ putStrLn$ "Parsed: " ++ (B.unpack str)
+ let (tbl,tr) = parseNewick M.empty name_hack file str
+ (chan, _tr) <- viewNewickTree file (FullTree "" tbl (annotateWLabLists (head tr)))
+ return chan
+ forM_ chans readChan
+ return ()
+
+
+--------------------------------------------------------------------------------
+-- Every dataset it seems needs a new hack on the names!
+
+name_table_reader :: String -> IO (String -> String)
+name_table_reader file =
+ do contents <- readFile file
+ let mp = M.fromList $
+ map (\ls -> case ls of
+ [a,b] -> (a,b)
+ _ -> error$ "Each line of "++file++" must contain two whitespace free strings: "++ unwords ls) $
+ filter (not . null) $
+ map tokenize $
+ lines contents
+ return$
+ \ name_to_hack ->
+
+ case M.lookup name_to_hack mp of -- Could use a trie
+ Just x -> x
+ Nothing -> name_to_hack
+ where
+ tokenize :: String -> [String]
+ tokenize line =
+ case words line of
+ [] -> []
+ [one] -> error$"Name table contained bad line: "++ show one
+ [one,two] -> [one,two]
+ (one:rest) -> [one, unwords rest]
+
+temp :: IO ()
+temp = driver default_phybin_config{ num_taxa= Expected 7, inputs=["../datasets/test.tr"] }
+
+
+--------------------------------------------------------------------------------
+-- Aggregated Unit Tests
+--------------------------------------------------------------------------------
+
+allUnitTests :: Test
+-- allUnitTests = unitTests ++
+allUnitTests = test
+ [ Bio.Phylogeny.PhyBin.unitTests
+ , Bio.Phylogeny.PhyBin.Parser.unitTests
+ , "norm/Bip1" ~: (testNorm prob1)
+ , "bipTreeConversion" ~: testBipConversion
+ , "t3" ~: t3_consensusTest,
+ "t4" ~: t4_consensusTest,
+ "t5" ~: t5_consensusTest
+ ]
+
+-- [2013.07.23]
+-- This was INCORRECTLY normalizing to:
+-- ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_))))
+prob1 :: String
+prob1 = "(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);"
+
+-- | Make sure that the normalized version of a tree yields the same bipartitions as
+-- the unnormalized one.
+testNorm :: String -> IO ()
+testNorm str = do
+ let (labs,parseds) = parseNewick M.empty id "test" (B.pack str)
+ parsed = head parseds
+ normed = normalize $ annotateWLabLists parsed
+ bips1 = allBips parsed
+ bips2 = allBips normed
+ added = S.difference bips2 bips1
+ removed = S.difference bips1 bips2
+ -- dispBips bip = show$
+ -- map (map (labs M.!)) $
+ -- map IS.toList$ S.toList bip
+ unless (bips1 == bips2) $ do
+ putStrLn$ "Normalized this: "++show (displayDefaultTree $ FullTree "" labs parsed)
+ putStrLn$ "To this : "++show (displayDefaultTree $ deAnnotate $ FullTree "" labs normed)
+ error$ "Normalization added and removed these bipartitions, respectively:\n "
+ ++concat (intersperse " " (map (dispBip labs) (S.toList added))) ++"\n "
+ ++concat (intersperse " " (map (dispBip labs) (S.toList removed)))
+
+
+-- 112 and 13
+rftest :: IO ()
+rftest = do
+ (_mp,[t1,t2]) <- parseNewickFiles (take 2) ["tests/13.tr", "tests/112.tr"]
+ putStrLn$ "Tree 13 : " ++ show (displayDefaultTree t1)
+ putStrLn$ "Tree 112 : "++ show (displayDefaultTree t2)
+
+ putStrLn$ "Tree 13 normed : "++ show (disp t1)
+ putStrLn$ "Tree 112 normed : "++ show (disp t2)
+
+ putStrLn$ "13 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t1)
+ putStrLn$ "112 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t2)
+
+ putStrLn$ "13 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t1)
+ putStrLn$ "112 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t2)
+
+ let (mat,_) = naiveDistMatrix [nwtree t1, nwtree t2]
+ printDistMat stdout mat
+ return ()
+ where
+ disp (FullTree nm labs tr) =
+ let collapsed :: AnnotatedTree
+ collapsed = normalize$ annotateWLabLists tr
+ in displayDefaultTree$ deAnnotate $ FullTree nm labs collapsed
+
+-- | This test was done with --editdist 4 --complete
+t3_consensusTest :: IO ()
+t3_consensusTest = consensusTest "./tests/t3_consensus/cluster1_284_alltrees.tr"
+ "./tests/t3_consensus/cluster1_284_consensus.tr"
+
+-- | This test was done with --editdist 0 --complete
+t4_consensusTest :: IO ()
+t4_consensusTest = consensusTest "./tests/t4_consensus/cluster1_16_alltrees.tr"
+ "./tests/t4_consensus/cluster1_16_consensus.tr"
+
+-- | This test was done with --editdist 1 --complete
+t5_consensusTest :: IO ()
+t5_consensusTest = consensusTest "./tests/t5_consensus/cluster1_35_alltrees.tr"
+ "./tests/t5_consensus/cluster1_35_consensus.tr"
+
+consensusTest :: String -> String -> IO ()
+consensusTest alltrees consensus = do
+ (_,ctree:ftrees) <- parseNewickFiles id [consensus,alltrees]
+ let num_taxa = numLeaves (nwtree ctree)
+ plainTrs = map nwtree ftrees
+ eachbips = map allBips plainTrs
+ totalBips = foldl' S.union S.empty eachbips
+      intersectBips = foldl' S.intersection (head eachbips) (tail eachbips)
+ FullTree _ labs _ = ctree
+ linesPrnt x = unlines (map ((" "++) . dispBip labs) $ S.toList x)
+ putStrLn$ "Bips in each: "++ show (map S.size eachbips)
+ putStrLn$ "Total bips in all: "++show (S.size totalBips)
+ putStrLn$ "Bips in common: "++ show (S.size intersectBips)
+ putStrLn$ "Bips of first member:\n" ++ linesPrnt (head eachbips)
+ putStrLn$ "Some bips in the union that are NOT in the intersection:\n" ++
+ linesPrnt (S.fromList$ take 20$ S.toList$ S.difference totalBips intersectBips)
+ let cbips = allBips $ nwtree ctree
+ putStrLn$ "ConsensusBips ("++show (S.size cbips)++"):\n"++linesPrnt cbips
+ putStrLn$"Things in the consensus that should NOT be:\n"++linesPrnt (S.difference cbips intersectBips)
+ putStrLn$"Things not in the consensus that SHOULD be:\n"++linesPrnt (S.difference intersectBips cbips)
+
+ putStrLn$ "Now recomputing consensus tree for "++show num_taxa++" taxa"
+ let ctree2 = consensusTree num_taxa plainTrs
+ cbips2 = allBips ctree2
+ putStrLn$ "Freshly recomputed consensusBips ("++show (S.size cbips2)++"):\n"++linesPrnt cbips2
+ HU.assertEqual "Consensus tree on disk should match computed one:"
+ cbips cbips2 -- (allBips$ fmap (const ()) $ nwtree ctree)
+
+ putStrLn " Partial distance matrix WITHIN this cluster:"
+ let (mat,_) = naiveDistMatrix (map nwtree ftrees)
+ printDistMat stdout (V.take 30 mat)
+  HU.assertBool "Consensus should only include bips in the members" (S.isSubsetOf cbips totalBips)
+ HU.assertEqual "Consensus tree matches intersected bips" cbips intersectBips
+ return ()
+
+testBipConversion :: IO ()
+testBipConversion =
+ do (_,trs) <- allTestTrees
+ mapM_ testOne trs
+ putStrLn "All trees passed bip conversion."
+ where
+ testOne (FullTree{nwtree}) = do
+ let sz = numLeaves nwtree
+ bips = allBips nwtree
+ tr2 = bipsToTree sz bips
+ bips2 = allBips tr2
+ assertEqual "round trip bips->tree->bips" bips bips2
+
+-- | Read in all test trees which we happen to have put in the repository for testing
+-- purposes.
+allTestTrees :: IO (LabelTable, [FullTree DefDecor])
+allTestTrees =
+ parseNewickFiles id $
+ [ "./tests/t3_consensus/cluster1_284_alltrees.tr"
+ , "./tests/t3_consensus/cluster1_284_consensus.tr"
+ , "./tests/t4_consensus/cluster1_16_alltrees.tr"
+ , "./tests/t4_consensus/cluster1_16_consensus.tr"
+ , "./tests/t5_consensus/cluster1_35_alltrees.tr"
+ , "./tests/t5_consensus/cluster1_35_consensus.tr"
+ ]
+
+
+----------------------------------------------------------------------------------------------------
+-- TODO: expose a command line argument for testing.
+-- The below test exposed my normalization bug relating to "deriving Ord".
+-- I need to transform it into one or more proper unit tests.
+
+main_test :: IO ()
+main_test =
+ withArgs ["-w","~/newton_and_newton_local/datasets/yersinia/yersinia_trees/111.dnd","-m","../datasets/yersinia/name_table_hack_yersinia.txt"]
+ main
+
+-- [2013.07.22] Disabling for the new Label representation:
+{-
+pa :: NewickTree DefDecor
+pa = set_dec (Nothing,1) $
+ NTInterior () [NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]],NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"]]
+
+pb :: NewickTree DefDecor
+pb = set_dec (Nothing,1) $
+ NTInterior () [NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"],NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]]]
+
+ls1 :: [(String, NewickTree DefDecor)]
+ls1 = [("a",pa),("b",pb)]
+
+-- This is one:
+num_binned :: Int
+num_binned = M.size $ binthem ls1
+
+a_ :: (String, NewickTree DefDecor)
+a_ = ("980.dnd",
+ fmap (\x -> (Nothing,x)) $
+ NTInterior 0.0 [NTInterior 5.697e-2 [NTLeaf 3.95e-2 "SM",NTLeaf 5.977e-2 "SD"],NTLeaf 0.13143 "RE",NTInterior 0.13899 [NTInterior 9.019e-2 [NTLeaf 0.11856 "BB",NTLeaf 0.13592 "BJ"],NTInterior 0.13194 [NTLeaf 0.19456 "MB",NTLeaf 0.16603 "ML"]]])
+
+b_ :: (String, NewickTree DefDecor)
+b_ = ("999.dnd",
+ fmap (\x -> (Nothing,x)) $
+ NTInterior 0.0 [NTInterior 6.527e-2 [NTInterior 0.13734 [NTLeaf 2.975e-2 "SM",NTLeaf 3.002e-2 "SD"],NTLeaf 0.18443 "RE"],NTInterior 6.621e-2 [NTLeaf 0.16184 "MB",NTLeaf 0.15233 "ML"],NTInterior 0.23143 [NTLeaf 9.192e-2 "BB",NTLeaf 0.10125 "BJ"]])
+
+-- But THIS is two: ack!
+num2 :: Int
+num2 = M.size $ binthem [a_,b_]
+
+-- Here's the test that breaks things:
+a_norm :: NewickTree StandardDecor
+a_norm = normalize (annotateWLabLists$ snd a_)
+
+b_norm_ :: NewickTree (Double, Int, [Label])
+b_norm_ = NTInterior (0.0,7,["BB","BJ","MB","ML","RE","SD","SM"])
+ [NTInterior (0.23143,2,["BB","BJ"]) [NTLeaf (9.192e-2,1,["BB"]) "BB",NTLeaf (0.10125,1,["BJ"]) "BJ"],NTInterior (6.621e-2,2,["MB","ML"]) [NTLeaf (0.16184,1,["MB"]) "MB",NTLeaf (0.15233,1,["ML"]) "ML"],NTInterior (6.527e-2,3,["RE","SD","SM"]) [NTLeaf (0.18443,1,["RE"]) "RE",NTInterior (0.13734,2,["SD","SM"]) [NTLeaf (3.002e-2,1,["SD"]) "SD",NTLeaf (2.975e-2,1,["SM"]) "SM"]]]
+
+b_norm :: NewickTree StandardDecor
+b_norm = fmap (\ (bl,w,ls) -> StandardDecor bl Nothing w ls) b_norm_
+
+d1 :: IO (Chan (), NewickTree StandardDecor)
+d1 = viewNewickTree "" a_norm
+
+d2 :: IO (Chan (), NewickTree StandardDecor)
+d2 = viewNewickTree "" b_norm
+
+d1_ :: IO ThreadId
+d1_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" a_norm) Xlib; return ()
+
+d2_ :: IO ThreadId
+d2_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" b_norm) Xlib; return ()
+
+
+-- | A of a tree with _____ weights attached to it:
+withBootstrap :: String
+withBootstrap = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819)74:0.00279881991324161267,(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899)15:0.00000122123005994819)4:0.00000122123005994819)56:0.00276851661606284868)60:0.00283144414216590342)76:0.00886304965525876697,(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939 [...]
+
+withBootstrap2 :: String
+withBootstrap2 = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819):0.00279881991324161267[74],(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899):0.00000122123005994819[15]):0.00000122123005994819[4]):0.00276851661606284868[56]):0.00283144414216590342[60]):0.00886304965525876697[76],(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.0264839 [...]
+-}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7f55d5a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,121 @@
+
+
+
+default: haskell
+
+all: find_tree_matches.exe haskell
+
+# GHC=ghc-7.4.2
+GHC=ghc-7.6.3
+
+release: haskell
+ upx phybin.exe
+ du -h phybin.exe
+
+haskell:
+ cabal configure
+ cabal build
+ cp ./dist/build/phybin/phybin phybin.exe
+ du -h phybin.exe
+ strip phybin.exe
+ du -h phybin.exe
+
+win:
+ $(GHC) -DWIN32 --make Main.hs -o phybin.exe
+
+find_tree_matches.exe: find_tree_matches.ss
+ mzc --exe find_tree_matches.exe find_tree_matches.ss
+
+clean:
+ rm -f phybin *.exe *.hi *.o Bio/Phylogeny/PhyBin/*.o Bio/Phylogeny/PhyBin/*.hi
+
+################################################################################
+
+#launch: clean release doc distro push upload
+launch: doc distro push upload
+
+doc:
+ pandoc -s -S --toc -c website.css README.md -o website/index.html
+
+release_website:
+ cp website/* ~/.hyplan/projects/phybin/
+
+distro:
+ cabal sdist
+ cp -v dist/*.tar.gz website/
+
+# HASKELLSRV=haskell.intel
+HASKELLSRV=community.haskell.org
+
+
+################################################################################
+# Collecting shortcuts for running different datasets:
+
+
+# LOCALSETS=~/newton_and_newton_local/datasets
+LOCALSETS=.
+
+rhizobia:
+ rm -rf $(LOCALSETS)/rhizobia/phybin_outputs
+ mkdir $(LOCALSETS)/rhizobia/phybin_outputs
+ ./phybin.exe $(LOCALSETS)/rhizobia/gde/*.dnd -o $(LOCALSETS)/rhizobia/phybin_output/ -p 2 -n 7 -g -v
+
+#yersinia_test_names:
+# ./phybin.exe -w ~/newton_and_newton_local/datasets/yersinia/yersinia_trees/111.dnd -s . -m ../datasets/yersinia/name_table_hack_yersinia.txt
+
+yersinia:
+ rm -rf $(LOCALSETS)/yersinia/phybin_outputs
+ mkdir $(LOCALSETS)/yersinia/phybin_outputs
+ ./phybin.exe $(LOCALSETS)/yersinia/yersinia_trees/ -o $(LOCALSETS)/yersinia/phybin_outputs/ -m ../datasets/yersinia/name_table_hack_yersinia.txt -s . -n 9 -g
+
+legionella:
+ rm -rf $(LOCALSETS)/legionella/phybin_outputs
+ mkdir $(LOCALSETS)/legionella/phybin_outputs
+ ./phybin.exe $(LOCALSETS)/legionella/legionella_orthologs_aa/ -o $(LOCALSETS)/legionella/phybin_outputs/ -m ../datasets/legionella/name_table_hack_legionella.txt -s 0123456789 -n 4 -g
+
+
+
+# Newer ones [2012.11.19]:
+newbatch: rickettsia rickettsiales wolbachia
+
+# BRANCHTHRESH=0.01
+BRANCHTHRESH=0.05
+PHYBIN= ./phybin.exe -b $(BRANCHTHRESH) -g
+
+#------------------------------------------------------------
+rickettsia: $(LOCALSETS)/Rickettsia/renaming_table.txt
+ rm -rf $(LOCALSETS)/Rickettsia/phybin_outputs
+ mkdir $(LOCALSETS)/Rickettsia/phybin_outputs
+ $(PHYBIN) -n 15 -m $(LOCALSETS)/Rickettsia/renaming_table.txt -s '_' -o $(LOCALSETS)/Rickettsia/phybin_output/ $(LOCALSETS)/Rickettsia/final_trees/*BranchLab*.out
+
+$(LOCALSETS)/Rickettsia/renaming_table.txt: $(LOCALSETS)/Rickettsia/Rickettsia_orthololgs.txt
+ runghc stripTable.hs $^ > $@
+
+#------------------------------------------------------------
+rickettsiales: $(LOCALSETS)/Rickettsiales/renaming_table.txt
+ rm -rf $(LOCALSETS)/Rickettsiales/phybin_outputs
+ mkdir $(LOCALSETS)/Rickettsiales/phybin_outputs
+ $(PHYBIN) -n 29 -m $(LOCALSETS)/Rickettsiales/renaming_table.txt -s '_' -o $(LOCALSETS)/Rickettsiales/phybin_output/ $(LOCALSETS)/Rickettsiales/final_trees/*BranchLab*.out
+
+$(LOCALSETS)/Rickettsiales/renaming_table.txt: $(LOCALSETS)/Rickettsiales/Rickettsiales_orthologs.txt
+ runghc stripTable.hs $^ > $@
+
+#------------------------------------------------------------
+wolbachia: $(LOCALSETS)/Wolbachia/renaming_table.txt
+ rm -rf $(LOCALSETS)/Wolbachia/phybin_outputs
+ mkdir $(LOCALSETS)/Wolbachia/phybin_outputs
+ $(PHYBIN) -n 4 -m $(LOCALSETS)/Wolbachia/renaming_table.txt -s '_' -o $(LOCALSETS)/Wolbachia/phybin_output/ $(LOCALSETS)/Wolbachia/final_trees/*BranchLab*.out
+
+$(LOCALSETS)/Wolbachia/renaming_table.txt: $(LOCALSETS)/Wolbachia/Wolbachia_orthologs.txt
+ runghc stripTable.hs $^ > $@
+
+
+
+vary_branchlen: $(LOCALSETS)/Rickettsia/renaming_table.txt $(LOCALSETS)/Rickettsiales/renaming_table.txt $(LOCALSETS)/Wolbachia/renaming_table.txt
+ ./vary_branchlen.sh 15 Rickettsia
+ ./vary_branchlen.sh 29 Rickettsiales
+ ./vary_branchlen.sh 4 Wolbachia
+
+temp:
+ ./phybin.exe ~/newton_and_newton_local/datasets/rhizobia/gde/*.dnd -o temp -p 2 -n 7
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..35a38e4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,166 @@
+% PhyBin (0.3): Binning/Clustering Newick Trees by Topology
+
+
+PhyBin is a simple command-line tool that classifies a set of
+ [Newick tree files](http://en.wikipedia.org/wiki/Newick_format)
+by their topology. Its purpose is to take a large set of tree
+files and let you browse through the most common tree topologies.
+
+![(Above figure) Trees corresponding to the three largest bins resulting from a
+ phybin run. Each bin is written to a file `binXX_YYY`, where `XX` is the rank of the bin and
+ `YYY` is the number of trees having that topology.](trees.jpg)
+
+Change Log
+==========
+
+In version 0.2, PhyBin was extended to do clustering as well as binning:
+
+ * Compute full all-to-all Robinson-Foulds distance matrices (quickly)
+ * Hierarchical clustering of all trees into a tree-of-trees dendrogram based on
+ Robinson Foulds symmetric (tree edit) distance.
+
+In version 0.3, PhyBin gained a number of new features:
+
+ * A `--tolerant` mode for computing RF distance matrices even for trees missing taxa.
+ * A `--prune` option for "zooming in" on a specific set of taxa.
+ * The `--minbootstrap` option was added (see the example invocation below).
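+
+For instance (a sketch only; the taxa names and directory paths below are
+placeholders), the new options can be combined in a single run:
+
+    phybin --tolerant --minbootstrap 50 --prune "taxonA taxonB taxonC" -o pruned_out/ trees_dir/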
+
+
+Invoking PhyBin
+===============
+
+PhyBin is a command-line program that produces output in the form of
+text files and pdfs, but to produce pdfs (to visualize trees) the
+ [GraphViz program](http://www.graphviz.org/),
+including the `dot` command, must be installed on the machine.
+
+The following is a simple invocation of PhyBin:
+
+ phybin --bin *.tree -o output_dir/
+
+The input trees can be specified directly on the command-line, or, if the
+name of a directory is provided instead, all contained files are
+assumed to be trees in Newick format.
+
+PhyBin, at minimum, produces files of the form
+`output_dir/clusterXX_YY.tr`, one for each bin. If
+requested, it will also produce visual representations of each bin in
+the form `output_dir/clusterXX_YY.pdf`.
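+
+For hierarchical clustering rather than plain binning, pick a linkage and,
+optionally, an edit-distance threshold at which to cut the dendrogram into a
+flat set of clusters. For example (the threshold and directory names here are
+placeholders):
+
+    phybin --complete --editdist 3 -o output_dir/ directory_of_newick_trees/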
+
+Downloading and Installing PhyBin
+=================================
+
+The source code to PhyBin can be downloaded here:
+
+ * [Download Source from Hackage](http://hackage.haskell.org/package/phybin)
+
+PhyBin is written in Haskell, and if you have the
+ [Haskell Platform](http://hackage.haskell.org/platform/)
+installed you can install phybin with this one-liner:
+
+ cabal install phybin
+
+Otherwise PhyBin is also available for download as a statically-linked
+executable for Mac-OS, Linux, and Windows:
+
+ * [Download Mac-OS Binary](phybin-0.2.11.mac)
+ * [Download Linux Binary](phybin-0.2.11.x86_64_linux)
+ * [Download Windows Binary](phybin-0.2.11_windows.exe)
+
+
+
+Command-line Options
+====================
+
+In addition to input files and directories, `phybin` supports a number
+of command-line options. Run "phybin --help" to see these options.
+Here is a snapshot of the current help output (version 0.2.11):
+
+ Usage: phybin [OPTION...] files or directories...
+
+ PhyBin takes Newick tree files as input. Paths of Newick files can
+ be passed directly on the command line. Or, if directories are provided,
+ all files in those directories will be read. Taxa are named based on the files
+ containing them. If a file contains multiple trees, all are read by phybin, and
+ the taxa name then includes a suffix indicating the position in the file:
+ e.g. FILENAME_0, FILENAME_1, etc.
+
+ When clustering trees, Phybin computes a complete all-to-all Robinson-Foulds distance matrix.
+ If a threshold distance (tree edit distance) is given, then a flat set of clusters
+ will be produced in files clusterXX_YY.tr. Otherwise it produces a full dendogram (UNFINISHED).
+
+ Binning mode provides an especially quick-and-dirty form of clustering.
+ When running with the --bin option, only exactly equal trees are put in the same cluster.
+ Tree pre-processing still applies, however: for example collapsing short branches.
+
+ USAGE NOTES:
+ * Currently phybin ignores input trees with the wrong number of taxa.
+ * If given a directory as input phybin will assume all contained files are Newick trees.
+
+
+ Options include:
+
+ -v --verbose print WARNINGS and other information (recommended at first)
+ -V --version show version number
+ -o DIR --output=DIR set directory to contain all output files (default "./phybin_out/")
+ --selftest run internal unit tests
+
+ ----------------------------- Clustering Options ------------------------------
+ --bin Use simple binning, the cheapest form of 'clustering'
+ --single Use single-linkage clustering (nearest neighbor)
+ --complete Use complete-linkage clustering (furthest neighbor)
+ --UPGMA Use Unweighted Pair Group Method (average linkage) - DEFAULT mode
+ --editdist=DIST Combine all clusters separated by DIST or less. Report a flat list of clusters.
+ Irrespective of whether this is activated, a hierarchical clustering (dendogram.pdf) is produced.
+ Select Robinson-Foulds (symmetric difference) distance algorithm:
+ --simple use direct all-to-all comparison
+ --hashrf (default) use a variant of the HashRF algorithm for the distance matrix
+
+ ----------------------------- Visualization --------------------------------
+ -g --graphbins use graphviz to produce .dot and .pdf output files
+ -d --drawbins like -g, but open GUI windows to show each bin's tree
+ -w --view for convenience, "view mode" simply displays input Newick files without binning
+ --showtrees Print (textual) tree topology inside the nodes of the dendrogram
+ --highlight=FILE Highlight nodes in the tree-of-trees (dendrogram) consistent with the.
+ given tree file. Multiple highlights are permitted and use different colors.
+ --interior Show the consensus trees for interior nodes in the dendogram, rather than just points.
+
+ ---------------------------- Tree pre-processing -----------------------------
+ -n NUM --numtaxa=NUM expect NUM taxa for this dataset
+ -b LEN --branchcut=LEN collapse branches less than LEN
+
+ --------------------------- Extracting taxa names ----------------------------
+
+ -p NUM --nameprefix=NUM Leaf names in the input Newick trees can be gene names, not taxa.
+ Then it is typical to extract taxa names from genes. This option extracts
+ a prefix of NUM characters to serve as the taxa name.
+
+ -s STR --namesep=STR An alternative to --nameprefix, STR provides a set of delimeter characters,
+ for example '-' or '0123456789'. The taxa name is then a variable-length
+ prefix of each gene name up to but not including any character in STR.
+
+ -m FILE --namemap=FILE Even once prefixes are extracted it may be necessary to use a lookup table
+ to compute taxa names, e.g. if multiple genes/plasmids map onto one taxa.
+ This option specifies a text file with find/replace entries of the form
+ "<string> <taxaname>", which are applied AFTER -s and -p.
+
+ --------------------------- Utility Modes ----------------------------
+ --rfdist print a Robinson Foulds distance matrix for the input trees
+ --setdiff for convenience, print the set difference between cluster*.txt files
+ --print simply print out a concise form of each input tree
+ --printnorms simply print out a concise and NORMALIZED form of each input tree
+ --consensus print a strict consensus tree for the inputs, then exit
+ --matching print a list of tree names that match any --highlight argument
+
+
+- - - - - - - - - - - - - - -
+Authors: Irene and Ryan Newton
+
+Contact email: `irnewton` and `rrnewton` at `indiana` `edu` (with "at" and "dot" inserted).
+
+[Irene's](http://www.bio.indiana.edu/faculty/directory/profile.php?person=irnewton) and
+[Ryan's](http://www.cs.indiana.edu/~rrnewton/homepage.html) homepages.
+
+
+
diff --git a/Setup.hs b/Setup.hs
new file mode 100644
index 0000000..a3e2dc0
--- /dev/null
+++ b/Setup.hs
@@ -0,0 +1,12 @@
+#!/usr/bin/env runhaskell
+
+import Distribution.Simple
+import Distribution.Simple.PreProcess
+import Distribution.PackageDescription
+import Distribution.Simple.LocalBuildInfo
+import System.Cmd(system)
+import System.Exit
+
+main :: IO ()
+main = do putStrLn$ "Running Setup.hs ..."
+ defaultMainWithHooks simpleUserHooks
diff --git a/TestAll.hs b/TestAll.hs
new file mode 100644
index 0000000..9a53665
--- /dev/null
+++ b/TestAll.hs
@@ -0,0 +1,41 @@
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE ScopedTypeVariables, BangPatterns, ParallelListComp, OverloadedStrings #-}
+module Main where
+
+import Data.Vector as V
+import Data.Vector.Unboxed as U
+import Data.ByteString.Char8 as B
+import Bio.Phylogeny.PhyBin
+import Bio.Phylogeny.PhyBin.CoreTypes
+import Bio.Phylogeny.PhyBin.RFDistance
+import Bio.Phylogeny.PhyBin.Parser
+import System.IO
+import Prelude as P
+
+import qualified Test.HUnit as HU
+-- import Test.Framework
+import Test.Framework.Providers.HUnit
+import Test.Framework.TH (defaultMainGenerator)
+
+-- Here we compare a tree against the same tree with 18 removed:
+-- Or the same tree with one intermediate node removed.
+(_,t30) = parseNewicks id $
+ [ ("t30_0", "(2_, (((14, 3_), (19, (5_, 13))), (18, (6_, 7_))), 1_);")
+ , ("t30_1", "(2_, (((14, 3_), (19, (5_, 13))), (6_, 7_)), 1_);")
+ , ("t30_2", "(2_, (((14, 3_), (19, (5_, 13))), (18, 6_, 7_)), 1_);")
+ ]
+
+t30' = naiveDistMatrix$ P.map nwtree t30
+ -- "[[],[1]]"
+
+case_t30 = HU.assertEqual "3-tree distance matrix"
+ "[[],[0],[1,0]]" (showMat t30')
+
+-- m = printDistMat stdout (fst d)
+
+
+-- Simple show for a distance matrix:
+showMat (m,_) = show$ V.toList$ V.map U.toList m
+
+main = $(defaultMainGenerator)
+
diff --git a/old_script/cheztrace.ss b/old_script/cheztrace.ss
new file mode 100644
index 0000000..b11b7b4
--- /dev/null
+++ b/old_script/cheztrace.ss
@@ -0,0 +1,62 @@
+;; [2005.11.13] Having a go at implementing these in PLT:
+;; DOESN'T WORK YET.
+
+(module cheztrace mzscheme
+; (require (lib "errortrace.ss" "errortrace")
+; (lib "stacktrace.ss" "errortrace"))
+ (require (lib "trace.ss"))
+
+ (provide trace-lambda trace-define trace-let inspect break)
+
+ ;; Just stubs.
+ (define (inspect x) (void))
+ (define (break x) x)
+
+
+ #;
+ (define-syntax trace-lambda
+ (lambda (x)
+ (syntax-case x ()
+ [(_ params bods ...)
+ #`(lambda params
+ #,(annotate #'(begin bods ...) #f))])))
+
+ (define-syntax trace-lambda
+ (lambda (x)
+ (syntax-case x ()
+ [(_ name (params ...) bods ...)
+ (identifier? #'name)
+ #'(let ((name (lambda (params ...) bods ...)))
+ (trace name)
+ name)])))
+#|
+ (define-syntax trace-define
+ (lambda (x)
+ (syntax-case x ()
+ [(_ (id v ...) e) #'(begin (define (id v ...) e) (trace id))]
+ [(_ x e) (identifier? #'x) #'(begin (define x e) (trace x))])))
+
+ (define-syntax trace-let
+ (syntax-rules ()
+ [(_ n ([l* r*] ...) bods ...)
+ (letrec ((n (lambda (l* ...) bods ...)))
+ (trace n)
+ (n r* ...))]))
+|#
+
+;; [2006.03.23] For now killing these ^^^, not sure how much they worked.
+
+;; These simply don't do anything under PLT for the moment.
+(define-syntax trace-define (syntax-rules () [(_ e ...) (define e ...)]))
+(define-syntax trace-let (syntax-rules () [(_ e ...) (let e ...)]))
+
+
+)
+
+;(require cheztrace)
+;(define f (trace-lambda (x) x (if (odd? x) (error 'foo "") (add1 x))))
+;(f 40)
+;(f 39)
+
+;(trace-let loop ((n 10))
+; (if (> n 0) (loop (sub1 n))))
diff --git a/old_script/find_tree_matches.ss b/old_script/find_tree_matches.ss
new file mode 100644
index 0000000..969f2e9
--- /dev/null
+++ b/old_script/find_tree_matches.ss
@@ -0,0 +1,420 @@
+#! /bin/bash
+#|
+exec mzscheme -qu "$0" ${1+"$@"}
+|#
+;;====================================================================================================
+;; This is a script to parse .dnd tree files.
+
+;; NOTE: This was the "Version 1" script and has been replaced by the haskell version, phybin.hs
+
+;;====================================================================================================
+
+;; [2009.12.06] Hacking it to work with wolbachia_A_group/input
+
+(module find_tree_matches mzscheme
+ ;; Import the parser and lexer generators.
+ (require (lib "yacc.ss" "parser-tools")
+ (lib "lex.ss" "parser-tools")
+ (prefix : (lib "lex-sre.ss" "parser-tools"))
+ (lib "list.ss")
+ ;(lib "iu-match.ss")
+ "iu-match.ss"
+ )
+; (provide (all-defined))
+
+(require (lib "pretty.ss")
+ (lib "process.ss")
+ (prefix srfi: (lib "1.ss" "srfi")))
+
+;;====================================================================================================
+;; First define the PARSER
+;;====================================================================================================
+
+(define-empty-tokens op-tokens
+ (LeftParen RightParen Comma SemiColon Colon
+ EOF ))
+
+(define-tokens value-tokens (NAME NUM))
+
+(define-lex-abbrevs (lower-letter (:/ #\a #\z))
+ (upper-letter (:/ #\A #\Z))
+ (alpha (:or lower-letter upper-letter))
+ (digit (:/ #\0 #\9))
+ (digits (:+ digit))
+ (letter (:or alpha digit "." "'" "-" "/"))
+
+ ;; This requires that names have an underscore in them:
+ ;; That's what distinguishes them from numbers:
+ ;(name (:seq (:+ letter) (:+ (:seq "_" (:* letter)))))
+
+ ;; [2010.08.13] I can't remember why I did that... were there gene names that started with numbers??
+ (name (:seq alpha (:+ (:or letter "_"))))
+
+ ;; [2009.12.06] Odd, getting negative branch lengths!!
+ ;; [2009.12.07] Ran into some trees with 0 distance and no decimal point:
+ (number (:seq (:? "-") digits (:? (:seq "." digits (:? (:seq "E-" digits))))))
+ ;(number (:seq digit digits))
+ )
+
+(define lex
+ (lexer-src-pos
+
+ [(eof) 'EOF]
+
+ ;; Ignore all whitespace:
+ [(:or #\tab #\space #\newline #\return) (return-without-pos (lex input-port))]
+
+ ["(" 'LeftParen]
+ [")" 'RightParen]
+ ["," 'Comma]
+ [";" 'SemiColon]
+ [":" 'Colon]
+
+ ;; Like most of these scripts this is a hack. It assumes all node ID's in the tree start with a particular string.
+
+ ;[(:seq "c" name) (token-NAME lexeme)]
+ [name (token-NAME lexeme)]
+
+ [number (token-NUM (string->number lexeme))]
+ ; [number (token-NUM lexeme)]
+
+ ))
+
+(define (format-pos pos)
+ (if (position-line pos)
+ (format "line ~a:~a" (position-line pos) (position-col pos))
+ (format "char ~a" (position-offset pos))))
+
+(define parse
+ (parser
+ (src-pos)
+ (start wholefile)
+ (end EOF)
+
+ (tokens value-tokens op-tokens)
+ (error (lambda (a b c start end)
+ (fprintf (current-error-port)
+ "PARSE ERROR: after ~a token, ~a~a.\n"
+ (if a "valid" "invalid") b
+ (if c (format " carrying value ~s" c) ""))
+ (fprintf (current-error-port)
+ " Located between ~a and ~a.\n"
+ (format-pos start) (format-pos end))))
+ ;; Precedence:
+ ;(precs )
+ ;(debug "_parser.log")
+
+; Tree: The full input Newick Format for a single tree
+; Subtree: an internal node (and its descendants) or a leaf node
+; Leaf: a leaf node
+; Internal: an internal node (and its descendants)
+; BranchSet: a set of one or more Branches
+; Branch: a tree edge and its descendant subtree.
+; Name: the name of a node
+; Length: the length of a tree edge.
+; The grammar rules
+
+(grammar
+
+ (wholefile [(Subtree SemiColon) $1]
+ [(Branch SemiColon) $1])
+ (Subtree [(Leaf) $1]
+ [(Internal) $1])
+ (Leaf [(Name) $1])
+ (Internal [(LeftParen BranchSet RightParen Name)
+ (if $4 (cons $4 $2) $2)])
+
+ (BranchSet [(Branch) (list $1)]
+ ;[(BranchSet Comma Branch) (cons $1 $3)]
+ [(Branch Comma BranchSet) (cons $1 $3)]
+ )
+ (Branch [(Subtree Length) (list $2 $1)])
+
+ (Name [() #f]
+ [(NAME) $1])
+ (Length [() 0]
+ [(Colon NUM) $2])
+)
+
+#;
+ (grammar
+
+ (wholefile [(tree SemiColon) $1]
+ ;; ACK making semicolon and final tag optional!!
+ [(tree) $1]
+ [(LeftParen nodes+ RightParen) (cons 0.0 $2)]
+ [(LeftParen nodes+ RightParen SemiColon) (cons 0.0 $2)]
+ )
+
+ (numtag [(Colon NUM) $2]
+ ;; [2010.08.13] Making them optional everywhere:
+ [() 0.0]
+ )
+
+
+ (tree [(NAME numtag) (list $2 $1)]
+ [(LeftParen nodes+ RightParen numtag) (cons $4 $2)])
+
+ (nodes+ [(tree) (list $1)]
+ [(tree Comma nodes+) (cons $1 $3)])
+)
+
+))
+
+;; ================================================================================
+
+(define (parse-file f)
+ (let ([p (open-input-file f)])
+ (let ([res (parse-port p)])
+ ;(printf "PARSED: \n")(pretty-print res)
+ (close-input-port p)
+ res)))
+
+(define (parse-port ip)
+ (port-count-lines! ip)
+ (parse (lambda ()
+ (let ((lexed (lex ip)))
+ ;(printf "LEXED: ~s\n" (position-token-token lexed))
+ (flatten lexed))))
+ )
+
+(define (flatten pt)
+ (let loop ((x pt))
+ (if (position-token? (position-token-token x))
+ (begin (error 'flatten "No nested position-tokens!")
+ (loop (position-token-token x)))
+ x)))
+
+(define allargs (vector->list (current-command-line-arguments)))
+(define pretty? #t)
+ ;; End main
+
+(print-graph #t)
+(print-vector-length #f)
+
+
+;;====================================================================================================
+;; Now normalize the trees.
+;;====================================================================================================
+
+;; The basic idea here is to sort branches by some criteria.
+;; If that criteria turns out to be insufficient... then we can add extra criteria.
+(define (normalize-tree tr)
+ ;; This returns a branch-sorted tree
+ (define (loop tr )
+ (match tr
+ [(,dist ,name) (guard (string? name))
+ (values (list dist name) 1 1)]
+ [(,dist ,x)
+ (error 'normalize-tree "was not expecting interior nodes with one sub-branch... ~s" (list dist x))]
+ [(,dist ,[nodes sizes depths] ...)
+ ;; This creates an ugly data type (list overuse) with both the branches and their size/depth measurement:
+ (define sorted
+ (sort
+ (map list nodes sizes depths)
+ (lambda (ls1 ls2)
+ ;(printf "Comparing ~s and ~s\n" (cdr ls1) (cdr ls2))
+ (or (< (cadr ls1) (cadr ls2))
+ (and (= (cadr ls1) (cadr ls2))
+ (< (caddr ls1) (caddr ls2)))))
+ ))
+ ;(printf "\n ~a Node sizes ~s and depths ~s\n" dist sizes depths)
+ ;; If there are any non-leaf "plateaus" in the ramp then we can't deal with that yet:
+ (let loop ((last (car sorted))
+ (rest (cdr sorted)))
+ (cond
+ [(null? rest) (void)]
+ ;; If they're equal according to the size/depth metric:
+ [(and (equal? (cdr last) (cdar rest))
+ ;; And non-leaf:
+ (not (equal? (cdr last) '(1 1))))
+ ;; Ok, given that we have a match, that's still ok if they are the SAME tree.
+ ;; The only problem is if they're different in a way that our metric doesn't capture.
+ (if (tree-shape-equals? (car last) (caar rest))
+ (void)
+ (begin
+ (printf "\nERRORful subbranches:\n")
+ (pretty-print (car last))
+ (pretty-print (caar rest))
+ (error 'normalize-tree "Cannot yet handle same depth/size signatures of sub-branches: ~s ~s"
+ (cdr last) (cdar rest))))]
+ [else (loop (car rest) (cdr rest))]))
+
+ (values (cons dist (map car sorted))
+ (apply + 1 sizes)
+ (add1 (apply max depths))
+ )]))
+ (define-values (tree _ __) (loop tr))
+ tree)
+
+;; Assumes that the trees are already in normal form:
+;; Compares tree shape and leaf nodes, IGNORES interior nodes.
+(define (tree-equals? tr1 tr2)
+ ;(printf "EQUALS? ~s ~s\n" tr1 tr2)
+ (match (cons tr1 tr2)
+ [((,dist1 ,name1) . (,dist2 ,name2))
+ (guard (string? name1) (string? name2))
+ (if
+ #t ;; -- This would give you only shape.
+ (string=? name1 name2) ;; -- This requires that the leaf names be the same.
+ )
+ ]
+ [((,dist1 ,nodes1 ...) . (,dist2 ,nodes2 ...))
+ (and (equal? (length nodes1) (length nodes2))
+ (andmap tree-equals? nodes1 nodes2))]
+ [(,s1 . ,s2) (guard (string? s1) (string? s2)) (error 'tree-equals? "implementation assumption violated... oops")]
+ [,else #f]))
+
+(define (tree-shape-equals? tr1 tr2)
+ ;(printf "\nSHAPE EQUALS? ~s ~s\n" tr1 tr2)
+ (match (cons tr1 tr2)
+ [((,dist1 ,name1) . (,dist2 ,name2))
+ (guard (string? name1) (string? name2))
+ #t ;; -- This would give you only shape.
+ ]
+ [((,dist1 ,nodes1 ...) . (,dist2 ,nodes2 ...))
+ (and (equal? (length nodes1) (length nodes2))
+ (andmap tree-shape-equals? nodes1 nodes2))]
+ [(,s1 . ,s2) (guard (string? s1) (string? s2)) (error 'tree-shape-equals? "implementation assumption violated... oops")]
+ [,else #f]))
+
+
+;;====================================================================================================
+;; TESTS
+;;====================================================================================================
+
+(define tests
+ '(
+ "(,,(,));" ; no nodes are named
+ "(A,B,(C,D));" ; leaf nodes are named
+ "(A,B,(C,D)E)F;" ; all nodes are named
+ "(:0.1,:0.2,(:0.3,:0.4):0.5);" ; all but root node have a distance to parent
+ "(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;" ; all have a distance to parent
+ "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);" ; distances and leaf names (popular)
+ "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;" ; distances and all names
+ "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;" ; a tree rooted on a leaf node (rare)
+ ))
+
+(define (test) (map parse tests))
+
+;;====================================================================================================
+;; Here's our script "main"
+;;====================================================================================================
+
+(define (new-main files)
+ (define __ (begin (printf "\nBeginning processing of ~s files\n" (length files))))
+ ; (define parsed (parse-file file))
+ ; (define normalized (normalize-tree parsed))
+ ;(pretty-print parsed) ;(newline)
+ ;(printf "NORMALIZED ~s\n " file)
+ ;(pretty-print normalized)
+ ;;(exit)
+
+ (define ___ (begin (printf "Parsing and normalizing trees: ")(flush-output)))
+ (define all-trees
+ (map (lambda (file)
+ ;(define __ (begin (printf " ~a " file)(flush-output)))
+ (define parsed (parse-file file))
+ (define normalized (normalize-tree parsed))
+ (printf ".") (flush-output)
+ (list file normalized))
+ files))
+
+ ;; Type: (((name tree) ...) ...)
+ ;; Each equivalence class is represented by (tree name ...)
+ ;; Only a single representative tree is needed.
+ (define classes '())
+
+ (if (null? (cdr all-trees))
+ (begin
+ ;; If we have one input just print it:
+ (printf "\nOnly one file as input. Not making equivalence classes, just printing it:\n")
+ (pretty-print (cadar all-trees))
+ )
+ (begin
+ (printf "\nComputing equivalence classes on ~s trees: " (length all-trees)) (flush-output)
+ (for-each
+ (lambda (entry)
+ (define found-hit #f)
+ (match entry
+ [(,file ,normtree)
+ (let ([new (map (lambda (class)
+ (if (tree-equals? normtree (car class))
+ (begin
+ (set! found-hit #t)
+ (cons (car class) (cons file (cdr class))))
+ class))
+ classes)])
+ (printf ".") (flush-output)
+ (set! classes
+ (if found-hit new
+ (cons `(,normtree ,file) classes))))
+ ]))
+ all-trees)
+ (newline)
+
+ ;; Now sort them:
+ (set! classes (sort classes (lambda (a b) (< (length a) (length b)))))
+
+ (printf "Finished. ~s files fell into ~s equivalence classes\n" (length files) (length classes))
+ (printf "Sizes of each equivalence class: ")
+ (for-each (lambda (class) (printf " ~s" (length (cdr class)))) classes)
+ (newline)
+ (printf "Dumping lists of files to class<N>.txt, and representative tree to class<N>.tree...\n")
+
+ (system "rm -f class*.txt class*.tree")
+
+ (for-each (lambda (i class)
+ ;(when (file-exists? (format "class~s.txt" i)) (delete-file (format "class~s.txt" i)))
+ ;(when (file-exists? (format "class~s.tree" i)) (delete-file (format "class~s.tree" i)))
+ (with-output-to-file (format "class~s.txt" i)
+ (lambda ()
+ (for-each (lambda (file) (printf "~a " file)) (cdr class))
+ (newline)))
+ (with-output-to-file (format "class~s.tree" i)
+ (lambda () (pretty-print (car class))))
+ )
+ (srfi:iota (length classes))
+ classes)
+)))
+
+(define (print-usage)
+ (printf "Usage: find_tree_matches <file-to-process> ...\n\n"))
+
+
+;COUNTING: ((0 ((0 "lpg140") (0 "lpl135"))) (0 "lpp135") (0 "LPC_081"))
+
+(define (sumcount stuff)
+ ;(printf "sumcount ~s\n" stuff)
+ (if (or (string? stuff) (not stuff)) 0
+ (foldl (lambda (x acc) (+ acc (count x))) 0 stuff)))
+
+(define (count tree)
+ ;(define __ (printf "COUNTING: ~s\n" tree))
+ (if (number? (car tree))
+ (+ 1 (sumcount (cadr tree)))
+ (sumcount tree)))
+
+;; BENCHMARK TEST:
+
+;(for-each (lambda (file) (parse-file file)) allargs)
+
+(pretty-print (foldl + 0 (map (lambda (file) (count (parse-file file))) allargs)))
+;(pretty-print (map (lambda (file) (pretty-print (count (parse-file file)))) allargs))
+
+;(pretty-print (count '((0 ((0 "lpg292") (0 "lpl285"))) (0 "lpp299") (0 "LPC_323"))))
+
+
+
+#;
+(begin
+ (when (null? allargs)
+ (print-usage) (flush-output)
+ ;(error 'find_tree_matches "no files given")
+ (printf "Error: no files given!\n")
+ (exit 1))
+ (new-main allargs))
+
+
+) ;; End module.
+
diff --git a/old_script/iu-exptime.ss b/old_script/iu-exptime.ss
new file mode 100644
index 0000000..855c341
--- /dev/null
+++ b/old_script/iu-exptime.ss
@@ -0,0 +1,67 @@
+;;; match.ss
+;;; (broken out for PLT port only)
+
+;;; This program was originally designed and implemented by Dan
+;;; Friedman. It was redesigned and implemented by Erik Hilsdale;
+;;; some improvements were suggested by Steve Ganz. Additional
+;;; modifications were made by Kent Dybvig. Original port to PLT Scheme
+;;; by Matthew Flatt. As of 20020328, Matt Jadud is maintaining
+;;; the PLT Scheme port of this code.
+
+;;; Copyright (c) 1992-2002 Cadence Research Systems
+;;; Permission to copy this software, in whole or in part, to use this
+;;; software for any lawful purpose, and to redistribute this software
+;;; is granted subject to the restriction that all copies made of this
+;;; software must include this copyright notice in full. This software
+;;; is provided AS IS, with NO WARRANTY, EITHER EXPRESS OR IMPLIED,
+;;; INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY
+;;; OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT SHALL THE
+;;; AUTHORS BE LIABLE FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES OF ANY
+;;; NATURE WHATSOEVER.
+
+;;; If you are reading this comment, then you do not have
+;;; the most recent version of 'match'.
+
+
+(module iu-exptime mzscheme
+ (provide syntax-error
+ let-synvalues*
+ with-values syntax-lambda
+ with-temp with-temps)
+
+ (define (syntax-error e m)
+ (raise-syntax-error #f m e))
+
+ (define-syntax let-synvalues*
+ (syntax-rules ()
+ ((_ () B0 B ...) (begin B0 B ...))
+ ((_ (((Formal ...) Exp) Decl ...) B0 B ...)
+ (call-with-values (lambda () Exp)
+ (lambda (Formal ...)
+ (with-syntax ((Formal Formal) ...)
+ (let-synvalues* (Decl ...) B0 B ...)))))))
+
+ (define-syntax with-values
+ (syntax-rules ()
+ ((_ P C) (call-with-values (lambda () P) C))))
+
+ (define-syntax syntax-lambda
+ (lambda (x)
+ (syntax-case x ()
+ ((_ (Pat ...) Body0 Body ...)
+ (with-syntax (((X ...) (generate-temporaries #'(Pat ...))))
+ #'(lambda (X ...)
+ (with-syntax ((Pat X) ...)
+ Body0 Body ...)))))))
+
+ (define-syntax with-temp
+ (syntax-rules ()
+ ((_ V Body0 Body ...)
+ (with-syntax (((V) (generate-temporaries '(x))))
+ Body0 Body ...))))
+ (define-syntax with-temps
+ (syntax-rules ()
+ ((_ (V ...) (Exp ...) Body0 Body ...)
+ (with-syntax (((V ...) (generate-temporaries #'(Exp ...))))
+ Body0 Body ...))))
+)
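The exports above are small expansion-time conveniences; for instance, with-values is just call-with-values in a different spelling. A minimal illustration (not part of the upstream file):

;; (with-values (values 1 2) (lambda (a b) (+ a b)))   ;=> 3
;; ...which expands to:
;; (call-with-values (lambda () (values 1 2)) (lambda (a b) (+ a b)))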
diff --git a/old_script/iu-match.ss b/old_script/iu-match.ss
new file mode 100644
index 0000000..c963cca
--- /dev/null
+++ b/old_script/iu-match.ss
@@ -0,0 +1,698 @@
+;;; match.ss
+
+;;; This program was originally designed and implemented by Dan
+;;; Friedman. It was redesigned and implemented by Erik Hilsdale;
+;;; some improvements were suggested by Steve Ganz. Additional
+;;; modifications were made by Kent Dybvig. Original port to PLT Scheme
+;;; by Matthew Flatt. As of 20020328, Matt Jadud is maintaining
+;;; the PLT Scheme port of this code.
+
+;;; Copyright (c) 1992-2002 Cadence Research Systems
+;;; Permission to copy this software, in whole or in part, to use this
+;;; software for any lawful purpose, and to redistribute this software
+;;; is granted subject to the restriction that all copies made of this
+;;; software must include this copyright notice in full. This software
+;;; is provided AS IS, with NO WARRANTY, EITHER EXPRESS OR IMPLIED,
+;;; INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY
+;;; OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT SHALL THE
+;;; AUTHORS BE LIABLE FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES OF ANY
+;;; NATURE WHATSOEVER.
+
+;;; A complete CHANGELOG and some documentation can be found at the
+;;; end of this file.
+
+;;; If you are reading this comment, then you do not have
+;;; the most recent version of 'match'.
+
+;; =============================================================
+
+;; Exp ::= (match Exp Clause)
+;; || (trace-match Exp Clause)
+;; || (match+ (Id*) Exp Clause*)
+;; || (trace-match+ (Id*) Exp Clause*)
+;; || OtherSchemeExp
+
+;; Clause ::= (Pat Exp+) || (Pat (guard Exp*) Exp+)
+
+;; Pat ::= (Pat ... . Pat)
+;; || (Pat . Pat)
+;; || ()
+;; || #(Pat* Pat ... Pat*)
+;; || #(Pat*)
+;; || ,Id
+;; || ,[Id*]
+;; || ,[Cata -> Id*]
+;; || Id
+
+;; Cata ::= Exp
+
+;; YOU'RE NOT ALLOWED TO REFER TO CATA VARS IN GUARDS. (reasonable!)
+
+#cs
+(module iu-match mzscheme
+ (provide match+ trace-match+ match trace-match )
+ (require-for-syntax "iu-exptime.ss" (lib "stx.ss" "syntax"))
+ (require (lib "list.ss") (lib "etc.ss") )
+ ;; RRN: [2005.11.13]
+ (require "cheztrace.ss")
+
+ (define (make-list n v)
+ (vector->list (make-vector n v)))
+
+
+ (define match-equality-test
+ (make-parameter
+ equal?
+ (lambda (x)
+ (unless (procedure? x)
+ (error 'match-equality-test "~s is not a procedure" x))
+ x)))
+
+ (define-syntax match+
+ (lambda (x)
+ (syntax-case x ()
+ [(_ (ThreadedId ...) Exp Clause ...)
+ #'(let f ((ThreadedId ThreadedId) ... (x Exp))
+ (match-help _ f x (ThreadedId ...) Clause ...))])))
+
+ (define-syntax match
+ (lambda (x)
+ (syntax-case x ()
+ [(_ Exp Clause ...)
+ #'(let f ((x Exp))
+ (match-help _ f x () Clause ...))])))
+
+ (define-syntax trace-match+
+ (lambda (x)
+ (syntax-case x ()
+ [(_ (ThreadedId ...) Name Exp Clause ...)
+ #'(letrec ((f (trace-lambda Name (ThreadedId ... x)
+ (match-help _ f x (ThreadedId ...) Clause ...))))
+ (f ThreadedId ... x))])))
+
+ (define-syntax trace-match
+ (lambda (x)
+ (syntax-case x ()
+ [(_ Name Exp Clause ...)
+ #'(letrec ((f (trace-lambda Name (x)
+ (match-help _ f x () Clause ...))))
+ (f Exp))])))
+
+;;; ------------------------------
+
+ (define-syntax let-values**
+ (syntax-rules ()
+ ((_ () B0 B ...) (begin B0 B ...))
+ ((_ ((Formals Exp) Rest ...) B0 B ...)
+ (let-values** (Rest ...)
+ (call-with-values (lambda () Exp)
+ (lambda Formals B0 B ...))))))
+
+ (define-syntax match-help
+ (lambda (x)
+ (syntax-case x ()
+ ((_ Template Cata Obj ThreadedIds)
+ #'(error 'match "Unmatched datum: ~s" Obj))
+ ((_ Template Cata Obj ThreadedIds (Pat B0 B ...) Rest ...)
+ #'(convert-pat Pat
+ (match-help1 Template Cata Obj ThreadedIds
+ (B0 B ...)
+ Rest ...))))))
+
+ (define-syntax match-help1
+ (syntax-rules (guard)
+ ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds
+ ((guard G ...) B0 B ...) Rest ...)
+ (let ((ls/false (sexp-dispatch Obj PatLit)))
+ (if (and ls/false (apply (lambda Vars
+ (guard-body Cdecls
+ (extend-backquote Template (and G ...))))
+ ls/false))
+ (apply (lambda Vars
+ (clause-body Cata Cdecls ThreadedIds
+ (extend-backquote Template B0 B ...)))
+ ls/false)
+ (match-help Template Cata Obj ThreadedIds Rest ...))))
+ ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds
+ (B0 B ...) Rest ...)
+ (let ((ls/false (sexp-dispatch Obj PatLit)))
+ (if ls/false
+ (apply (lambda Vars
+ (clause-body Cata Cdecls ThreadedIds
+ (extend-backquote Template B0 B ...)))
+ ls/false)
+ (match-help Template Cata Obj ThreadedIds Rest ...))))))
+
+ (define-syntax clause-body
+ (lambda (x)
+ (define build-mapper
+ (lambda (vars depth cata tIds)
+ (if (zero? depth)
+ cata
+ (with-syntax ((rest (build-mapper vars (- depth 1) cata tIds))
+ (vars vars)
+ (tIds tIds))
+ #'(mapper rest vars tIds)))))
+ (syntax-case x ()
+ ((_ Cata ((CVar CDepth CMyCata CFormal ...) ...) (ThreadedId ...) B)
+ (with-syntax (((Mapper ...)
+ (map (lambda (mycata formals depth)
+ (build-mapper formals
+ (syntax-object->datum depth)
+ (syntax-case mycata ()
+ [#f #'Cata]
+ [exp #'exp])
+ #'(ThreadedId ...)))
+ (syntax->list #'(CMyCata ...))
+ (syntax->list #'((CFormal ...) ...))
+ (syntax->list #'(CDepth ...))))
+ (((ThreadedId ...) ...)
+ (map (lambda (x) #'(ThreadedId ...))
+ (syntax->list #'((CFormal ...) ...)))))
+ #'(let-values** (([ThreadedId ... CFormal ...]
+ (Mapper ThreadedId ... CVar))
+ ...)
+ B))))))
+
+ (define-syntax guard-body
+ (lambda (x)
+ (syntax-case x ()
+ ((_ ((Cvar Cdepth MyCata Cformal ...) ...) B)
+ (with-syntax (((CF ...) (apply append (map syntax->list (syntax->list #'((Cformal ...) ...))))))
+ #'(let-syntax
+ ((CF
+ (lambda (x)
+ (syntax-case x ()
+ (Name
+ (syntax-error #'Name
+ "guard cannot refer to return-value variable")))))
+ ...)
+ B))))))
+
+ (define-syntax convert-pat
+ ;; returns sexp-pat x vars x cdecls
+ (let ()
+ (define ellipsis?
+ (lambda (x)
+ (and (identifier? x) (eq? (syntax-e x) '...))))
+ (define Var?
+ (lambda (x)
+ (syntax-case x (->)
+ [-> #f]
+ [id (identifier? #'id)])))
+ (define (f syn vars cdecls depth)
+ (syntax-case syn (unquote)
+ ((unquote . stuff) ; separate for better error detection
+ (syntax-case syn (unquote ->)
+ ((unquote [MyCata -> Var ...])
+ (andmap Var? (syntax->list #'(Var ...)))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth depth))
+ (values #'any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth MyCata Var ...] cdecls))))
+ ((unquote [Var ...])
+ (andmap Var? (syntax->list #'(Var ...)))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth depth))
+ (values #'any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth #f Var ...] cdecls))))
+ ((unquote Var)
+ (Var? #'Var)
+ (values #'any (cons #'Var vars) cdecls))))
+ (((unquote . stuff) Dots) ; separate for better error detection
+ (ellipsis? #'Dots)
+ (syntax-case syn (unquote ->)
+ (((unquote [MyCata -> Var ...]) Dots)
+ (andmap Var? (syntax->list #'(Var ...)))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth+1 (add1 depth)))
+ (values #'each-any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth+1 MyCata Var ...] cdecls))))
+ (((unquote [Var ...]) Dots)
+ (andmap Var? (syntax->list #'(Var ...)))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth+1 (add1 depth)))
+ (values #'each-any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth+1 #f Var ...] cdecls))))
+ (((unquote Var) Dots)
+ (Var? #'Var)
+ (values #'each-any (cons #'Var vars) cdecls))
+ ((expr Dots) (syntax-error #'expr "match-pattern unquote syntax"))))
+ ((Pat Dots)
+ (ellipsis? #'Dots)
+ (let-synvalues* (((Dpat Dvars Dcdecls)
+ (f #'Pat vars cdecls (add1 depth))))
+; (printf "GOT VARS: ~s\n" vars)
+ (with-syntax ((Size (- (length (syntax->list #'Dvars))
+ ;; RRN: this is screwy:
+ (length
+ (if (syntax? vars)
+ (syntax->list vars)
+ vars))
+ )))
+ ;;MCJ wrapped syntax->list around vars
+ (values #'#(each Dpat Size) #'Dvars #'Dcdecls))))
+ ((Pat Dots . Rest)
+ (ellipsis? #'Dots)
+ (let-synvalues* (((Rpat Rvars Rcdecls)
+ (f #'Rest vars cdecls depth))
+ ((Dpat Dvars Dcdecls)
+ (f #'(Pat (... ...)) #'Rvars #'Rcdecls
+ depth)))
+ (with-syntax ((Size (- (length (syntax->list #'Dvars)) (length (syntax->list #'Rvars))))
+ ((RevRestTl . RevRest) (reverseX (syntax->list #'Rpat) '())))
+ (values #'#(tail-each Dpat Size RevRest RevRestTl)
+ #'Dvars #'Dcdecls))))
+ ((X . Y)
+ (let-synvalues* (((Ypat Yvars Ycdecls)
+ (f #'Y vars cdecls depth))
+ ((Xpat Xvars Xcdecls)
+ (f #'X #'Yvars #'Ycdecls depth)))
+ (values #'(Xpat . Ypat) #'Xvars #'Xcdecls)))
+ (() (values #'() vars cdecls))
+ (#(X ...)
+ (let-synvalues* (((Pat Vars CDecls) (f #'(X ...) vars cdecls depth)))
+ (values #'#(vector Pat) #'Vars #'CDecls)))
+ (Thing (values #'#(atom Thing) vars cdecls))))
+ (define reverseX
+ (lambda (ls acc)
+ (if (pair? ls)
+ (reverseX (cdr ls) (cons (car ls) acc))
+ (cons ls acc))))
+ (lambda (syn)
+ (syntax-case syn ()
+ ((_ syn (kh . kt))
+ (let-synvalues* (((a b c) (f #'syn '() '() 0)))
+ #'(kh 'a b c . kt)))))))
+
+ (define-syntax mapper
+ (lambda (x)
+ (syntax-case x ()
+ ((_ F (RetId ...) (ThreadId ...))
+ (with-syntax (((t ...) (generate-temporaries #'(RetId ...)))
+ ((ts ...) (generate-temporaries #'(RetId ...)))
+ ((null ...) (map (lambda (x) #'()) (syntax->list #'(RetId ...)))))
+ #'(let ((fun F))
+ (rec g
+ (lambda (ThreadId ... ls)
+ (if (null? ls)
+ (values ThreadId ... null ...)
+ (call-with-values
+ (lambda () (g ThreadId ... (cdr ls)))
+ (lambda (ThreadId ... ts ...)
+ (call-with-values
+ (lambda () (fun ThreadId ... (car ls)))
+ (lambda (ThreadId ... t ...)
+ (values ThreadId ... (cons t ts) ...))))))))))))))
+
+;;; ------------------------------
+
+ (define-syntax my-backquote
+ (lambda (x)
+ (define ellipsis?
+ (lambda (x)
+ (and (identifier? x) (eq? (syntax-e x) '...))))
+ (define destruct
+ (lambda (Orig x depth)
+ (syntax-case x (quasiquote unquote unquote-splicing)
+ ;; inner quasiquote
+ ((quasiquote Exp)
+ (with-values (destruct Orig #'Exp (add1 depth))
+ (syntax-lambda (Builder Vars Exps)
+ (if (null? #'Vars)
+ (values #''(quasiquote Exp) '() '())
+ (values #'(list 'quasiquote Builder) #'Vars #'Exps)))))
+ ;; unquote
+ ((unquote Exp)
+ (zero? depth)
+ (with-temp X
+ (values #'X (list #'X) (list #'Exp))))
+ ((unquote Exp)
+ (with-values (destruct Orig #'Exp (sub1 depth))
+ (syntax-lambda (Builder Vars Exps)
+ (if (null? #'Vars)
+ (values #''(unquote Exp) '() '())
+ (values #'(list 'unquote Builder) #'Vars #'Exps)))))
+ ;; splicing
+ (((unquote-splicing Exp))
+ (zero? depth)
+ (with-temp X
+ (values #'X (list #'X) (list #'Exp))))
+ (((unquote-splicing Exp ...))
+ (zero? depth)
+ (with-temps (X ...) (Exp ...)
+ (values #'(append X ...) #'(X ...) #'(Exp ...))))
+ (((unquote-splicing Exp ...) . Rest)
+ (zero? depth)
+ (with-values (destruct Orig #'Rest depth)
+ (syntax-lambda (Builder Vars Exps)
+ (with-temps (X ...) (Exp ...)
+ (if (null? #'Vars)
+ (values #'(append X ... 'Rest)
+ #'(X ...) #'(Exp ...))
+ (values #'(append X ... Builder)
+ #'(X ... . Vars) #'(Exp ... . Exps)))))))
+ ((unquote-splicing Exp ...)
+ (with-values (destruct Orig #'(Exp ...) (sub1 depth))
+ (syntax-lambda (Builder Vars Exps)
+ (if (null? #'Vars)
+ (values #''(unquote-splicing Exp ...) '() '())
+ (values #'(cons 'unquote-splicing Builder)
+ #'Vars #'Exps)))))
+ ;; dots
+ (((unquote Exp) Dots)
+ (and (zero? depth) (ellipsis? #'Dots))
+ (with-temp X
+ (values #'X (list #'X) (list #'Exp))))
+ (((unquote Exp) Dots . Rest)
+ (and (zero? depth) (ellipsis? #'Dots))
+ (with-values (destruct Orig #'Rest depth)
+ (syntax-lambda (RestBuilder RestVars RestExps)
+ (with-syntax ((TailExp
+ (if (stx-null? #'RestVars)
+ #''Rest
+ #'RestBuilder)))
+ (with-temp X
+ (values #'(append X TailExp)
+ (cons #'X #'RestVars)
+ (cons #'Exp #'RestExps)))))))
+ ((Exp Dots . Rest)
+ (and (zero? depth) (ellipsis? #'Dots))
+ (with-values (destruct Orig #'Exp depth)
+ (syntax-lambda (ExpBuilder (ExpVar ...) (ExpExp ...))
+ (if (null? #'(ExpVar ...))
+ (syntax-error Orig "Bad ellipsis")
+ (with-values (destruct Orig #'Rest depth)
+ (syntax-lambda (RestBuilder RestVars RestExps)
+ (with-syntax ((TailExp
+ (if (null? #'RestVars)
+ #''Rest
+ #'RestBuilder))
+ (Orig Orig))
+ (values #'(let f ((ExpVar ExpVar) ...)
+ (if (and (pair? ExpVar) ...)
+ (cons
+ (let ((ExpVar (car ExpVar)) ...)
+ ExpBuilder)
+ (f (cdr ExpVar) ...))
+ (if (and (null? ExpVar) ...)
+ TailExp
+ (error 'unquote
+ "Mismatched lists in ~s"
+ Orig))))
+ (append (syntax->list #'(ExpVar ...)) (syntax->list #'RestVars))
+ (append (syntax->list #'(ExpExp ...)) (syntax->list #'RestExps))))))))))
+ ;; Vectors
+ (#(X ...)
+ (with-values (destruct Orig #'(X ...) depth)
+ (syntax-lambda (LsBuilder LsVars LsExps)
+ (values #'(list->vector LsBuilder) #'LsVars #'LsExps))))
+ ;; random stuff
+ ((Hd . Tl)
+ (with-values (destruct Orig #'Hd depth)
+ (syntax-lambda (HdBuilder HdVars HdExps)
+ (with-values (destruct Orig #'Tl depth)
+ (syntax-lambda (TlBuilder TlVars TlExps)
+ (with-syntax ((Hd (if (null? #'HdVars)
+ #''Hd
+ #'HdBuilder))
+ (Tl (if (null? #'TlVars)
+ #''Tl
+ #'TlBuilder)))
+ (values #'(cons Hd Tl)
+ (append (syntax->list #'HdVars) (syntax->list #'TlVars))
+ (append (syntax->list #'HdExps) (syntax->list #'TlExps)))))))))
+ (OtherThing
+ (values #''OtherThing '() '())))))
+ ;; macro begins
+ (syntax-case x ()
+ ((_ Datum)
+ (with-values (destruct #'(quasiquote Datum) #'Datum 0)
+ (syntax-lambda (Builder (Var ...) (Exp ...))
+ (if (null? #'(Var ...))
+ #''Datum
+ #'(let ((Var Exp) ...)
+ Builder))))))))
+
+ (define-syntax extend-backquote
+ (lambda (x)
+ (syntax-case x ()
+ ((_ Template Exp ...)
+ (with-syntax ((quasiquote
+ (datum->syntax-object #'Template 'quasiquote)))
+ #'(let-syntax ((quasiquote
+ (lambda (x)
+ (syntax-case x ()
+ ((_ Foo) #'(my-backquote Foo))))))
+ Exp ...))))))
+
+;;; ------------------------------
+
+ (define-syntax with-values
+ (syntax-rules ()
+ ((_ P C) (call-with-values (lambda () P) C))))
+
+ (define-syntax letcc
+ (syntax-rules ()
+ ((_ V B0 B ...) (call/ec (lambda (V) B0 B ...)))))
+
+ (define classify-list
+ (lambda (ls)
+ (cond
+ ((null? ls) 'proper)
+ ((not (pair? ls)) 'improper)
+ (else
+ (let f ((tortoise ls) (hare (cdr ls)))
+ (cond
+ ((eq? tortoise hare) 'infinite)
+ ((null? hare) 'proper)
+ ((not (pair? hare)) 'improper)
+ (else
+ (let ((hare (cdr hare)))
+ (cond
+ ((null? hare) 'proper)
+ ((not (pair? hare)) 'improper)
+ (else (f (cdr ls) (cdr hare))))))))))))
+
+ (define ilist-copy-flat
+ (lambda (ils)
+ (let f ((tortoise ils) (hare (cdr ils)))
+ (if (eq? tortoise hare)
+ (list (car tortoise))
+ (cons (car tortoise) (f (cdr tortoise) (cddr hare)))))))
+
+ (define sexp-dispatch
+ (lambda (obj pat);; #f or list of vars
+ (letcc escape
+ (let ((fail (lambda () (escape #f))))
+ (let f ((pat pat) (obj obj) (vals '()))
+ (cond
+ ((eq? pat 'any)
+ (cons obj vals))
+ ((eq? pat 'each-any)
+ ;; handle infinities
+ (case (classify-list obj)
+ ((proper infinite) (cons obj vals))
+ ((improper) (fail))))
+ ((pair? pat)
+ (if (pair? obj)
+ (f (car pat) (car obj) (f (cdr pat) (cdr obj) vals))
+ (fail)))
+ ((vector? pat)
+ (case (vector-ref pat 0)
+ ((atom)
+ (let ((a (vector-ref pat 1)))
+ (if (eqv? obj a)
+ vals
+ (fail))))
+ ((vector)
+ (if (vector? obj)
+ (let ((vec-pat (vector-ref pat 1)))
+ (f vec-pat (vector->list obj) vals))
+ (fail)))
+ ((each)
+ ;; if infinite, copy the list as flat, then do the matching,
+ ;; then do some set-cdrs.
+ (let ((each-pat (vector-ref pat 1))
+ (each-size (vector-ref pat 2)))
+ (case (classify-list obj)
+ ((improper) (fail))
+ ((infinite)
+ (error 'iu-match "not handling infinite lists at the moment...")
+ #;
+ (let ((each-vals (f pat (ilist-copy-flat obj) '())))
+ (for-each (lambda (x) (set-cdr! (last-pair x) x))
+ each-vals)
+ (append each-vals vals)))
+ ((proper)
+ (append
+ (let g ((obj obj))
+ (if (null? obj)
+ (make-list each-size '())
+ (let ((hd-vals (f each-pat (car obj) '()))
+ (tl-vals (g (cdr obj))))
+ (map cons hd-vals tl-vals))))
+ vals)))))
+ ((tail-each)
+ (let ((each-pat (vector-ref pat 1))
+ (each-size (vector-ref pat 2))
+ (revtail-pat (vector-ref pat 3))
+ (revtail-tail-pat (vector-ref pat 4)))
+ (when (eq? (classify-list obj) 'infinite) (fail))
+ (with-values
+ (let g ((obj obj))
+ ;; in-tail?, vals, revtail-left/ls
+ (cond
+ ((pair? obj)
+ (with-values (g (cdr obj))
+ (lambda (in-tail? vals tail-left/ls)
+ (if in-tail?
+ (if (null? tail-left/ls)
+ (values #f vals (list (car obj)))
+ (values #t (f (car tail-left/ls)
+ (car obj)
+ vals)
+ (cdr tail-left/ls)))
+ (values #f vals
+ (cons (car obj) tail-left/ls))))))
+ (else
+ (values #t
+ (f revtail-tail-pat obj '())
+ revtail-pat))))
+ (lambda (in-tail? vals tail-left/ls)
+ (if in-tail?
+ (if (null? tail-left/ls)
+ (append (make-list each-size '())
+ vals)
+ (fail))
+ (f each-pat tail-left/ls vals))))))))
+ (else
+ (if (eqv? obj pat)
+ vals
+ (fail)))))))))
+ )
+
+
+
+;;;CHANGELOG
+
+;; [04 Nov 2005]
+;; rrn fixed a bug in the code for handling ellipses.
+;; (was (length (syntax->list vars)) rather than (length vars))
+
+;; [13 March 2002]
+;; rkd added following change by Friedman and Ganz to the main source
+;; code thread and fixed a couple of minor problems.
+
+;; [9 March 2002]
+;; Dan Friedman and Steve Ganz added the ability to use identical pattern
+;; variables. The patterns represented by the variables are compared
+;; using the value of the parameter match-equality-test, which defaults
+;; to equal?.
+;;
+;; > (match '(1 2 1 2 1)
+;; [(,a ,b ,a ,b ,a) (guard (number? a) (number? b)) (+ a b)])
+;; 3
+;; ;;
+;; > (match '((1 2 3) 5 (1 2 3))
+;; [((,a ...) ,b (,a ...)) `(,a ... ,b)])
+;; (1 2 3 5)
+;; ;;
+;; > (parameterize ([match-equality-test (lambda (x y) (equal? x (reverse y)))])
+;; (match '((1 2 3) (3 2 1))
+;; [(,a ,a) 'yes]
+;; [,oops 'no]))
+;; yes
+
+;; [10 Jan 2002]
+;; eh fixed bug that caused (match '((1 2 3 4)) (((,a ... ,d) . ,x) a)) to
+;; blow up. The bug was caused by a bug in the sexp-dispatch procedure
+;; where a base value empty list was passed to an accumulator from inside
+;; the recursion, instead of passing the old value of the accumulator.
+
+;; [14 Jan 2001]
+;; rkd added syntax checks to unquote pattern parsing to weed out invalid
+;; patterns like ,#(a) and ,[(vector-ref d 1)].
+
+;; [14 Jan 2001]
+;; rkd added ,[Cata -> Id* ...] to allow specification of recursion
+;; function. ,[Id* ...] recurs to match; ,[Cata -> Id* ...] recurs
+;; to Cata.
+
+;; [14 Jan 2001]
+;; rkd tightened up checks for ellipses and nested quasiquote; was comparing
+;; symbolic names, which, as had been noted in the source, is a possible
+;; hygiene bug. Replaced error call in guard-body with syntax-error to
+;; allow error to include source line/character information.
+
+;; [13 Jan 2001]
+;; rkd fixed match patterns of the form (stuff* ,[x] ... stuff+), which
+;; had been recurring on subforms of each item rather than on the items
+;; themselves.
+
+;; [29 Feb 2000]
+;; Fixed a case sensitivity bug.
+
+;; [24 Feb 2000]
+;; Matcher now handles vector patterns. Quasiquote also handles
+;; vector patterns, but does NOT do the csv6.2 optimization of
+;; `#(a 1 ,(+ 3 4) x y) ==> (vector 'a 1 (+ 3 4) 'x 'y).
+;; Also fixed bug in (P ... . P) matching code.
+
+;; [23 Feb 2000]
+;; KSM fixed bug in unquote-splicing inside quasiquote.
+
+;; [10 Feb 2000]
+;; New forms match+ and trace-match+ thread arguments right-to-left.
+;; The pattern (P ... . P) now works the way you might expect.
+;; Infinite lists are now properly matched (and not matched).
+;; Removed the @ pattern.
+;; Internal: No longer converting into syntax-case.
+
+;; [6 Feb 2000]
+;; Added expansion-time error message for referring to cata variable
+;; in a guard.
+
+;; [4 Feb 2000]
+;; Fixed backquote so it can handle nested backquote (oops).
+;; Double-backquoted ellipses are neutralized just as double-backquoted
+;; unquotes are. So:
+;; `(a ,'(1 2 3) ... b) =eval=> (a 1 2 3 b)
+;; ``(a ,'(1 2 3) ... b) =eval=> `(a ,'(1 2 3) ... b)
+;; ``(a ,(,(1 2 3) ...) b) =eval=> `(a ,(1 2 3) b)
+;; Added support for
+;; `((unquote-splicing x y z) b) =expand==> (append x y z (list 'b))
+
+;; [1 Feb 2000]
+;; Fixed a bug involving forgetting to quote stuff in the revised backquote.
+;; Recognized unquote-splicing and signalled errors in the appropriate places.
+;; Added support for deep ellipses in backquote.
+;; Rewrote backquote so it does the rebuilding directly instead of
+;; expanding into Chez's backquote.
+
+;; [31 Jan 2000]
+;; Kent Dybvig fixed template bug.
+
+;; [31 Jan 2000]
+;; Added the trace-match form, and made guards contain
+;; an explicit and expression:
+;; (guard E ...) ==> (guard (and E ...))
+
+;; [26 Jan 2000]
+;; Inside the clauses of match expressions, the following
+;; transformation is performed inside backquote expressions:
+;;    ,v ... ==> ,@v
+;; (,v ,w) ... ==> ,@(map list v w)
+;; etc.
+
+;(require iu-match)
+
+;; RRN: DEBUGGING
+;(define (f)
+; (match '((1 2) (3 4))
+; [((,x ,y) ...) y]))
+;(f)
\ No newline at end of file
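As a small illustration of the clause grammar documented at the top of this file (not part of the upstream source; depth is a hypothetical example function), the ,[Id] catamorphism patterns recur into sub-structure and bind the list of results under an ellipsis:

(require "iu-match.ss")   ; assuming this file is on the load path

;; Nesting depth of an s-expression, via cata patterns.
(define (depth x)
  (match x
    [(,[kids] ...) (+ 1 (apply max 0 kids))]   ; kids = list of recursive results
    [,leaf 0]))

;; (depth '((a b) (c (d))))  ;=> 3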
diff --git a/old_script/match.ss b/old_script/match.ss
new file mode 100644
index 0000000..a215765
--- /dev/null
+++ b/old_script/match.ss
@@ -0,0 +1,800 @@
+;;; match.ss
+
+;;; Time-stamp: Sun Jan 14 09:04:21 EST 2001 rkd
+
+;; [14 Jan 2001]
+;; rkd added syntax checks to unquote pattern parsing to weed out invalid
+;; patterns like ,#(a) and ,[(vector-ref d 1)].
+
+;; [14 Jan 2001]
+;; rkd added ,[Cata -> Id* ...] to allow specification of recursion
+;; function. ,[Id* ...] recurs to match; ,[Cata -> Id* ...] recurs
+;; to Cata.
+
+;; [14 Jan 2001]
+;; rkd tightened up checks for ellipses and nested quasiquote; was comparing
+;; symbolic names, which, as had been noted in the source, is a possible
+;; hygiene bug. Replaced error call in guard-body with syntax-error to
+;; allow error to include source line/character information.
+
+;; [13 Jan 2001]
+;; rkd fixed match patterns of the form (stuff* ,[x] ... stuff+), which
+;; had been recurring on subforms of each item rather than on the items
+;; themselves.
+
+;; Previous changelog listings at end of file.
+
+;; =============================================================
+
+;; Exp ::= (match Exp Clause)
+;; || (trace-match Exp Clause)
+;; || (match+ (Id*) Exp Clause*)
+;; || (trace-match+ (Id*) Exp Clause*)
+;; || OtherSchemeExp
+
+;; Clause ::= (Pat Exp+) || (Pat (guard Exp*) Exp+)
+
+;; Pat ::= (Pat ... . Pat)
+;; || (Pat . Pat)
+;; || ()
+;; || #(Pat* Pat ... Pat*)
+;; || #(Pat*)
+;; || ,Id
+;; || ,[Id*]
+;; || ,[Cata -> Id*]
+;; || Id
+
+;; Cata ::= Exp
+
+;; YOU'RE NOT ALLOWED TO REFER TO CATA VARS IN GUARDS. (reasonable!)
+
+(module iu-match
+ ((match+ match-help match-help1 clause-body let-values**
+ guard-body convert-pat mapper my-backquote extend-backquote
+ sexp-dispatch)
+ (trace-match+ match-help match-help1 clause-body let-values**
+ guard-body convert-pat mapper my-backquote extend-backquote
+ sexp-dispatch)
+ (match match-help match-help1 clause-body let-values**
+ guard-body convert-pat mapper my-backquote extend-backquote
+ sexp-dispatch)
+ (trace-match match-help match-help1 clause-body let-values**
+ guard-body convert-pat mapper my-backquote extend-backquote
+ sexp-dispatch)
+
+ ;; TEMP
+ clause-body extend-backquote
+ )
+
+;(import scheme)
+
+(define-syntax match+
+ (lambda (x)
+ (syntax-case x ()
+ [(_ (ThreadedId ...) Exp Clause ...)
+ #'(let f ((ThreadedId ThreadedId) ... (x Exp))
+ (match-help _ f x (ThreadedId ...) Clause ...))])))
+
+(define-syntax match
+ (lambda (x)
+ (syntax-case x ()
+ [(_ Exp Clause ...)
+ #'(let f ((x Exp))
+ (match-help _ f x () Clause ...))])))
+
+(define-syntax trace-match+
+ (lambda (x)
+ (syntax-case x ()
+ [(_ (ThreadedId ...) Name Exp Clause ...)
+ #'(letrec ((f (trace-lambda Name (ThreadedId ... x)
+ (match-help _ f x (ThreadedId ...) Clause ...))))
+ (f ThreadedId ... x))])))
+
+(define-syntax trace-match
+ (lambda (x)
+ (syntax-case x ()
+ [(_ Name Exp Clause ...)
+ #'(letrec ((f (trace-lambda Name (x)
+ (match-help _ f x () Clause ...))))
+ (f Exp))])))
+
+;;; ------------------------------
+
+(define-syntax let-values**
+ (syntax-rules ()
+ ((_ () B0 B ...) (begin B0 B ...))
+ ((_ ((Formals Exp) Rest ...) B0 B ...)
+ (let-values** (Rest ...)
+ (call-with-values (lambda () Exp)
+ (lambda Formals B0 B ...))))))
+
+(define-syntax match-help
+ (lambda (x)
+ (syntax-case x ()
+ ((_ Template Cata Obj ThreadedIds)
+ ;(inspect `(HRM ,(datum Template) ,(datum Cata) ,(datum Obj) ,(datum ThreadedIds)))
+ ;(inspect #'Template)
+ #'(error 'match "Unmatched datum.\n Datum: ~s\n Source-Location: ~s\n" Obj #'Template))
+
+ ((_ Template Cata Obj ThreadedIds (Pat B0 B ...) Rest ...)
+ #'(convert-pat Pat
+ (match-help1 Template Cata Obj ThreadedIds
+ (B0 B ...)
+ Rest ...)))
+
+ )))
+
+(define-syntax match-help1
+ (syntax-rules (guard)
+ ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds
+ ((guard G ...) B0 B ...) Rest ...)
+ (let ((ls/false (sexp-dispatch Obj PatLit)))
+ (if (and ls/false (apply (lambda Vars
+ (guard-body Cdecls
+ (extend-backquote Template (and G ...))))
+ ls/false))
+ (apply (lambda Vars
+ (clause-body Cata Cdecls ThreadedIds
+ (extend-backquote Template B0 B ...)))
+ ls/false)
+ (match-help Template Cata Obj ThreadedIds Rest ...))))
+ ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds
+ (B0 B ...) Rest ...)
+ (let ((ls/false (sexp-dispatch Obj PatLit)))
+ (if ls/false
+ (apply (lambda Vars
+ (clause-body Cata Cdecls ThreadedIds
+ (extend-backquote Template B0 B ...)))
+ ls/false)
+ (match-help Template Cata Obj ThreadedIds Rest ...))))))
+
+;((clause-body f () () (extend-backquote _ (void))))
+(define-syntax clause-body
+ (lambda (x)
+ (define build-mapper
+ (lambda (vars depth cata tIds)
+ (if (zero? depth)
+ cata
+ (with-syntax ((rest (build-mapper vars (- depth 1) cata tIds))
+ (vars vars)
+ (tIds tIds))
+ #'(mapper rest vars tIds)))))
+ (syntax-case x ()
+ ((_ Cata ((CVar CDepth CMyCata CFormal ...) ...) (ThreadedId ...) B)
+ ;;; RRN: Hacking this a bit for larceny compat:
+ (with-syntax ((((TIDs ...) ...)
+ (datum->syntax-object #'_
+ (map (lambda _ #'(ThreadedId ...))
+ (syntax-object->datum #'((CFormal ...) ...))))))
+ (with-syntax (((Mapper ...)
+ (map (lambda (mycata formals depth)
+ (build-mapper formals
+ (syntax-object->datum depth)
+ (syntax-case mycata ()
+ [#f #'Cata]
+ [exp #'exp])
+ #'(ThreadedId ...)))
+ #'(CMyCata ...)
+ #'((CFormal ...) ...)
+ #'(CDepth ...))))
+ ;; [2007.12.26] RRN: Removing ellipses from references to ThreadedId here:
+ ;; This appears to have been a bug. Introducing TIDs instead.
+ #'(let-values** (([TIDs ... CFormal ...]
+ (Mapper TIDs ... CVar))
+ ...)
+ B)
+
+ )
+ )))))
+
+(define-syntax guard-body
+ (lambda (x)
+ (syntax-case x ()
+ ((_ ((Cvar Cdepth MyCata Cformal ...) ...) B)
+ (with-syntax (((CF ...) (apply append #'((Cformal ...) ...))))
+ #'(let-syntax
+ ((CF
+ (lambda (x)
+ (syntax-case x ()
+ (Name
+ (syntax-error #'Name
+ "guard cannot refer to return-value variable")))))
+ ...)
+ B))))))
+
+(define-syntax convert-pat
+ ;; returns sexp-pat x vars x cdecls
+ (let ()
+ (define ellipsis?
+ (lambda (x)
+ (and (identifier? x) (literal-identifier=? x #'(... ...)))))
+ (define Var?
+ (lambda (x)
+ (syntax-case x (->)
+ [-> #f]
+ [id (identifier? #'id)])))
+ (define (f syn vars cdecls depth)
+ (syntax-case syn (unquote)
+ ((unquote . stuff) ; separate for better error detection
+ (syntax-case syn (unquote ->)
+ ((unquote [MyCata -> Var ...])
+ (andmap Var? #'(Var ...))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth depth))
+ (values #'any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth MyCata Var ...] cdecls))))
+ ((unquote [Var ...])
+ (andmap Var? #'(Var ...))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth depth))
+ (values #'any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth #f Var ...] cdecls))))
+ ((unquote Var)
+ (Var? #'Var)
+ (values #'any (cons #'Var vars) cdecls))))
+ (((unquote . stuff) Dots) ; separate for better error detection
+ (ellipsis? #'Dots)
+ (syntax-case syn (unquote ->)
+ (((unquote [MyCata -> Var ...]) Dots)
+ (andmap Var? #'(Var ...))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth+1 (+ 1 depth)))
+ (values #'each-any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth+1 MyCata Var ...] cdecls))))
+ (((unquote [Var ...]) Dots)
+ (andmap Var? #'(Var ...))
+ (with-syntax (((Temp) (generate-temporaries '(x)))
+ (Depth+1 (+ 1 depth)))
+ (values #'each-any
+ (cons #'Temp vars)
+ (cons #'[Temp Depth+1 #f Var ...] cdecls))))
+ (((unquote Var) Dots)
+ (Var? #'Var)
+ (values #'each-any (cons #'Var vars) cdecls))
+ ((expr Dots) (syntax-error #'expr "match-pattern unquote syntax"))))
+ ((Pat Dots)
+ (ellipsis? #'Dots)
+ (let-synvalues* (((Dpat Dvars Dcdecls)
+ (f #'Pat vars cdecls (+ 1 depth))))
+ (with-syntax ((Size (- (length #'Dvars) (length vars))))
+ (values #'#(each Dpat Size) #'Dvars #'Dcdecls))))
+ ((Pat Dots . Rest)
+ (ellipsis? #'Dots)
+ (let-synvalues* (((Rpat Rvars Rcdecls)
+ (f #'Rest vars cdecls depth))
+ ((Dpat Dvars Dcdecls)
+ (f #'(Pat (... ...)) #'Rvars #'Rcdecls
+ depth)))
+ (with-syntax ((Size (- (length #'Dvars) (length #'Rvars)))
+ ((RevRestTl . RevRest) (reverseX #'Rpat '())))
+ (values #'#(tail-each Dpat Size RevRest RevRestTl)
+ #'Dvars #'Dcdecls))))
+ ((X . Y)
+ (let-synvalues* (((Ypat Yvars Ycdecls)
+ (f #'Y vars cdecls depth))
+ ((Xpat Xvars Xcdecls)
+ (f #'X #'Yvars #'Ycdecls depth)))
+ (values #'(Xpat . Ypat) #'Xvars #'Xcdecls)))
+ (() (values #'() vars cdecls))
+ (#(X ...)
+ (let-synvalues* (((Pat Vars CDecls) (f #'(X ...) vars cdecls depth)))
+ (values #'#(vector Pat) #'Vars #'CDecls)))
+ (Thing (values #'#(atom Thing) vars cdecls))))
+ (define reverseX
+ (lambda (ls acc)
+ (if (pair? ls)
+ (reverseX (cdr ls) (cons (car ls) acc))
+ (cons ls acc))))
+ (define-syntax let-synvalues*
+ (syntax-rules ()
+ ((_ () B0 B ...) (begin B0 B ...))
+ ((_ (((Formal ...) Exp) Decl ...) B0 B ...)
+ (call-with-values (lambda () Exp)
+ (lambda (Formal ...)
+ (with-syntax ((Formal Formal) ...)
+ (let-synvalues* (Decl ...) B0 B ...)))))))
+ (lambda (syn)
+ (syntax-case syn ()
+ ((_ syn (kh . kt))
+ (let-synvalues* (((a b c) (f #'syn '() '() 0)))
+ #'(kh 'a b c . kt)))))))
+
+(define-syntax mapper
+ (lambda (x)
+ (syntax-case x ()
+ ((_ F (RetId ...) (ThreadId ...))
+ (with-syntax (((t ...) (generate-temporaries #'(RetId ...)))
+ ((ts ...) (generate-temporaries #'(RetId ...)))
+ ((null ...) (map (lambda (x) #'()) #'(RetId ...))))
+ #'(let ((fun F))
+ (rec g
+ (lambda (ThreadId ... ls)
+ (if (null? ls)
+ (values ThreadId ... null ...)
+ (call-with-values
+ (lambda () (g ThreadId ... (cdr ls)))
+ (lambda (ThreadId ... ts ...)
+ (call-with-values
+ (lambda ()
+ (fun ThreadId ...
+ (car ls)))
+ (lambda (ThreadId ... t ...)
+ (values ThreadId ... (cons t ts) ...))))))))))))))
+
+;;; ------------------------------
+
+(define-syntax my-backquote
+ (lambda (x)
+ (define ellipsis?
+ (lambda (x)
+ (and (identifier? x) (literal-identifier=? x #'(... ...)))))
+ (define-syntax with-values
+ (syntax-rules ()
+ ((_ P C) (call-with-values (lambda () P) C))))
+ (define-syntax syntax-lambda
+ (lambda (x)
+ (syntax-case x ()
+ ((_ (Pat ...) Body0 Body ...)
+ (with-syntax (((X ...) (generate-temporaries #'(Pat ...))))
+ #'(lambda (X ...)
+ (with-syntax ((Pat X) ...)
+ Body0 Body ...)))))))
+ (define-syntax with-temp
+ (syntax-rules ()
+ ((_ V Body0 Body ...)
+ (with-syntax (((V) (generate-temporaries '(x))))
+ Body0 Body ...))))
+ (define-syntax with-temps
+ (syntax-rules ()
+ ((_ (V ...) (Exp ...) Body0 Body ...)
+ (with-syntax (((V ...) (generate-temporaries #'(Exp ...))))
+ Body0 Body ...))))
+ (define destruct
+ (lambda (Orig x depth)
+ (syntax-case x (quasiquote unquote unquote-splicing)
+ ;; inner quasiquote
+ ((quasiquote Exp)
+ (with-values (destruct Orig #'Exp (+ 1 depth))
+ (syntax-lambda (Builder Vars Exps)
+ (if (null? #'Vars)
+ (values #''(quasiquote Exp) '() '())
+ (values #'(list 'quasiquote Builder) #'Vars #'Exps)))))
+ ;; unquote
+ ((unquote Exp)
+ (zero? depth)
+ (with-temp X
+ (values #'X (list #'X) (list #'Exp))))
+ ((unquote Exp)
+ (with-values (destruct Orig #'Exp (+ -1 depth))
+ (syntax-lambda (Builder Vars Exps)
+ (if (null? #'Vars)
+ (values #''(unquote Exp) '() '())
+ (values #'(list 'unquote Builder) #'Vars #'Exps)))))
+ ;; splicing
+ (((unquote-splicing Exp))
+ (zero? depth)
+ (with-temp X
+ (values #'X (list #'X) (list #'Exp))))
+ (((unquote-splicing Exp ...))
+ (zero? depth)
+ (with-temps (X ...) (Exp ...)
+ (values #'(append X ...) #'(X ...) #'(Exp ...))))
+ (((unquote-splicing Exp ...) . Rest)
+ (zero? depth)
+ (with-values (destruct Orig #'Rest depth)
+ (syntax-lambda (Builder Vars Exps)
+ (with-temps (X ...) (Exp ...)
+ (if (null? #'Vars)
+ (values #'(append X ... 'Rest)
+ #'(X ...) #'(Exp ...))
+ (values #'(append X ... Builder)
+ #'(X ... . Vars) #'(Exp ... . Exps)))))))
+ ((unquote-splicing Exp ...)
+ (with-values (destruct Orig #'(Exp ...) (+ -1 depth))
+ (syntax-lambda (Builder Vars Exps)
+ (if (null? #'Vars)
+ (values #''(unquote-splicing Exp ...) '() '())
+ (values #'(cons 'unquote-splicing Builder)
+ #'Vars #'Exps)))))
+ ;; dots
+ (((unquote Exp) Dots)
+ (and (zero? depth) (ellipsis? #'Dots))
+ (with-temp X
+ (values #'X (list #'X) (list #'Exp))))
+ (((unquote Exp) Dots . Rest)
+ (and (zero? depth) (ellipsis? #'Dots))
+ (with-values (destruct Orig #'Rest depth)
+ (syntax-lambda (RestBuilder RestVars RestExps)
+ (with-syntax ((TailExp
+ (if (null? #'RestVars)
+ #''Rest
+ #'RestBuilder)))
+ (with-temp X
+ (values #'(append X TailExp)
+ (cons #'X #'RestVars)
+ (cons #'Exp #'RestExps)))))))
+ ((Exp Dots . Rest)
+ (and (zero? depth) (ellipsis? #'Dots))
+ (with-values (destruct Orig #'Exp depth)
+ (syntax-lambda (ExpBuilder (ExpVar ...) (ExpExp ...))
+ (if (null? #'(ExpVar ...))
+ (syntax-error Orig "Bad ellipsis")
+ (with-values (destruct Orig #'Rest depth)
+ (syntax-lambda (RestBuilder RestVars RestExps)
+ (with-syntax ((TailExp
+ (if (null? #'RestVars)
+ #''Rest
+ #'RestBuilder))
+ (Orig Orig))
+ (values #'(let f ((ExpVar ExpVar) ...)
+ (if (and (pair? ExpVar) ...)
+ (cons
+ (let ((ExpVar (car ExpVar)) ...)
+ ExpBuilder)
+ (f (cdr ExpVar) ...))
+ (if (and (null? ExpVar) ...)
+ TailExp
+ (error 'unquote
+ "Mismatched lists in ~s"
+ Orig))))
+ (append #'(ExpVar ...) #'RestVars)
+ (append #'(ExpExp ...) #'RestExps)))))))))
+ ;; Vectors
+ (#(X ...)
+ (with-values (destruct Orig #'(X ...) depth)
+ (syntax-lambda (LsBuilder LsVars LsExps)
+ (values #'(list->vector LsBuilder) #'LsVars #'LsExps))))
+ ;; random stuff
+ ((Hd . Tl)
+ (with-values (destruct Orig #'Hd depth)
+ (syntax-lambda (HdBuilder HdVars HdExps)
+ (with-values (destruct Orig #'Tl depth)
+ (syntax-lambda (TlBuilder TlVars TlExps)
+ (with-syntax ((Hd (if (null? #'HdVars)
+ #''Hd
+ #'HdBuilder))
+ (Tl (if (null? #'TlVars)
+ #''Tl
+ #'TlBuilder)))
+ (values #'(cons Hd Tl)
+ (append #'HdVars #'TlVars)
+ (append #'HdExps #'TlExps))))))))
+ (OtherThing
+ (values #''OtherThing '() '())))))
+ ;; macro begins
+ (syntax-case x ()
+ ((_ Datum)
+ (with-values (destruct #'(quasiquote Datum) #'Datum 0)
+ (syntax-lambda (Builder (Var ...) (Exp ...))
+ (if (null? #'(Var ...))
+ #''Datum
+ #'(let ((Var Exp) ...)
+ Builder))))))))
+
+(define-syntax extend-backquote
+ (lambda (x)
+ (syntax-case x ()
+ ((_ Template Exp ...)
+ (with-syntax ((quasiquote
+ (datum->syntax-object #'Template 'quasiquote)))
+ #'(let-syntax ((quasiquote
+ (lambda (x)
+ (syntax-case x ()
+ ((_ Foo) #'(my-backquote Foo))))))
+ Exp ...))))))
+
+;;; ------------------------------
+
+(define-syntax with-values
+ (syntax-rules ()
+ ((_ P C) (call-with-values (lambda () P) C))))
+
+(define-syntax letcc
+ (syntax-rules ()
+ ((_ V B0 B ...) (call/1cc (lambda (V) B0 B ...)))))
+
+(define classify-list
+ (lambda (ls)
+ (cond
+ ((null? ls) 'proper)
+ ((not (pair? ls)) 'improper)
+ (else
+ (let f ((tortoise ls) (hare (cdr ls)))
+ (cond
+ ((eq? tortoise hare) 'infinite)
+ ((null? hare) 'proper)
+ ((not (pair? hare)) 'improper)
+ (else
+ (let ((hare (cdr hare)))
+ (cond
+ ((null? hare) 'proper)
+ ((not (pair? hare)) 'improper)
+ (else (f (cdr ls) (cdr hare))))))))))))
+
+(define ilist-copy-flat
+ (lambda (ils)
+ (let f ((tortoise ils) (hare (cdr ils)))
+ (if (eq? tortoise hare)
+ (list (car tortoise))
+ (cons (car tortoise) (f (cdr tortoise) (cddr hare)))))))
+
+(define sexp-dispatch
+ (lambda (obj pat);; #f or list of vars
+ (letcc escape
+ (let ((fail (lambda () (escape #f))))
+ (let f ((pat pat) (obj obj) (vals '()))
+ (cond
+ ((eq? pat 'any)
+ (cons obj vals))
+ ((eq? pat 'each-any)
+ ;; handle infinities
+ (case (classify-list obj)
+ ((proper infinite) (cons obj vals))
+ ((improper) (fail))))
+ ((pair? pat)
+ (if (pair? obj)
+ (f (car pat) (car obj) (f (cdr pat) (cdr obj) vals))
+ (fail)))
+ ((vector? pat)
+ (case (vector-ref pat 0)
+ ((atom)
+ (let ((a (vector-ref pat 1)))
+ (if (eqv? obj a)
+ vals
+ (fail))))
+ ((vector)
+ (if (vector? obj)
+ (let ((vec-pat (vector-ref pat 1)))
+ (f vec-pat (vector->list obj) vals))
+ (fail)))
+ ((each)
+ ;; if infinite, copy the list as flat, then do the matching,
+ ;; then do some set-cdrs.
+ (let ((each-pat (vector-ref pat 1))
+ (each-size (vector-ref pat 2)))
+ (case (classify-list obj)
+ ((improper) (fail))
+ ((infinite)
+ (let ((each-vals (f pat (ilist-copy-flat obj) '())))
+ (for-each (lambda (x) (set-cdr! (last-pair x) x))
+ each-vals)
+ (append each-vals vals)))
+ ((proper)
+ (append
+ (let g ((obj obj))
+ (if (null? obj)
+ (make-list each-size '())
+ (let ((hd-vals (f each-pat (car obj) '()))
+ (tl-vals (g (cdr obj))))
+ (map cons hd-vals tl-vals))))
+ vals)))))
+ ((tail-each)
+ (let ((each-pat (vector-ref pat 1))
+ (each-size (vector-ref pat 2))
+ (revtail-pat (vector-ref pat 3))
+ (revtail-tail-pat (vector-ref pat 4)))
+ (when (eq? (classify-list obj) 'infinite) (fail))
+ (with-values
+ (let g ((obj obj))
+ ;; in-tail?, vals, revtail-left/ls
+ (cond
+ ((pair? obj)
+ (with-values (g (cdr obj))
+ (lambda (in-tail? vals tail-left/ls)
+ (if in-tail?
+ (if (null? tail-left/ls)
+ (values #f vals (list (car obj)))
+ (values #t (f (car tail-left/ls)
+ (car obj)
+ vals)
+ (cdr tail-left/ls)))
+ (values #f vals
+ (cons (car obj) tail-left/ls))))))
+ (else
+ (values #t
+ (f revtail-tail-pat obj '())
+ revtail-pat))))
+ (lambda (in-tail? vals tail-left/ls)
+ (if in-tail?
+ (if (null? tail-left/ls)
+ (append (make-list each-size '())
+ vals)
+ (fail))
+ (f each-pat tail-left/ls vals))))))))
+ (else
+ (if (eqv? obj pat)
+ vals
+ (fail)))))))))
+
+
+) ;; End Module
+
+
+#|
+
+;;; examples of passing along threaded information.
+
+;;; Try (collect-symbols '(if (x y 'a 'c zz) 'b 'c))
+;;; Note that it commonizes the reference to c.
+
+(define-syntax with-values
+ (syntax-rules ()
+ ((_ P C) (call-with-values (lambda () P) C))))
+(define collect-symbols
+ (lambda (exp)
+ (with-values (collect-symbols-help exp)
+ (lambda (symbol-decls exp)
+ (match symbol-decls
+ (((,symbol-name . ,symbol-var) ...)
+ `(let ((,symbol-var (quote ,symbol-name)) ...) ,exp)))))))
+(define collect-symbols-help
+ (lambda (exp)
+ (let ((symbol-env '()))
+ (match+ (symbol-env) exp
+ (,x
+ (guard (symbol? x))
+ (values symbol-env x))
+ ((quote ,x)
+ (guard (symbol? x))
+ (let ((pair/false (assq x symbol-env)))
+ (if pair/false
+ (values symbol-env (cdr pair/false))
+ (let ((v (gensym)))
+ (values (cons (cons x v) symbol-env)
+ v)))))
+ ((quote ,x)
+ (values symbol-env `(quote ,x)))
+ ((if ,[t] ,[c] ,[a])
+ (values symbol-env `(if ,t ,c ,a)))
+ ((,[op] ,[arg] ...)
+ (values symbol-env `(,op ,arg ...)))))))
+
+;;; the grammar for this one is just if-exprs and everything else
+
+(define collect-leaves
+ (lambda (exp acc)
+ (match+ (acc) exp
+ ((if ,[] ,[] ,[])
+ acc)
+ ((,[] ,[] ...)
+ acc)
+ (,x
+ (cons x acc)))))
+
+;; here's something that takes apart quoted stuff.
+
+(define destruct
+ (lambda (datum)
+ (match datum
+ (() `'())
+ ((,[X] . ,[Y])`(cons ,X ,Y))
+ (#(,[X] ...) `(vector ,X ...))
+ (,thing
+ (guard (symbol? thing))
+ `',thing)
+ (,thing
+ thing))))
+
+;; examples using explicit Catas
+
+(define sumsquares
+ (lambda (ls)
+ (define square
+ (lambda (x)
+ (* x x)))
+ (match ls
+ [(,[a*] ...) (apply + a*)]
+ [,[square -> n] n])))
+
+(define sumsquares
+ (lambda (ls)
+ (define square
+ (lambda (x)
+ (* x x)))
+ (let ([acc 0])
+ (match+ (acc) ls
+ [(,[] ...) acc]
+ [,[(lambda (acc x) (+ acc (square x))) ->] acc]))))
+
+;;; The following uses explicit Catas to parse programs in the
+;;; simple language defined by the grammar below
+
+;;; <Prog> -> (program <Stmt>* <Expr>)
+;;; <Stmt> -> (if <Expr> <Stmt> <Stmt>)
+;;; | (set! <var> <Expr>)
+;;; <Expr> -> <var>
+;;; | <integer>
+;;; | (if <Expr> <Expr> <Expr>)
+;;; | (<Expr> <Expr*>)
+
+
+(define parse
+ (lambda (x)
+ (define Prog
+ (lambda (x)
+ (match x
+ [(program ,[Stmt -> s*] ... ,[Expr -> e])
+ `(begin ,s* ... ,e)]
+ [,other (error 'parse "invalid program ~s" other)])))
+ (define Stmt
+ (lambda (x)
+ (match x
+ [(if ,[Expr -> e] ,[Stmt -> s1] ,[Stmt -> s2])
+ `(if ,e ,s1 ,s2)]
+ [(set! ,v ,[Expr -> e])
+ (guard (symbol? v))
+ `(set! ,v ,e)]
+ [,other (error 'parse "invalid statement ~s" other)])))
+ (define Expr
+ (lambda (x)
+ (match x
+ [,v (guard (symbol? v)) v]
+ [,n (guard (integer? n)) n]
+ [(if ,[e1] ,[e2] ,[e3])
+ `(if ,e1 ,e2 ,e3)]
+ [(,[rator] ,[rand*] ...) `(,rator ,rand* ...)]
+ [,other (error 'parse "invalid expression ~s" other)])))
+ (Prog x)))
+;;; (parse '(program (set! x 3) (+ x 4))) => (begin (set! x 3) (+ x 4))
+
+;; CHANGELOG (most recent changes are logged at the top of this file)
+
+;; [29 Feb 2000]
+;; Fixed a case sensitivity bug.
+
+;; [24 Feb 2000]
+;; Matcher now handles vector patterns. Quasiquote also handles
+;; vector patterns, but does NOT do the csv6.2 optimization of
+;; `#(a 1 ,(+ 3 4) x y) ==> (vector 'a 1 (+ 3 4) 'x 'y).
+;; Also fixed bug in (P ... . P) matching code.
+
+;; [23 Feb 2000]
+;; KSM fixed bug in unquote-splicing inside quasiquote.
+
+;; [10 Feb 2000]
+;; New forms match+ and trace-match+ thread arguments right-to-left.
+;; The pattern (P ... . P) now works the way you might expect.
+;; Infinite lists are now properly matched (and not matched).
+;; Removed the @ pattern.
+;; Internal: No longer converting into syntax-case.
+
+;; [6 Feb 2000]
+;; Added expansion-time error message for referring to cata variable
+;; in a guard.
+
+;; [4 Feb 2000]
+;; Fixed backquote so it can handle nested backquote (oops).
+;; Double-backquoted ellipses are neutralized just as double-backquoted
+;; unquotes are. So:
+;; `(a ,'(1 2 3) ... b) =eval=> (a 1 2 3 b)
+;; ``(a ,'(1 2 3) ... b) =eval=> `(a ,'(1 2 3) ... b)
+;; ``(a ,(,(1 2 3) ...) b) =eval=> `(a ,(1 2 3) b)
+;; Added support for
+;; `((unquote-splicing x y z) b) =expand==> (append x y z (list 'b))
+
+;; [1 Feb 2000]
+;; Fixed a bug involving forgetting to quote stuff in the revised backquote.
+;; Recognized unquote-splicing and signalled errors in the appropriate places.
+;; Added support for deep ellipses in backquote.
+;; Rewrote backquote so it does the rebuilding directly instead of
+;; expanding into Chez's backquote.
+
+;; [31 Jan 2000]
+;; Kent Dybvig fixed template bug.
+
+;; [31 Jan 2000]
+;; Added the trace-match form, and made guards contain
+;; an explicit and expression:
+;; (guard E ...) ==> (guard (and E ...))
+
+;; [26 Jan 2000]
+;; Inside the clauses of match expressions, the following
+;; transformation is performed inside backquote expressions:
+;;    ,v ... ==> ,@v
+;; (,v ,w) ... ==> ,@(map list v w)
+;; etc.
+
+|#
\ No newline at end of file
diff --git a/old_script/parser2.ss b/old_script/parser2.ss
new file mode 100644
index 0000000..dae3a7b
--- /dev/null
+++ b/old_script/parser2.ss
@@ -0,0 +1,45 @@
+#! /bin/bash
+#|
+exec chez --script "$0" ${1+"$@"}
+|#
+
+(printf "Called with ~s\n" (command-line-arguments))
+
+(include "match.ss")
+(import iu-match)
+
+;[2001.07.15]
+(define port->slist
+ (lambda (p)
+ (let porttoslistloop ([exp (read p)] [acc '()])
+ (if (eof-object? exp)
+ (begin (close-input-port p)
+ (reverse! acc))
+ (porttoslistloop (read p) (cons exp acc))))))
+;; [2008.05.07] Adding a hack to tolerate #! lines at the top of the file.
+(define file->slist
+ (lambda (filename . opts)
+ (let* ([p (apply open-input-file filename opts)]
+ [line1 (get-line p)])
+ ;; Doesn't allow whitespace!
+ (if (and (>= (string-length line1) 2)
+ (string=? "#!" (substring line1 0 2)))
+ ;; Read the rest of it straight away:
+ (port->slist p)
+        ;; Oops, we need to "put back" that first line. We do that by just starting over.
+ (begin (close-input-port p)
+ (port->slist (apply open-input-file filename opts))))
+ )))
+
+(define sexps (file->slist (car (command-line-arguments))))
+
+(define wstrings
+ (match sexps
+ [() '()]
+ [(,uq ,[x]) (guard (eq? uq 'unquote)) x]
+ [,s (guard (symbol? s)) (symbol->string s)]
+ [(,[hd] . ,[tl]) `(,hd . ,tl)]
+ ))
+
+(pretty-print wstrings)
+
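Concretely (illustration only; the input file is hypothetical): for a file whose single top-level form is (a (b c)), file->slist returns ((a (b c))) and the match above rebuilds the same shape with symbols turned into strings, dropping ,x unquote wrappers along the way:

;; input file contents:  (a (b c))
;; sexps                 ((a (b c)))
;; wstrings              (("a" ("b" "c")))
;; and a form such as (a ,b) comes out as ("a" "b")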
diff --git a/old_script/parser_fullnames.ss b/old_script/parser_fullnames.ss
new file mode 100644
index 0000000..06d0e2f
--- /dev/null
+++ b/old_script/parser_fullnames.ss
@@ -0,0 +1,187 @@
+#! /bin/bash
+#|
+exec mzscheme -qu "$0" ${1+"$@"}
+|#
+
+;; I'm duplicating the code from parser.ss and hacking it.
+
+(module parser_fullnames mzscheme
+ ;; Import the parser and lexer generators.
+ (require (lib "yacc.ss" "parser-tools")
+ (lib "lex.ss" "parser-tools")
+ (prefix : (lib "lex-sre.ss" "parser-tools"))
+ (lib "list.ss")
+ ;(lib "iu-match.ss")
+ "iu-match.ss"
+ )
+; (provide (all-defined))
+
+(require (lib "pretty.ss"))
+
+(define-empty-tokens op-tokens
+ (LeftParen RightParen Comma SemiColon Colon
+ EOF ))
+
+(define-tokens value-tokens (NAME NUM))
+
+(define-lex-abbrevs (lower-letter (:/ "a" "z"))
+ (upper-letter (:/ #\A #\Z))
+ (digit (:/ "0" "9"))
+ (digits (:* digit))
+ (letter (:or lower-letter upper-letter digit "." "'" "-" "/"))
+ (name (:seq (:+ letter) (:* (:seq "_" (:* letter)))))
+ (number (:seq digits "." digits (:? (:seq "E-" digits))))
+ )
+
+(define ws-lex
+ (lexer-src-pos
+ [(eof) 'EOF]
+
+ ;; Ignore all whitespace:
+ [(:or #\tab #\space #\newline #\return) (return-without-pos (ws-lex input-port))]
+
+ ["(" 'LeftParen]
+ [")" 'RightParen]
+ ["," 'Comma] [";" 'SemiColon] [":" 'Colon]
+
+ [(:seq "'silva_" name "'")
+ (begin ;(printf "TEST ~a\n" (substring lexeme 7 (sub1 (string-length lexeme))))
+ (token-NAME (substring lexeme 7 (sub1 (string-length lexeme))))
+ )]
+
+ [(:seq "silva_" name) (token-NAME (substring lexeme 6 (string-length lexeme)))]
+
+
+ ;[number (token-NUM (string->number lexeme))]
+ [number (token-NUM lexeme)]
+
+ ))
+
+(define (format-pos pos)
+ (if (position-line pos)
+ (format "line ~a:~a" (position-line pos) (position-col pos))
+ (format "char ~a" (position-offset pos))))
+
+(define ws-parse
+ (parser
+ (src-pos)
+ (start wholefile)
+ (end EOF)
+
+ (tokens value-tokens op-tokens)
+ (error (lambda (a b c start end)
+ (printf "PARSE ERROR: after ~a token, ~a~a.\n"
+ (if a "valid" "invalid") b
+ (if c (format " carrying value ~s" c) ""))
+ (printf " Located between ~a and ~a.\n"
+ (format-pos start) (format-pos end))))
+ ;; Precedence:
+ ;(precs )
+ ;(debug "_parser.log")
+
+ (grammar
+
+ (wholefile [(tree SemiColon) $1]
+ ;; ACK making semicolon and final tag optional!!
+ [(tree) $1]
+ [(LeftParen nodes+ RightParen) (cons "0.0" $2)]
+ )
+
+ (numtag [(Colon NUM) $2])
+
+ (tree [(NAME numtag) (list $2 $1)]
+ [(LeftParen nodes+ RightParen numtag) (cons $4 $2)])
+
+ (nodes+ [(tree) (list $1)]
+ [(tree Comma nodes+) (cons $1 $3)])
+
+)))
+
+;; ================================================================================
+
+(define (ws-parse-file f)
+ (let ([p (open-input-file f)])
+ (let ([res (ws-parse-port p)])
+ (close-input-port p)
+ res)))
+
+(define (ws-parse-port ip)
+ (port-count-lines! ip)
+ ;; cdr
+ (ws-parse (lambda () (flatten (ws-lex ip))))
+ ;(position-token-token (ws-lex ip))
+ )
+
+(define (flatten pt)
+ ;(printf " ")
+ (let loop ((x pt))
+ (if (position-token? (position-token-token x))
+ (begin (error 'flatten "No nested position-tokens!")
+ (loop (position-token-token x)))
+ x)))
+
+(define allargs (vector->list (current-command-line-arguments)))
+(define pretty? #t)
+(define (main filename)
+ #;
+ (if pretty?
+ (pretty-print (ws-parse-file filename))
+ (begin (write (ws-parse-file filename))
+ (newline)
+ ))
+
+ (define table '())
+ (define counter 1)
+ (define (clean-name n)
+ (define matches (regexp-match #rx".*__(.*)" n))
+ (define cleaned (car (reverse matches)))
+ ;(printf "MATCHES ~a\n" matches)
+ (set! table `((,counter ,cleaned) . ,table))
+ (set! counter (+ 1 counter))
+ ;(sub1 counter)
+
+ ;; THIS IS ALL I CHANGED:
+ cleaned
+ )
+
+ (with-output-to-file (string-append filename ".nameonly")
+ (lambda ()
+ (let loop ((x (ws-parse-file filename)))
+ (match x
+ ;; A leaf:
+ [(,dist ,name) (guard (string? name))
+ (printf "~a:~a" (clean-name name) dist)]
+ [(,dist ,children ...)
+ (printf "(")
+ (let inner ((ls children))
+ (cond
+ [(null? ls) (void)]
+ [(null? (cdr ls)) (loop (car ls))]
+ [else (loop (car ls)) (printf ",")
+ (inner (cdr ls))]))
+ (printf "):~a" dist)]
+ ))
+ (printf ";\n")))
+
+ (with-output-to-file (string-append filename ".table")
+ (lambda ()
+ (for-each
+ (lambda (ls) (printf "~a ~a\n" (car ls) (cadr ls)))
+ (reverse table))))
+ )
+
+(print-graph #t)
+(print-vector-length #f)
+
+
+;; Here's our script invocation:
+
+;; When run in --persist mode we run in a loop.
+(if (null? allargs)
+ (error 'wsparse "No filename provided...")
+ (main (car allargs)))
+(printf "Done writing output files.\n")
+;(exit 0)
+
+) ;; End module.
+
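For reference, here is what the grammar above produces on a tiny input (illustration only; the file is hypothetical). ws-lex only accepts leaf names carrying the silva_ prefix, which it strips, and ws-parse builds (length-tag child ...) lists:

;; contents of tiny.tr:
;;   (silva_A:0.10,silva_B:0.20):0.0;
;; (ws-parse-file "tiny.tr") returns:
;;   ("0.0" ("0.10" "A") ("0.20" "B"))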
diff --git a/phybin.cabal b/phybin.cabal
new file mode 100644
index 0000000..1e8c417
--- /dev/null
+++ b/phybin.cabal
@@ -0,0 +1,149 @@
+Name: phybin
+Version: 0.3
+License: BSD3
+License-file: LICENSE
+Stability: Beta
+Author: Ryan Newton <rrnewton at gmail.com>
+Maintainer: Ryan Newton <rrnewton at gmail.com>
+
+
+-- Version history:
+-- 0.1.2 -- first significant release
+-- 0.1.2.1 --
+-- 0.1.2.4 -- new release, several new features
+-- 0.1.2.5 -- bump for graphviz API changes
+-- 0.2 -- Add Robinson-Foulds distance, use Int labels.
+-- 0.2.2 -- misc changes and expose library
+-- 0.2.4 -- add consensus trees
+-- 0.2.6 -- colorization, hashrf, misc improvements
+-- 0.2.7 -- Add command line opt --showtrees
+-- 0.2.8 -- Add command line opt --highlight
+-- 0.2.9 -- Add command line opt --interior
+-- 0.2.10 -- Add command line opt --matching
+-- 0.2.11 -- Cleanup and windows compatibility.
+-- 0.3 -- significant improvements and new functionality.
+
+-- homepage: http://code.haskell.org/phybin
+-- homepage: http://people.csail.mit.edu/newton/phybin/
+homepage: http://www.cs.indiana.edu/~rrnewton/projects/phybin/
+
+Copyright: Copyright (c) 2010 Ryan Newton
+Synopsis: Utility for clustering phylogenetic trees in Newick format based on Robinson-Foulds distance.
+Description:
+  This package provides a library and executable for dealing with Newick tree files.
+ .
+ It can do simple binning of identical trees or more complex clustering based on
+ an all-to-all Robinson-Foulds distance matrix.
+ .
+ phybin produces output files that characterize the size and contents of each bin or cluster (including
+ generating GraphViz-based visual representations of the tree topologies).
+
+Category: Bioinformatics
+Cabal-Version: >= 1.8
+
+Extra-source-files: README.md
+
+Build-type: Simple
+
+Source-repository head
+ type: git
+ location: git://github.com/rrnewton/PhyBin.git
+
+Flag hashrf
+ description: Use the HashRF algorithm by default instead of the naive one.
+ default: True
+
+Flag bitvec
+ description: Use bitvectors rather than IntSets for bipartitions.
+ default: False
+
+Flag sequential
+ description: Don't use any parallelism at all.
+ default: False
+
+Library
+ Exposed-modules: Bio.Phylogeny.PhyBin
+ Bio.Phylogeny.PhyBin.Binning
+ Bio.Phylogeny.PhyBin.CoreTypes
+ Bio.Phylogeny.PhyBin.Parser
+ Bio.Phylogeny.PhyBin.PreProcessor
+ Bio.Phylogeny.PhyBin.RFDistance
+ Bio.Phylogeny.PhyBin.Util
+ Bio.Phylogeny.PhyBin.Visualize
+ -- Data.BitList
+ Build-Depends: base >= 3 && < 5, directory, process, containers,
+ async, time,
+ filepath,
+ graphviz >= 2999.16,
+ text >= 0.11 && < 0.12,
+ prettyclass, fgl,
+ HUnit, bytestring,
+ -- For bytestring.lazy support:
+ parsec >= 3.1.0,
+ vector >= 0.10,
+ hierarchical-clustering >= 0.4, split >= 0.2
+-- lattice-par,
+ GHC-Options: -O2 -funbox-strict-fields -rtsopts
+
+ if flag(hashrf)
+ CPP-Options: -DUSE_HASHRF
+ if flag(bitvec)
+ CPP-Options: -DBITVEC_BIPS
+ Build-Depends: bitvec >= 0.1
+ if flag(sequential)
+ CPP-Options: -DSEQUENTIALIZE
+
+Executable phybin
+ Main-is: Main.hs
+ Build-Depends: phybin
+ -- DUPLICATE:
+ Build-Depends: base >= 3 && < 5, directory, process, containers,
+ async, time,
+ filepath,
+ graphviz >= 2999.16,
+ text >= 0.11 && < 0.12,
+ prettyclass, fgl,
+ HUnit, bytestring,
+ -- For bytestring.lazy support:
+ parsec >= 3.1.0,
+ vector >= 0.10,
+ hierarchical-clustering >= 0.4, split >= 0.2
+-- lattice-par,
+ GHC-Options: -O2 -funbox-strict-fields -rtsopts -threaded
+
+ if flag(hashrf)
+ CPP-Options: -DUSE_HASHRF
+ if flag(bitvec)
+ CPP-Options: -DBITVEC_BIPS
+ Build-Depends: bitvec >= 0.1
+ if flag(sequential)
+ CPP-Options: -DSEQUENTIALIZE
+
+
+-- DUPLICATED from executable:
+Test-Suite test-phybin
+ Main-is: TestAll.hs
+ Type: exitcode-stdio-1.0
+-- Build-Depends: phybin
+ Build-Depends: base >= 3 && < 5, directory, process, containers,
+ async, time,
+ filepath,
+ graphviz >= 2999.16,
+ text >= 0.11 && < 0.12,
+ prettyclass, fgl,
+ HUnit, bytestring,
+ parsec >= 3.1.0,
+ vector >= 0.10,
+ hierarchical-clustering >= 0.4, split >= 0.2
+ -- Additional depends for test:
+ Build-Depends: HUnit, test-framework, test-framework-hunit, test-framework-th
+ GHC-Options: -O2 -funbox-strict-fields -rtsopts -threaded
+
+ if flag(hashrf)
+ CPP-Options: -DUSE_HASHRF
+ if flag(bitvec)
+ CPP-Options: -DBITVEC_BIPS
+ Build-Depends: bitvec >= 0.1
+ if flag(sequential)
+ CPP-Options: -DSEQUENTIALIZE
+
diff --git a/point_check.sh b/point_check.sh
new file mode 100755
index 0000000..3a213c5
--- /dev/null
+++ b/point_check.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+arg1=$1
+arg2=$2
+
+phybin -p2 -b0.02 -n10 --complete --editdist=0 --rfdist Wolbachia/trees/*.$arg1.codon Wolbachia/trees/*.$arg2.codon
diff --git a/stripTable.hs b/stripTable.hs
new file mode 100644
index 0000000..bb2a8e8
--- /dev/null
+++ b/stripTable.hs
@@ -0,0 +1,50 @@
+
+
+-- | This is a quick hack to split up a table in a tab-delimited file of
+-- the format Irene produces. The result is a simple two-column,
+-- white-space delimited table of the kind phybin expects.
+
+
+import System.Environment (getArgs)
+
+import Control.Monad
+import Data.List
+import Data.List.Split
+
+import System.IO
+
+
+isDataLine :: String -> Bool
+isDataLine l =
+ case words l of
+ ("Roundup":"Orthology":_) -> False
+ (_:"results":"found":_) -> False
+ ("Gene":"Cluster":_) -> False
+ ("Id":"Genome":_) -> False
+ [] -> False
+ _ -> True
+
+main = do
+ args <- getArgs
+ let file = case args of
+ [f] -> f
+ _ -> error "Expects one argument!! [filename]"
+
+ raw <- readFile file
+ let lns = lines raw
+ filt = filter isDataLine lns
+ toks = map (splitOn ['\t']) filt
+ put = hPutStrLn stderr
+
+ put$"Read "++show (length lns)++" lines from file "++show file
+ put$" "++show (length filt)++" contain data"
+ put$" Distinct #toks found on data lines: " ++ show(nub (map length toks))
+ put$" A sample of ten parsed lines :"
+ mapM_ (put . (" "++) . show) $ take 10 $ toks
+
+ put$" Echoing columns one and two in space-separated form:"
+
+ forM_ toks $ \ (one:two:_) ->
+ putStrLn$ one ++" "++ two
+
+ return ()
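Since stripTable.hs prints its diagnostics to stderr and the two-column table to stdout, a typical run would redirect standard output into the file that phybin is later given. A hypothetical usage sketch (the file names are illustrative):

    # Convert a Roundup-style tab-delimited table into the two-column,
    # whitespace-delimited form phybin expects; progress messages go to
    # stderr, the table itself to stdout.
    runghc stripTable.hs roundup_orthologs.tsv > name_table.txt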
diff --git a/tests/t1/112.tr b/tests/t1/112.tr
new file mode 100644
index 0000000..d5425ac
--- /dev/null
+++ b/tests/t1/112.tr
@@ -0,0 +1 @@
+((6_134_D-alanine--D-alanine_ligase_wRi_WRi_001550:0.00139828386866881348,7_88_D-alanine--D-alanine_ligase_wMel_WD0095:0.00292963228134226648)90:0.01074298822528597409,((1_177_D-alanine--D-alanine_ligase_family_protein_wUni-gwu_gwu_177:0.00000260053623247659,2_1056_D-alanine--D-alanine_ligase_family_protein_wVitA-gwv_gwv_1056:0.00000260053623247659)100:0.04173654858419943714,((3_584_D-alanine--D-alanine_ligase_wBm_Wbm0570:0.12287024242687821785,14_97_D-alanine--D-alanine_ligase_wOo_wOo_0 [...]
diff --git a/tests/t1/13.tr b/tests/t1/13.tr
new file mode 100644
index 0000000..f6526dd
--- /dev/null
+++ b/tests/t1/13.tr
@@ -0,0 +1 @@
+(1_903_uncharacterized_protein_involved_in_outer_membrane_biogenesis_wUni-gwu_gwu_903:0.00040202748005558947,(6_829_hypothetical_protein_wRi_WRi_009630:0.01489729960416468282,(7_929_hypothetical_protein_wMel_WD1011:0.00503560768415985908,(18_792_hypothetical_protein_wHa_wHa_08450:0.00231874015436256235,((14_125_hypothetical_protein_wOo_wOo_01830:0.25811131256411451451,3_124_hypothetical_protein_wBm_Wbm0121:0.13916197071632291360)100:0.06927753636164095397,(19_321_hypothetical_protein_wNo [...]
diff --git a/tests/t2/116.tr b/tests/t2/116.tr
new file mode 100644
index 0000000..d5a3a0c
--- /dev/null
+++ b/tests/t2/116.tr
@@ -0,0 +1 @@
+(3_1_uroporphyrinogen-III_decarboxylase_wBm_Wbm0001:0.24700160468123241730,((((6_846_uroporphyrinogen_decarboxylase_wRi_WRi_009840:0.00000450221771908982,18_807_Uroporphyrinogen_decarboxylase_wHa_wHa_08610:0.00294819211409539598)36:0.00000450221771908982,7_945_uroporphyrinogen_decarboxylase_wMel_WD1028:0.00196333316748527244)98:0.00593311441869561872,(2_624_uroporphyrinogen_decarboxylase_wVitA-gwv_gwv_624:0.00000450221771908982,1_914_uroporphyrinogen_decarboxylase_wUni-gwu_gwu_914:0.0009 [...]
diff --git a/tests/t2/233.tr b/tests/t2/233.tr
new file mode 100644
index 0000000..2656a8d
--- /dev/null
+++ b/tests/t2/233.tr
@@ -0,0 +1 @@
+(5_540_TrbL/VirB6_plasmid_conjugal_transfer_family_protein_wPip-Pel_WPa_0601:0.02084954811541155431,(19_278_Type_IV_secretion_system_protein_VirB6_putative_wNo_wNo_02980:0.05379840855764397162,((3_826_Type_IV_secretory_pathway_VirB6_components_wBm_Wbm0795:0.13408854198340605657,14_87_type_IV_secretory_pathway_VirB6_components_wOo_wOo_01320:0.21280678010849515824)79:0.04287407238705213258,((2_331_trbL/VirB6_plasmid_conjugal_transfer_family_protein_wVitA-gwv_gwv_331:0.00000263655140029384, [...]
diff --git a/tests/t30_mismatched/mismatched.tr b/tests/t30_mismatched/mismatched.tr
new file mode 100644
index 0000000..9b788c9
--- /dev/null
+++ b/tests/t30_mismatched/mismatched.tr
@@ -0,0 +1,3 @@
+(2_, (((14, 3_), (19, (5_, 13))), (18, (6_, 7_))), 1_);
+(2_, (((14, 3_), (19, (5_, 13))), (6_, 7_)), 1_);
+(2_, (((14, 3_), (19, (5_, 13))), (18, 6_, 7_)), 1_);
diff --git a/tests/t3_consensus/cluster1_284_alltrees.tr b/tests/t3_consensus/cluster1_284_alltrees.tr
new file mode 100644
index 0000000..e2f1176
--- /dev/null
+++ b/tests/t3_consensus/cluster1_284_alltrees.tr
@@ -0,0 +1,284 @@
+(3_, (((13, 5_), 19), ((2_, 1_), ((7_, 6_), 18))), 14);
+(2_, (((7_, 6_), 18), (((5_, 13), 19), (3_, 14))), 1_);
+(6_, (18, (((19, (5_, 13)), (3_, 14)), (2_, 1_))), 7_);
+((7_, 6_), (((19, (5_, 13)), (14, 3_)), (1_, 2_)), 18);
+((13, 5_), ((14, 3_), (((6_, 7_), 18), (1_, 2_))), 19);
+(2_, (((14, 3_), (19, (5_, 13))), (18, (6_, 7_))), 1_);
+((5_, 13), ((14, 3_), ((18, (7_, 6_)), (2_, 1_))), 19);
+(14, ((((7_, 6_), 18), (2_, 1_)), ((5_, 13), 19)), 3_);
+(13, (((3_, 14), ((1_, 2_), (18, (6_, 7_)))), 19), 5_);
+((13, 5_), ((((7_, 6_), (1_, 2_)), 18), (14, 3_)), 19);
+(5_, (19, ((18, ((6_, 7_), (2_, 1_))), (14, 3_))), 13);
+(1_, (((((5_, 13), 19), (14, 3_)), 18), (6_, 7_)), 2_);
+(((18, ((6_, 7_), (2_, 1_))), (3_, 14)), (5_, 13), 19);
+(((7_, 6_), (1_, 2_)), ((19, (5_, 13)), (14, 3_)), 18);
+(1_, ((((14, 3_), ((13, 5_), 19)), 18), (6_, 7_)), 2_);
+(6_, ((((19, (13, 5_)), (3_, 14)), 18), (1_, 2_)), 7_);
+(14, (((5_, 13), 19), (((2_, 1_), 18), (7_, 6_))), 3_);
+(5_, (19, ((((2_, 1_), 18), (7_, 6_)), (14, 3_))), 13);
+(2_, (18, (((14, 3_), (19, (5_, 13))), (7_, 6_))), 1_);
+((((14, 3_), (19, (5_, 13))), (6_, 7_)), (1_, 2_), 18);
+(2_, (((((5_, 19), 13), (3_, 14)), 18), (7_, 6_)), 1_);
+(14, ((((1_, 2_), (6_, 7_)), 18), (13, (5_, 19))), 3_);
+(((3_, 14), (13, (5_, 19))), ((7_, 6_), (2_, 1_)), 18);
+(7_, ((1_, 2_), (18, ((3_, 14), (13, (19, 5_))))), 6_);
+(14, ((18, ((6_, 7_), (2_, 1_))), (13, (19, 5_))), 3_);
+(6_, ((((13, (5_, 19)), (3_, 14)), 18), (2_, 1_)), 7_);
+(3_, (((19, 5_), 13), (18, ((6_, 7_), (1_, 2_)))), 14);
+(((1_, 2_), (6_, 7_)), ((3_, 14), (13, (5_, 19))), 18);
+((6_, 7_), ((1_, 2_), ((3_, 14), (13, (19, 5_)))), 18);
+(3_, ((13, (19, 5_)), ((18, (6_, 7_)), (2_, 1_))), 14);
+(19, ((((2_, 1_), (18, (6_, 7_))), (14, 3_)), 13), 5_);
+(7_, (18, ((1_, 2_), ((3_, 14), (13, (19, 5_))))), 6_);
+(14, (((18, (6_, 7_)), (2_, 1_)), (13, (19, 5_))), 3_);
+(6_, (18, ((2_, 1_), (((19, 5_), 13), (14, 3_)))), 7_);
+((7_, 6_), ((1_, 2_), ((14, 3_), (13, (19, 5_)))), 18);
+(7_, (((13, (19, 5_)), (3_, 14)), (18, (2_, 1_))), 6_);
+(3_, (((18, (2_, 1_)), (7_, 6_)), (13, (5_, 19))), 14);
+(14, ((5_, (13, 19)), ((6_, 7_), ((2_, 1_), 18))), 3_);
+(13, (5_, ((3_, 14), ((18, 7_), (6_, (2_, 1_))))), 19);
+(3_, (((6_, (2_, 1_)), (18, 7_)), (5_, (13, 19))), 14);
+((19, 13), (((6_, (1_, 2_)), (7_, 18)), (3_, 14)), 5_);
+(((7_, 18), ((14, 3_), (5_, (19, 13)))), (1_, 2_), 6_);
+(1_, (((18, 7_), 6_), ((5_, (19, 13)), (14, 3_))), 2_);
+(3_, (((13, 19), 5_), ((6_, (18, 7_)), (1_, 2_))), 14);
+(3_, (((1_, 2_), ((7_, 18), 6_)), (5_, (19, 13))), 14);
+(3_, ((5_, (13, 19)), ((1_, 2_), ((18, 7_), 6_))), 14);
+((18, 7_), (((14, 3_), (5_, (13, 19))), (2_, 1_)), 6_);
+(7_, (6_, ((2_, 1_), (((13, 19), 5_), (14, 3_)))), 18);
+(((14, 3_), ((2_, 1_), ((7_, 18), 6_))), (13, 19), 5_);
+(18, ((2_, 1_), ((((19, 13), 5_), (14, 3_)), 6_)), 7_);
+(19, (5_, ((6_, ((1_, 2_), (7_, 18))), (3_, 14))), 13);
+(3_, ((6_, ((2_, 1_), (18, 7_))), (5_, (13, 19))), 14);
+(((2_, 1_), (7_, 18)), ((5_, (19, 13)), (14, 3_)), 6_);
+((19, 13), ((6_, ((2_, 1_), (7_, 18))), (3_, 14)), 5_);
+(2_, ((7_, 18), (((3_, 14), ((19, 13), 5_)), 6_)), 1_);
+(13, (5_, ((6_, ((1_, 2_), (18, 7_))), (3_, 14))), 19);
+(14, ((5_, (13, 19)), (((2_, 1_), (18, 7_)), 6_)), 3_);
+(13, (5_, ((6_, ((1_, 2_), (18, 7_))), (3_, 14))), 19);
+(13, (5_, ((((2_, 1_), (18, 6_)), 7_), (14, 3_))), 19);
+((19, 13), ((14, 3_), (7_, ((1_, 2_), (18, 6_)))), 5_);
+(2_, ((6_, 18), (7_, ((5_, (19, 13)), (3_, 14)))), 1_);
+(((18, 6_), (1_, 2_)), ((5_, (13, 19)), (3_, 14)), 7_);
+(13, (5_, ((14, 3_), (((1_, 2_), (6_, 18)), 7_))), 19);
+(((3_, 14), (7_, ((2_, 1_), (6_, 18)))), (19, 13), 5_);
+((19, 13), ((7_, ((6_, 18), (1_, 2_))), (14, 3_)), 5_);
+(6_, ((((5_, (19, 13)), (14, 3_)), 7_), (2_, 1_)), 18);
+(6_, ((7_, ((3_, 14), (5_, (19, 13)))), (1_, 2_)), 18);
+(1_, (((5_, (13, 19)), (3_, 14)), (7_, (18, 6_))), 2_);
+((19, 13), (((7_, (6_, 18)), (2_, 1_)), (3_, 14)), 5_);
+((13, 19), (((2_, 1_), ((6_, 18), 7_)), (14, 3_)), 5_);
+(1_, (((5_, (13, 19)), (3_, 14)), (7_, (6_, 18))), 2_);
+(6_, (7_, (((14, 3_), (5_, (13, 19))), (2_, 1_))), 18);
+(1_, ((7_, (6_, 18)), ((5_, (19, 13)), (14, 3_))), 2_);
+(14, ((5_, (19, 13)), ((2_, 1_), ((6_, 18), 7_))), 3_);
+(1_, (((3_, 14), (5_, (13, 19))), (7_, (18, 6_))), 2_);
+((13, 19), (((2_, 1_), (7_, (18, 6_))), (14, 3_)), 5_);
+(2_, (((5_, (13, 19)), (14, 3_)), (7_, (18, 6_))), 1_);
+(1_, (((6_, 18), 7_), ((3_, 14), (5_, (13, 19)))), 2_);
+(14, (((7_, (6_, 18)), (2_, 1_)), (5_, (13, 19))), 3_);
+(18, ((7_, (1_, 2_)), ((5_, (13, 19)), (3_, 14))), 6_);
+(((18, 6_), ((5_, (13, 19)), (3_, 14))), (2_, 1_), 7_);
+(1_, (7_, ((18, 6_), ((3_, 14), ((13, 19), 5_)))), 2_);
+(19, (5_, ((14, 3_), (((1_, 2_), 7_), (18, 6_)))), 13);
+(18, (((5_, (13, 19)), (14, 3_)), (7_, (1_, 2_))), 6_);
+((1_, 2_), (((5_, (19, 13)), (14, 3_)), (18, 6_)), 7_);
+(2_, ((7_, 6_), (((3_, 14), ((13, 19), 5_)), 18)), 1_);
+(14, ((5_, (19, 13)), (((7_, 6_), (1_, 2_)), 18)), 3_);
+(19, (((14, 3_), (((2_, 1_), (6_, 7_)), 18)), 5_), 13);
+(2_, ((((5_, (19, 13)), (3_, 14)), 18), (7_, 6_)), 1_);
+((13, 19), ((18, ((6_, 7_), (1_, 2_))), (3_, 14)), 5_);
+(2_, (((((14, 3_), ((19, 13), 5_)), 18), 6_), 7_), 1_);
+(((7_, (1_, 2_)), 6_), ((14, 3_), (5_, (13, 19))), 18);
+(1_, (7_, (6_, (18, ((5_, (13, 19)), (3_, 14))))), 2_);
+(((14, 3_), (18, ((7_, (1_, 2_)), 6_))), (13, 19), 5_);
+((13, 19), ((18, (6_, (7_, (1_, 2_)))), (3_, 14)), 5_);
+(3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14);
+(2_, ((((5_, 13), 19), (14, 3_)), (7_, (6_, 18))), 1_);
+(2_, (((3_, 14), ((5_, 13), 19)), ((6_, 18), 7_)), 1_);
+(1_, (((19, (13, 5_)), (3_, 14)), ((18, 6_), 7_)), 2_);
+(((2_, 1_), (((13, 5_), 19), (3_, 14))), (6_, 18), 7_);
+(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);
+((6_, 18), (((19, (13, 5_)), (14, 3_)), (1_, 2_)), 7_);
+(((2_, 1_), ((19, (13, 5_)), (14, 3_))), (6_, 18), 7_);
+(2_, (((19, (5_, 13)), (3_, 14)), ((6_, 18), 7_)), 1_);
+(5_, (((3_, 14), (((6_, 18), 7_), (1_, 2_))), 19), 13);
+(13, (19, (((2_, 1_), ((18, 6_), 7_)), (3_, 14))), 5_);
+(14, ((19, (5_, 13)), ((7_, (6_, 18)), (2_, 1_))), 3_);
+(5_, (19, ((3_, 14), ((1_, 2_), (7_, (18, 6_))))), 13);
+(5_, (19, ((14, 3_), ((1_, 2_), (7_, (18, 6_))))), 13);
+(18, (7_, ((1_, 2_), ((14, 3_), (19, (5_, 13))))), 6_);
+(5_, (19, (((1_, 2_), ((6_, 18), 7_)), (3_, 14))), 13);
+(14, ((19, (5_, 13)), (((2_, 1_), 7_), (18, 6_))), 3_);
+(14, ((19, (13, 5_)), ((7_, (2_, 1_)), (18, 6_))), 3_);
+(14, (((18, 6_), ((1_, 2_), 7_)), (19, (13, 5_))), 3_);
+(2_, (7_, ((((13, 5_), 19), (14, 3_)), (18, 6_))), 1_);
+(3_, (((5_, 13), 19), ((6_, 18), (7_, (2_, 1_)))), 14);
+(1_, (((18, 6_), ((19, (13, 5_)), (3_, 14))), 7_), 2_);
+(14, ((19, (13, 5_)), ((6_, 18), (7_, (2_, 1_)))), 3_);
+(18, ((7_, (2_, 1_)), ((3_, 14), (19, (13, 5_)))), 6_);
+(((3_, 14), ((18, 6_), (7_, (2_, 1_)))), (5_, 13), 19);
+((5_, 13), ((14, 3_), (7_, ((2_, 1_), (6_, 18)))), 19);
+(3_, ((19, (13, 5_)), (7_, ((2_, 1_), (18, 6_)))), 14);
+(2_, ((7_, ((3_, 14), (19, (13, 5_)))), (6_, 18)), 1_);
+(6_, ((((3_, 14), ((5_, 13), 19)), 7_), (2_, 1_)), 18);
+(18, ((1_, 2_), (7_, ((14, 3_), (19, (5_, 13))))), 6_);
+(3_, ((19, (13, 5_)), (7_, ((2_, 1_), (6_, 18)))), 14);
+(2_, ((18, 6_), (7_, (((13, 5_), 19), (14, 3_)))), 1_);
+(1_, ((((14, 3_), (19, (5_, 13))), 7_), (6_, 18)), 2_);
+(6_, ((7_, ((19, (13, 5_)), (14, 3_))), (1_, 2_)), 18);
+((5_, 13), ((3_, 14), (7_, ((2_, 1_), (18, 6_)))), 19);
+((((((5_, 13), 19), (3_, 14)), 7_), 6_), (2_, 1_), 18);
+(13, (19, ((((18, (1_, 2_)), 6_), 7_), (3_, 14))), 5_);
+((6_, (18, (1_, 2_))), ((3_, 14), (19, (13, 5_))), 7_);
+(1_, ((6_, (7_, ((14, 3_), (19, (5_, 13))))), 18), 2_);
+(1_, (18, (7_, (6_, ((3_, 14), (19, (5_, 13)))))), 2_);
+(((3_, 14), (19, (5_, 13))), (7_, (18, (1_, 2_))), 6_);
+(5_, (19, ((3_, 14), (6_, (7_, (18, (1_, 2_)))))), 13);
+(5_, (19, ((14, 3_), (6_, (7_, ((2_, 1_), 18))))), 13);
+((5_, 13), ((6_, (((2_, 1_), 18), 7_)), (14, 3_)), 19);
+(5_, (19, (((((2_, 1_), 18), 7_), 6_), (14, 3_))), 13);
+((5_, 13), (((7_, (18, (2_, 1_))), 6_), (3_, 14)), 19);
+((5_, 13), ((3_, 14), (6_, ((1_, 2_), (7_, 18)))), 19);
+(1_, ((18, 7_), (((19, (5_, 13)), (14, 3_)), 6_)), 2_);
+(7_, ((6_, ((14, 3_), (19, (13, 5_)))), (1_, 2_)), 18);
+(5_, (19, ((3_, 14), (6_, ((1_, 2_), (18, 7_))))), 13);
+(3_, ((((18, 7_), (2_, 1_)), 6_), (19, (13, 5_))), 14);
+(5_, (19, ((6_, ((2_, 1_), (7_, 18))), (3_, 14))), 13);
+(3_, ((((18, 7_), (2_, 1_)), 6_), (19, (5_, 13))), 14);
+(((2_, 1_), (7_, 18)), ((14, 3_), (19, (5_, 13))), 6_);
+((13, 5_), ((14, 3_), (6_, ((1_, 2_), (7_, 18)))), 19);
+(3_, ((19, (5_, 13)), (((1_, 2_), 6_), (18, 7_))), 14);
+(13, (19, (((7_, 18), (6_, (2_, 1_))), (14, 3_))), 5_);
+(7_, ((6_, (2_, 1_)), (((13, 5_), 19), (3_, 14))), 18);
+(3_, ((19, (5_, 13)), ((6_, (2_, 1_)), (18, 7_))), 14);
+(7_, ((6_, (1_, 2_)), ((3_, 14), (19, (5_, 13)))), 18);
+(1_, ((((19, (5_, 13)), (3_, 14)), (18, 7_)), 6_), 2_);
+(((14, 3_), (18, (7_, ((1_, 2_), 6_)))), (19, 5_), 13);
+(5_, (13, ((14, 3_), (18, ((6_, (2_, 1_)), 7_)))), 19);
+((19, 5_), ((3_, 14), (18, (7_, ((1_, 2_), 6_)))), 13);
+((7_, ((((19, 5_), 13), (14, 3_)), 18)), (1_, 2_), 6_);
+(19, (13, ((18, ((6_, (1_, 2_)), 7_)), (3_, 14))), 5_);
+((1_, 2_), (7_, (18, ((14, 3_), (13, (5_, 19))))), 6_);
+(14, ((18, (7_, ((1_, 2_), 6_))), (5_, (19, 13))), 3_);
+(2_, (6_, ((((14, 3_), (5_, (19, 13))), 18), 7_)), 1_);
+((2_, 1_), (7_, (18, ((5_, (19, 13)), (14, 3_)))), 6_);
+(((2_, 1_), 6_), (18, ((3_, 14), (5_, (13, 19)))), 7_);
+((7_, ((1_, 2_), 6_)), ((3_, 14), (5_, (19, 13))), 18);
+(13, (((14, 3_), ((7_, (6_, (2_, 1_))), 18)), 19), 5_);
+(1_, (6_, (7_, (18, ((14, 3_), (19, (13, 5_)))))), 2_);
+(14, ((19, (5_, 13)), ((((1_, 2_), 6_), 7_), 18)), 3_);
+(2_, ((7_, (((19, (5_, 13)), (14, 3_)), 18)), 6_), 1_);
+((18, ((3_, 14), ((5_, 13), 19))), (6_, (1_, 2_)), 7_);
+((19, 13), ((7_, ((6_, (2_, 1_)), 18)), (3_, 14)), 5_);
+(((6_, (1_, 2_)), 18), ((14, 3_), (5_, (13, 19))), 7_);
+(14, (((((1_, 2_), 6_), 18), 7_), (5_, (13, 19))), 3_);
+(((((3_, 14), ((13, 5_), 19)), 7_), 18), (2_, 1_), 6_);
+((13, 5_), ((7_, (18, (6_, (1_, 2_)))), (14, 3_)), 19);
+(5_, (19, ((7_, (18, ((2_, 1_), 6_))), (3_, 14))), 13);
+(13, (19, ((3_, 14), ((((2_, 1_), 6_), 18), 7_))), 5_);
+(2_, (6_, ((((19, (5_, 13)), (14, 3_)), 7_), 18)), 1_);
+(3_, (((5_, 13), 19), (7_, (18, ((1_, 2_), 6_)))), 14);
+(((6_, (1_, 2_)), 18), (((19, 5_), 13), (3_, 14)), 7_);
+((((14, 3_), ((19, 5_), 13)), 7_), (6_, (1_, 2_)), 18);
+((7_, ((13, (19, 5_)), (3_, 14))), (6_, (2_, 1_)), 18);
+(5_, (13, ((14, 3_), ((18, (6_, (1_, 2_))), 7_))), 19);
+(1_, (6_, (((((19, 5_), 13), (14, 3_)), 7_), 18)), 2_);
+(2_, (7_, (18, (6_, ((14, 3_), ((13, 5_), 19))))), 1_);
+(14, (((((2_, 1_), 7_), 18), 6_), (19, (13, 5_))), 3_);
+(14, ((19, (5_, 13)), (6_, ((7_, (2_, 1_)), 18))), 3_);
+(14, ((19, (5_, 13)), (6_, (18, (7_, (1_, 2_))))), 3_);
+(1_, (7_, (((((13, 5_), 19), (3_, 14)), 6_), 18)), 2_);
+((18, (7_, (1_, 2_))), (((5_, 13), 19), (14, 3_)), 6_);
+((7_, (1_, 2_)), (6_, ((19, (5_, 13)), (3_, 14))), 18);
+(5_, (19, ((6_, (18, (7_, (1_, 2_)))), (14, 3_))), 13);
+((18, (((3_, 14), (5_, (19, 13))), 6_)), (1_, 2_), 7_);
+(1_, (7_, (18, (((5_, (19, 13)), (3_, 14)), 6_))), 2_);
+(1_, (7_, ((6_, ((5_, (13, 19)), (14, 3_))), 18)), 2_);
+(3_, ((6_, (18, ((2_, 1_), 7_))), (5_, (19, 13))), 14);
+(3_, ((5_, (19, 13)), (6_, (18, ((1_, 2_), 7_)))), 14);
+((13, 19), ((6_, (18, (7_, (2_, 1_)))), (14, 3_)), 5_);
+(14, (((19, 13), 5_), ((18, ((2_, 1_), 7_)), 6_)), 3_);
+((6_, (((13, 19), 5_), (3_, 14))), (7_, (1_, 2_)), 18);
+(14, ((6_, (18, ((2_, 1_), 7_))), (13, (5_, 19))), 3_);
+(2_, (7_, (18, (((13, (5_, 19)), (3_, 14)), 6_))), 1_);
+((1_, 2_), (18, (((3_, 14), (13, (19, 5_))), 6_)), 7_);
+(((13, (19, 5_)), (14, 3_)), (18, (7_, (2_, 1_))), 6_);
+(3_, ((((7_, (2_, 1_)), 18), 6_), (13, (5_, 19))), 14);
+(14, ((6_, (((1_, 2_), 7_), 18)), (13, (19, 5_))), 3_);
+(((1_, 2_), 7_), (6_, ((14, 3_), ((19, 5_), 13))), 18);
+((((13, (5_, 19)), (14, 3_)), 6_), (7_, (2_, 1_)), 18);
+((5_, 19), ((6_, (18, ((7_, 2_), 1_))), (3_, 14)), 13);
+(2_, (1_, (18, (((13, (19, 5_)), (14, 3_)), 6_))), 7_);
+((18, (1_, (7_, 2_))), ((14, 3_), (13, (19, 5_))), 6_);
+(7_, ((18, (((14, 3_), (13, (5_, 19))), 6_)), 1_), 2_);
+(1_, (2_, (18, (6_, ((13, (19, 5_)), (14, 3_))))), 7_);
+(18, ((1_, 2_), (6_, (((19, 5_), 13), (3_, 14)))), 7_);
+((7_, ((2_, 1_), 18)), (((5_, 19), 13), (3_, 14)), 6_);
+(19, (13, ((6_, (7_, (18, (2_, 1_)))), (3_, 14))), 5_);
+((19, 5_), ((6_, (7_, (18, (1_, 2_)))), (3_, 14)), 13);
+(5_, (13, ((14, 3_), (6_, (7_, (18, (1_, 2_)))))), 19);
+(19, (13, (((7_, (18, (2_, 1_))), 6_), (3_, 14))), 5_);
+((7_, ((1_, 2_), 18)), ((13, (19, 5_)), (3_, 14)), 6_);
+(((14, 3_), ((18, 6_), (7_, (2_, 1_)))), (19, 5_), 13);
+(6_, ((7_, (2_, 1_)), ((13, (19, 5_)), (3_, 14))), 18);
+(((3_, 14), (((1_, 2_), 7_), (18, 6_))), (5_, 19), 13);
+((5_, 19), (((18, 6_), (7_, (2_, 1_))), (14, 3_)), 13);
+(5_, (((14, 3_), ((6_, 18), (7_, (2_, 1_)))), 13), 19);
+(3_, (((7_, (2_, 1_)), (6_, 18)), (13, (19, 5_))), 14);
+((5_, 19), (((6_, 18), (7_, (1_, 2_))), (3_, 14)), 13);
+(5_, (13, ((7_, (2_, 1_)), ((6_, 18), (14, 3_)))), 19);
+(18, (((13, (5_, 19)), (7_, (2_, 1_))), (3_, 14)), 6_);
+(1_, (7_, ((14, 3_), (6_, (18, (13, (19, 5_)))))), 2_);
+(19, (13, (18, (((14, 3_), 6_), (7_, (2_, 1_))))), 5_);
+((6_, ((14, (3_, (13, (5_, 19)))), 18)), (2_, 1_), 7_);
+(2_, (7_, (6_, ((14, (3_, (13, (19, 5_)))), 18))), 1_);
+((6_, (18, ((14, 3_), (13, (19, 5_))))), (1_, 2_), 7_);
+(5_, (13, (((((1_, 2_), 7_), 6_), 18), (3_, 14))), 19);
+((6_, (7_, (2_, 1_))), ((3_, 14), (13, (19, 5_))), 18);
+((7_, (1_, 2_)), (18, ((14, 3_), ((5_, 19), 13))), 6_);
+(14, ((13, (5_, 19)), (18, (6_, (7_, (1_, 2_))))), 3_);
+((7_, (1_, 2_)), (18, ((13, (19, 5_)), (3_, 14))), 6_);
+(2_, (7_, ((18, ((13, (5_, 19)), (3_, 14))), 6_)), 1_);
+(2_, (((18, (((5_, 19), 13), (14, 3_))), 6_), 7_), 1_);
+(1_, (7_, ((18, (3_, (14, (13, (19, 5_))))), 6_)), 2_);
+((19, 5_), (3_, (14, (18, ((2_, 1_), (6_, 7_))))), 13);
+(18, (6_, (((3_, 14), ((5_, 19), 13)), (1_, 2_))), 7_);
+((((13, (5_, 19)), (3_, 14)), (2_, 1_)), (7_, 18), 6_);
+((((14, 3_), ((19, 5_), 13)), (1_, 2_)), (18, 7_), 6_);
+(7_, (6_, ((1_, 2_), ((3_, 14), ((5_, 19), 13)))), 18);
+(2_, ((6_, (18, 7_)), ((14, 3_), (13, (19, 5_)))), 1_);
+(18, (6_, (((3_, 14), ((19, 5_), 13)), (1_, 2_))), 7_);
+(14, (((19, 5_), 13), ((2_, 1_), ((7_, 18), 6_))), 3_);
+(2_, ((7_, (6_, 18)), ((14, 3_), (13, (19, 5_)))), 1_);
+(19, ((((2_, 1_), (7_, (6_, 18))), (3_, 14)), 13), 5_);
+((18, 6_), (((13, (19, 5_)), (14, 3_)), (2_, 1_)), 7_);
+(2_, (((14, 3_), ((19, 5_), 13)), (7_, (18, 6_))), 1_);
+((5_, 19), ((3_, 14), (((6_, 18), 7_), (2_, 1_))), 13);
+(6_, (7_, (((3_, 14), (13, (5_, 19))), (2_, 1_))), 18);
+((((5_, 19), 13), 14), ((1_, 2_), (18, (7_, 6_))), 3_);
+(((((13, (19, 5_)), 14), 3_), (2_, 1_)), (7_, 6_), 18);
+((6_, 7_), ((3_, (14, (13, (19, 5_)))), (1_, 2_)), 18);
+(((3_, ((13, (5_, 19)), 14)), (1_, 2_)), (6_, 18), 7_);
+(2_, (((14, (13, (19, 5_))), 3_), ((6_, 18), 7_)), 1_);
+((18, 7_), ((3_, (14, (13, (5_, 19)))), (1_, 2_)), 6_);
+((5_, 19), (3_, (((1_, 2_), ((7_, 18), 6_)), 14)), 13);
+((6_, 18), ((2_, 1_), (14, (3_, (13, (19, 5_))))), 7_);
+((13, (5_, 19)), (14, (7_, (18, ((2_, 1_), 6_)))), 3_);
+((((3_, 14), 7_), (18, ((1_, 2_), 6_))), (5_, 19), 13);
+(14, (18, (((19, 5_), 13), (7_, (6_, (2_, 1_))))), 3_);
+(6_, (2_, ((7_, ((13, (19, 5_)), (14, 3_))), 18)), 1_);
+(19, (13, ((18, (7_, ((6_, 1_), 2_))), (14, 3_))), 5_);
+(1_, (((18, ((13, (5_, 19)), (3_, 14))), 2_), 6_), 7_);
+(7_, ((((5_, 19), 13), (3_, 14)), (18, (6_, 2_))), 1_);
+(19, (13, ((((6_, (18, 1_)), 7_), 2_), (14, 3_))), 5_);
+(((((14, 3_), (13, (19, 5_))), 2_), 6_), (18, 1_), 7_);
+(19, (((14, 3_), ((1_, ((18, 2_), 6_)), 7_)), 13), 5_);
+(3_, ((1_, (7_, ((18, 2_), 6_))), (13, (19, 5_))), 14);
+(14, ((7_, ((1_, 2_), (6_, 18))), (13, (19, 5_))), 3_);
+(19, (((((6_, 18), (2_, 1_)), 7_), (3_, 14)), 13), 5_);
+(((14, 3_), (13, (19, 5_))), ((2_, 1_), (6_, 18)), 7_);
+(14, ((13, (5_, 19)), (7_, ((6_, 18), (1_, 2_)))), 3_);
+(5_, (((7_, ((6_, 18), (1_, 2_))), (3_, 14)), 13), 19);
+(5_, (13, ((7_, ((2_, 1_), (18, 6_))), (14, 3_))), 19);
+(3_, (((2_, 1_), (6_, 18)), ((13, (19, 5_)), 7_)), 14);
diff --git a/tests/t3_consensus/cluster1_284_consensus.tr b/tests/t3_consensus/cluster1_284_consensus.tr
new file mode 100644
index 0000000..985b0c6
--- /dev/null
+++ b/tests/t3_consensus/cluster1_284_consensus.tr
@@ -0,0 +1 @@
+((19, 5_, 13), 14, 3_, 18, 6_, 7_, 1_, 2_);
diff --git a/tests/t4_consensus/cluster1_16_alltrees.tr b/tests/t4_consensus/cluster1_16_alltrees.tr
new file mode 100644
index 0000000..3cffbb8
--- /dev/null
+++ b/tests/t4_consensus/cluster1_16_alltrees.tr
@@ -0,0 +1,16 @@
+(3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14);
+(2_, ((((5_, 13), 19), (14, 3_)), (7_, (6_, 18))), 1_);
+(2_, (((3_, 14), ((5_, 13), 19)), ((6_, 18), 7_)), 1_);
+(1_, (((19, (13, 5_)), (3_, 14)), ((18, 6_), 7_)), 2_);
+(((2_, 1_), (((13, 5_), 19), (3_, 14))), (6_, 18), 7_);
+(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);
+((6_, 18), (((19, (13, 5_)), (14, 3_)), (1_, 2_)), 7_);
+(((2_, 1_), ((19, (13, 5_)), (14, 3_))), (6_, 18), 7_);
+(2_, (((19, (5_, 13)), (3_, 14)), ((6_, 18), 7_)), 1_);
+(5_, (((3_, 14), (((6_, 18), 7_), (1_, 2_))), 19), 13);
+(13, (19, (((2_, 1_), ((18, 6_), 7_)), (3_, 14))), 5_);
+(14, ((19, (5_, 13)), ((7_, (6_, 18)), (2_, 1_))), 3_);
+(5_, (19, ((3_, 14), ((1_, 2_), (7_, (18, 6_))))), 13);
+(5_, (19, ((14, 3_), ((1_, 2_), (7_, (18, 6_))))), 13);
+(18, (7_, ((1_, 2_), ((14, 3_), (19, (5_, 13))))), 6_);
+(5_, (19, (((1_, 2_), ((6_, 18), 7_)), (3_, 14))), 13);
diff --git a/tests/t4_consensus/cluster1_16_consensus.tr b/tests/t4_consensus/cluster1_16_consensus.tr
new file mode 100644
index 0000000..cd2ab06
--- /dev/null
+++ b/tests/t4_consensus/cluster1_16_consensus.tr
@@ -0,0 +1 @@
+((((5_, 13), 19), (14, 3_)), ((18, 6_), 7_), (1_, 2_));
diff --git a/tests/t5_consensus/cluster1_35_alltrees.tr b/tests/t5_consensus/cluster1_35_alltrees.tr
new file mode 100644
index 0000000..c40bb39
--- /dev/null
+++ b/tests/t5_consensus/cluster1_35_alltrees.tr
@@ -0,0 +1,35 @@
+(3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14);
+(2_, ((((5_, 13), 19), (14, 3_)), (7_, (6_, 18))), 1_);
+(2_, (((3_, 14), ((5_, 13), 19)), ((6_, 18), 7_)), 1_);
+(1_, (((19, (13, 5_)), (3_, 14)), ((18, 6_), 7_)), 2_);
+(((2_, 1_), (((13, 5_), 19), (3_, 14))), (6_, 18), 7_);
+(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);
+((6_, 18), (((19, (13, 5_)), (14, 3_)), (1_, 2_)), 7_);
+(((2_, 1_), ((19, (13, 5_)), (14, 3_))), (6_, 18), 7_);
+(2_, (((19, (5_, 13)), (3_, 14)), ((6_, 18), 7_)), 1_);
+(5_, (((3_, 14), (((6_, 18), 7_), (1_, 2_))), 19), 13);
+(13, (19, (((2_, 1_), ((18, 6_), 7_)), (3_, 14))), 5_);
+(14, ((19, (5_, 13)), ((7_, (6_, 18)), (2_, 1_))), 3_);
+(5_, (19, ((3_, 14), ((1_, 2_), (7_, (18, 6_))))), 13);
+(5_, (19, ((14, 3_), ((1_, 2_), (7_, (18, 6_))))), 13);
+(18, (7_, ((1_, 2_), ((14, 3_), (19, (5_, 13))))), 6_);
+(5_, (19, (((1_, 2_), ((6_, 18), 7_)), (3_, 14))), 13);
+(14, ((19, (5_, 13)), (((2_, 1_), 7_), (18, 6_))), 3_);
+(14, ((19, (13, 5_)), ((7_, (2_, 1_)), (18, 6_))), 3_);
+(14, (((18, 6_), ((1_, 2_), 7_)), (19, (13, 5_))), 3_);
+(2_, (7_, ((((13, 5_), 19), (14, 3_)), (18, 6_))), 1_);
+(3_, (((5_, 13), 19), ((6_, 18), (7_, (2_, 1_)))), 14);
+(1_, (((18, 6_), ((19, (13, 5_)), (3_, 14))), 7_), 2_);
+(14, ((19, (13, 5_)), ((6_, 18), (7_, (2_, 1_)))), 3_);
+(18, ((7_, (2_, 1_)), ((3_, 14), (19, (13, 5_)))), 6_);
+(((3_, 14), ((18, 6_), (7_, (2_, 1_)))), (5_, 13), 19);
+((5_, 13), ((14, 3_), (7_, ((2_, 1_), (6_, 18)))), 19);
+(3_, ((19, (13, 5_)), (7_, ((2_, 1_), (18, 6_)))), 14);
+(2_, ((7_, ((3_, 14), (19, (13, 5_)))), (6_, 18)), 1_);
+(6_, ((((3_, 14), ((5_, 13), 19)), 7_), (2_, 1_)), 18);
+(18, ((1_, 2_), (7_, ((14, 3_), (19, (5_, 13))))), 6_);
+(3_, ((19, (13, 5_)), (7_, ((2_, 1_), (6_, 18)))), 14);
+(2_, ((18, 6_), (7_, (((13, 5_), 19), (14, 3_)))), 1_);
+(1_, ((((14, 3_), (19, (5_, 13))), 7_), (6_, 18)), 2_);
+(6_, ((7_, ((19, (13, 5_)), (14, 3_))), (1_, 2_)), 18);
+((5_, 13), ((3_, 14), (7_, ((2_, 1_), (18, 6_)))), 19);
diff --git a/tests/t5_consensus/cluster1_35_consensus.tr b/tests/t5_consensus/cluster1_35_consensus.tr
new file mode 100644
index 0000000..84e16db
--- /dev/null
+++ b/tests/t5_consensus/cluster1_35_consensus.tr
@@ -0,0 +1 @@
+((((5_, 13), 19), (14, 3_)), (1_, 2_), (18, 6_), 7_);
diff --git a/vary_branchlen.sh b/vary_branchlen.sh
new file mode 100755
index 0000000..b358463
--- /dev/null
+++ b/vary_branchlen.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -e
+if [ "$1" == "" ]; then echo "ERR: Two args required!"; exit 1; fi
+if [ "$2" == "" ]; then echo "ERR: Two args required!"; exit 1; fi
+
+taxa=$1
+dataset=$2
+# 12 settings:
+branchlens="0.0001 0.001 0.01 0.02 0.03 0.05 0.1 0.2 0.3 0.5 1.0 5.0"
+
+DOGRAPH="-g"
+
+echo Wiping "$dataset/phybin_output/" ...
+rm -rf "$dataset/phybin_output/"
+
+echo "[parallel] Starting jobs..."
+
+for bl in $branchlens; do
+ echo "Running with branch len $bl"
+ echo "=============================="
+ PHYBIN="./phybin.exe -b $bl $DOGRAPH -n $taxa"
+ outdir="$dataset/phybin_output/brlen$bl/"
+ LOG="$dataset/phybin_output/brlen$bl.log"
+
+ mkdir -p $outdir
+
+ # ASSUMPTION1! We need the renaming table at a particular spot:
+ # ASSUMPTION2! We likewise expect the inputs in a certain spot.
+ RENAMER="$dataset/renaming_table.txt"
+ echo " RUNNING: $PHYBIN -m $RENAMER -s '_' -o $outdir"
+ ($PHYBIN -m $RENAMER -s '_' -o $outdir $dataset/final_trees/*BranchLab*.out | tee $LOG) &
+done
+
+echo "[parallel] Waiting for outstanding jobs"
+FAIL=0
+for job in `jobs -p`
+do
+    echo "$job"
+    wait $job || let "FAIL+=1"
+done
+if [ "$FAIL" == "0" ];
+then
+    echo "All jobs completed successfully"
+else
+    echo "ERROR: some jobs ($FAIL) failed!"
+    exit 1
+fi
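The script sweeps the branch-length parameter passed to phybin via -b across the twelve settings listed above, writing one output directory and log per setting and waiting on all of the background jobs at the end. A hypothetical invocation sketch (the taxa count and dataset directory name are illustrative; per the ASSUMPTION comments, the script expects ./phybin.exe, $dataset/renaming_table.txt, and $dataset/final_trees/*BranchLab*.out to exist):

    # Run the branch-length sweep over a dataset directory with 10 taxa per tree.
    ./vary_branchlen.sh 10 Wolbachia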
diff --git a/website/trees.jpg b/website/trees.jpg
new file mode 100644
index 0000000..b5ad92c
Binary files /dev/null and b/website/trees.jpg differ
diff --git a/website/website.css b/website/website.css
new file mode 100644
index 0000000..521234b
--- /dev/null
+++ b/website/website.css
@@ -0,0 +1,74 @@
+body {
+ margin: auto;
+ padding-right: 1em;
+ padding-left: 1em;
+ max-width: 44em;
+ border-left: 1px solid black;
+ border-right: 1px solid black;
+ color: black;
+ font-family: Verdana, sans-serif;
+ font-size: 100%;
+ line-height: 140%;
+ color: #333;
+}
+pre {
+ border: 1px dotted gray;
+ background-color: #ececec;
+  color: #111111;
+ padding: 0.5em;
+}
+code {
+ font-family: monospace;
+}
+h1 a, h2 a, h3 a, h4 a, h5 a {
+ text-decoration: none;
+ color: #7a5ada;
+}
+h1, h2, h3, h4, h5 { font-family: verdana;
+ font-weight: bold;
+ border-bottom: 1px dotted black;
+ color: #7a5ada; }
+h1 {
+ font-size: 130%;
+}
+
+h2 {
+ font-size: 110%;
+}
+
+h3 {
+ font-size: 95%;
+}
+
+h4 {
+ font-size: 90%;
+ font-style: italic;
+}
+
+h5 {
+ font-size: 90%;
+ font-style: italic;
+}
+
+h1.title {
+ font-size: 200%;
+ font-weight: bold;
+ padding-top: 0.2em;
+ padding-bottom: 0.2em;
+ text-align: left;
+ border: none;
+}
+
+dt code {
+ font-weight: bold;
+}
+dd p {
+ margin-top: 0;
+}
+
+#footer {
+ padding-top: 1em;
+ font-size: 70%;
+ color: gray;
+ text-align: center;
+}